In [1]:
!pip install transformers



In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import shutil
import string
import tensorflow as tf
from keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Input, Dropout, Bidirectional, LSTM, Embedding, BatchNormalization,  Reshape, Conv2D, MaxPool2D, concatenate, Flatten, Activation

To preprocess the data set and load it into a data frame, we will work with an Excel (.xlsx) file consisting of nine columns. The structure of the file is as follows:

- Column 1: Identification of the programming task with loops.
- Column 2: Description of the problem to be solved.
- Column 3: Python solution provided for the programming task.
- Column 4: Definition of the initial state of the loop, where the initial variable declaration is located.
- Column 5: Definition of the final state of the loop, where the loop condition is defined.
- Column 6: Definition of the loop state transformation or also known as loop body.
- Column 7: Task status. It is denoted as 1 if the solution is correct and 0 if it is incorrect.
- Column 8: State of the task considering error types: Initialization State, Final State and State Transformation.
- Column 9: Feedback Generated for Incorrect Exercises.

To load and preprocess the data set into a data frame, we will follow these steps:

- Import the libraries necessary for data manipulation, such as pandas.
- Use the pandas library (`pd.read_excel`) to read the Excel file into a data frame.

In [3]:
# Location of the labelled data set (an .xlsx workbook).
# NOTE(review): the path lives under /content/drive — presumably a Google Drive
# mounted in Colab; confirm the drive is mounted before this cell runs.
archivo_3 = '/content/drive/MyDrive/Ginna Tesis/Kernel Function Comparision/DataProgramsandDescriptions-CatRetroalimentacion6000.xlsx'
# Read the workbook into the data frame that the following cells clean and embed.
train_full = pd.read_excel(archivo_3)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Safari/DataProgramsandDescriptions-CatRetroalimentacion6000.xlsx'

In [None]:
train_full

Removal of Unnecessary Columns for Binary Classification Model Experimentation.

In [None]:
# Remove the "No." and "Realimentación" columns from train_full in place.
train_full.drop(columns=["No.", "Realimentación"], inplace=True)

In [None]:
# Discard, in place, every row that has a missing value in any column.
train_full.dropna(axis=0, how="any", inplace=True)

In [None]:
train_full

This line of code retrieves unique values from the 'Etiqueta 2' column in the dataset, providing a valuable means to gather insights about the different categories present in this column. In this specific instance, the classifications include "Correct," as well as various types of errors in "Initial State," "End State," and "State Transformation," along with combinations of these errors. In total, there are eight distinct classes.

In [None]:
np.unique(train_full['Etiqueta 2'])

Prior to processing the Python source code, the loop-state columns need a small clean-up. Specifically, we replace the commas (,) that separate the content within the columns denoting the initial state, final state, and state transformation of the loop with newline characters, so that each fragment sits on its own line.

In [None]:
# Turn every comma into a newline in the three loop-state columns, so that each
# comma-separated fragment ends up on its own line.
for state_column in ('Estado incial', 'Estado final', 'Transformación de estado'):
    train_full[state_column] = train_full[state_column].apply(lambda w: w.replace(',', '\n'))

### 1.3. Functions for tokenization and embedding in descriptions and codes.

Once the dataset is imported into the data frame, we can move forward with generating embeddings using two distinct functions: "tokenize_and_generate_embeddings_descriptions" for processing problem descriptions and "tokenize_and_generate_embeddings_codes" for handling the provided Python solutions.

For problem descriptions, we tokenize and create text embeddings by utilizing a pre-trained BERT model designed for English text.

Conversely, in tokenizing the Python source code, we will employ a CodeBERT model that has been pre-trained for this purpose. Subsequently, we will generate embeddings through the utilization of this specialized model.

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel

# Check whether a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the model, and move the model to the GPU if one is available
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
model.to(device)

# Keep dedicated references to the BERT objects. The generic names `model` and
# `tokenizer` are re-assigned later in the notebook (CodeBERT is loaded into the
# same names), so the description embedder must not depend on them.
bert_model = model
bert_tokenizer = tokenizer


# Define function to generate embeddings for English descriptions
def tokenize_and_generate_embeddings_descriptions(description):
    """Embed one problem description with the BERT model loaded in this cell.

    Args:
        description: Text of the problem description.

    Returns:
        NumPy array (on CPU) holding the model's ``last_hidden_state`` for the
        input, i.e. the embeddings of all tokens.
    """
    # Tokenize the description (truncating over-long inputs) and move the
    # resulting tensors to the same device as the model.
    encoded = bert_tokenizer(description, return_tensors="pt", truncation=True)
    encoded = {key: value.to(device) for key, value in encoded.items()}

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = bert_model(**encoded)

    # Extract embeddings for all tokens
    desc_embeddings = outputs.last_hidden_state.cpu().numpy()

    return desc_embeddings

In [None]:
import torch
import numpy as np
from transformers import RobertaTokenizer, RobertaModel

# Check whether a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the model, and move the model to the GPU if one is available
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.to(device)

# Keep dedicated references to the CodeBERT objects so the code embedder keeps
# working even if the generic names `model` / `tokenizer` are re-bound by
# another cell (they are shared with the BERT cell and reused as a loop
# variable further down the notebook).
codebert_model = model
codebert_tokenizer = tokenizer


# Define function to generate embeddings for Python code solutions
def tokenize_and_generate_embeddings_codes(code):
    """Embed one piece of Python source with the CodeBERT model loaded in this cell.

    Args:
        code: Source text to embed (a full solution or a loop-state fragment).

    Returns:
        NumPy array (on CPU) holding the model's ``last_hidden_state`` for the
        input, i.e. the embeddings of all tokens.
    """
    # Tokenize the code (truncating over-long inputs) and move the resulting
    # tensors to the same device as the model.
    encoded = codebert_tokenizer(code, return_tensors="pt", truncation=True)
    encoded = {key: value.to(device) for key, value in encoded.items()}

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = codebert_model(**encoded)

    # Extract embeddings for all tokens
    code_embeddings = outputs.last_hidden_state.cpu().numpy()

    return code_embeddings

In [None]:
%%time
# Let's generate the embeddings for problem descriptions and Python code solutions.
# Each statement maps one text column through an embedding function row by row
# and collects the per-row results in a NumPy array (one entry per row).
# The problem descriptions go through the description embedder ...
problem = train_full['Problema'].apply(tokenize_and_generate_embeddings_descriptions).to_numpy()
# ... while the full solution and the three loop-state columns all go through
# the code embedder.
code = train_full['Solución'].apply(tokenize_and_generate_embeddings_codes).to_numpy()
startstate = train_full['Estado incial'].apply(tokenize_and_generate_embeddings_codes).to_numpy()
finalstate = train_full['Estado final'].apply(tokenize_and_generate_embeddings_codes).to_numpy()
transstate = train_full['Transformación de estado'].apply(tokenize_and_generate_embeddings_codes).to_numpy()


In [None]:
problem.shape,startstate.shape,finalstate.shape,transstate.shape

In [None]:
print(startstate.shape)
print(startstate[1262].shape)


#### 1.3.1. Average vectors from problem, code, initial state, final state, and transform state.

This code calculates average vectors from the lists of vectors contained in problem, code, startstate, finalstate, and transstate, and then stores these average vectors in NumPy arrays. The shapes of these arrays are printed at the end.

In [None]:
def _mean_token_vector(embedded_rows):
    """Average, for every row, the vectors stored in the row's first (index 0) element."""
    return np.array([row[0].mean(axis=0) for row in embedded_rows])


# One averaged vector per sample for each of the five text sources.
Xp = _mean_token_vector(problem)
Xc = _mean_token_vector(code)
Xs = _mean_token_vector(startstate)
Xf = _mean_token_vector(finalstate)
Xt = _mean_token_vector(transstate)
Xp.shape, Xc.shape, Xs.shape, Xf.shape, Xt.shape

#### 1.3.2. Concatenate arrays.

The code aims to concatenate the arrays Xp, Xc, Xs, Xt, and Xf along axis 1, requiring them to have compatible dimensions along this specified axis.

In [None]:
# Join the five averaged-vector matrices side by side, one row per sample.
# Column-block order: problem, code, initial state, state transformation,
# final state (note that Xt comes before Xf).
X=np.concatenate([Xp,Xc,Xs,Xt,Xf],axis=1)
# Show the shapes next to 768*5 to compare the width of X with five blocks of 768 columns.
Xs.shape,X.shape,768*5

### 1.4. Labels for Categorical Task Status Classification.

In the function, task status labels are assigned numerical values as follows: 0 denotes "success," 1 represents an incorrect task with an error in the "initial state," 2 indicates an incorrect task with an error in the "final state," and 3 signifies an incorrect task with an error in "state transformation." States 4, 5, 6, and 7 encompass the possible combinations of these errors. These labels are organized within a vector.

To provide additional clarity, here is the coding scheme for each task state label:

- ['Correct'] 0
- ['Initial state'] 1
- ['Final state'] 2
- ['State transformation'] 3
- ['Initial state', 'Final state'] 4
- ['Initial state', 'State transformation'] 5
- ['Final state', 'State transformation'] 6
- ['Initial state', 'Final state', 'State transformation'] 7

After storing the labels in a list, we flatten the list to ensure a unified representation for further analysis and model training. This allows for seamless integration with various algorithms and operations.

In [None]:
def categoricallabelAll(w):
  """Translate the textual task-status label `w` into its integer class code.

  The eight known label strings map to 0..7 in the order listed below; any
  other value (including non-string input) maps to 8.
  """
  known_labels = (
      "['Correct']",
      "['Initial state']",
      "['Final state']",
      "['State transformation']",
      "['Initial state', 'Final state']",
      "['Initial state', 'State transformation']",
      "['Final state', 'State transformation']",
      "['Initial state', 'Final state', 'State transformation']",
  )
  # The position of a label in the tuple is its class code.
  for class_code, label_text in enumerate(known_labels):
    if w == label_text:
      return class_code
  # Fallback bucket for anything that is not one of the known labels.
  return 8

# Display names of the eight task-status classes; position i holds the name of
# the class encoded as integer i.
category = np.array((
    'Correct',
    'Initial state',
    'Final state',
    'State transformation',
    'Initial state, Final state',
    'Initial state, State transformation',
    'Final state, State transformation',
    'Initial state, Final state, State transformation',
))

### 1.4. Labels for Categorical Task Status Classification.

In [None]:
y=train_full['Etiqueta 2'].apply(categoricallabelAll)
np.unique(y)

In [None]:
# Show the rows whose label was not recognised. categoricallabelAll only ever
# returns integers and uses 8 as its fallback code, so a NaN test can never
# match anything; the fallback code is what marks an unmapped label.
y[y == 8]

## Part 2.

### 2.1. Training and testing data.

This code uses the train_test_split function from the Scikit-Learn (sklearn) library to split two datasets, X and y, into training and testing sets. The training data is used to train the model, while the testing data is used to evaluate its predictive performance.

In [None]:
df_save=pd.DataFrame(X)
# Attach the labels by position. `y` is a Series that still carries the row
# labels of train_full, whereas df_save gets a fresh 0..n-1 index; assigning
# the Series directly would align on index labels and misplace labels (or
# insert NaN) whenever rows were dropped from train_full earlier.
df_save["Class"]=np.asarray(y)

In [None]:
df_save

In [None]:
#df_save.to_excel("/content/drive/MyDrive/Ginna Tesis/Kernel Function Comparision/finaldataset.xlsx")

In [None]:
#df_save=pd.read_excel("/content/drive/MyDrive/Ginna Tesis/Kernel Function Comparision/finaldataset.xlsx")

In [None]:
# df_save.pop("Unnamed: 0")
# df_save

In [None]:
#X=df_save.iloc[:,0:-1]
#y=df_save.iloc[:,-1]


In [None]:
X

In [None]:
y

In [None]:
# X=X.to_numpy()
# np.asarray yields the label array for a Series and leaves an ndarray as it
# is, so this cell can be re-run without failing on the second execution.
y=np.asarray(y)

# Kernel model

In [None]:
!pip install sklearnkernels

In [None]:
from sklearnkernels.KSVM import KSVC
from sklearnkernels.KANN import KANNC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import  GridSearchCV, RandomizedSearchCV

In [None]:
# rq_params={"svc__kernel": ['rquadratic'], "svc__C": np.logspace(0,5,10),"svc__coef0" : np.logspace(-4,4,20),"svc__gamma" : ["auto"]}
# rbf_params={"svc__kernel": ['rbf'],"svc__C": np.logspace(0,5,10), "svc__gamma" : np.logspace(-4,4,20)}
# tru_params={"svc__kernel": ['tru'],"svc__C": np.logspace(0,5,10), "svc__gamma" : np.logspace(-4,4,20)}
# can_params={"svc__kernel": ['can'],"svc__C": np.logspace(0,5,10),"svc__gamma" : np.logspace(-4,4,20)}
# rb_params={"svc__kernel": ['radial_basic'],"svc__C": np.logspace(0,5,10),"svc__gamma" : np.logspace(-4,4,20)}
# tri_params={"svc__kernel": ['triangle'],"svc__C": np.logspace(0,5,10),"svc__gamma" : np.logspace(-4,4,20)}
# hp_params={"svc__kernel": ['hyperbolic'],"svc__C": np.logspace(0,5,10), "svc__gamma" : np.logspace(-4,4,20)}

# params=[rq_params,rbf_params,tru_params,can_params,rb_params,tri_params,hp_params]
# p_sts_ksvc = Pipeline([('sscaler', StandardScaler()), ('svc', KSVC())])
# p_sts_kann = Pipeline([('sscaler', StandardScaler()), ('svc', KANNC())])



In [None]:
# path='/content/drive/MyDrive/Ginna Tesis/kernel.csv'

In [None]:
# def random_searchFit(X,y,filename=None):
#   best_params=[]
#   for pipe in pipes:
#     print(pipe["name"])
#     for param in params:
#       print(param)
#       clf=RandomizedSearchCV(pipe["pipe"],param,cv=5, random_state=2021, n_jobs=-1)
#       clf.fit(X,y)
#       best_params.append({"pipe":pipe["name"],"best_params":clf.best_params_,"score":clf.best_score_, "cv_results":clf.cv_results_})
#       print(clf.best_score_)

#   lst_best_params=[]

#   for bp in  best_params:
#     mean_test_score=bp['cv_results']['mean_test_score']
#     mean_fit_time=bp['cv_results']['mean_fit_time']
#     mean_score_time=bp['cv_results']['mean_score_time']
#     std_test_score=bp['cv_results']['std_test_score']
#     std_fit_time=bp['cv_results']['std_fit_time']
#     std_score_time=bp['cv_results']['std_score_time']

#     i=np.argmax(mean_test_score)
#     lst_best_params.append({
#         'Scaler':bp['pipe'],
#         'kernel':bp['best_params']['svc__kernel'],
#         'mean_test_score':mean_test_score[i],
#         'std_test_score':std_test_score[i],
#         'mean_fit_time':mean_fit_time[i],
#         'std_fit_time':std_fit_time[i],
#         'mean_score_time':mean_score_time[i],
#         'std_score_time':std_score_time[i],
#         'best_param':bp['best_params']
#         })
#     print("save params:")
#     print(lst_best_params)
#     df_bp=pd.DataFrame(lst_best_params)
#     df_bp.to_csv(path)

In [None]:
# np.unique(y)

In [None]:
# pipes=[{"name":"SScalerKSVC","pipe":p_sts_ksvc}]
# path='/content/drive/MyDrive/Ginna Tesis/kernelSVC.csv'

In [None]:
# Stage 1: keep only the 10% hold-out part of the full data. train_test_split
# returns [X_train, X_test, y_train, y_test]; the slice picks X_test and y_test,
# the 90% part is not used.
X_subset, y_subset = train_test_split(X, y, test_size=1/10, random_state=2023)[1::2]
# Stage 2: split that subset 80/20 into the train/test sets used below.
# NOTE(review): only about 8% of all samples end up in Xtrain — confirm that
# this subsampling is intended.
Xtrain, Xtest, ytrain, ytest = train_test_split(X_subset, y_subset, test_size=0.2, random_state=2023)


In [None]:
X.shape,Xtrain.shape

In [None]:
# random_searchFit(Xtrain,ytrain)

In [None]:
# pipes=[{"name":"SScalerKANN","pipe":p_sts_kann}]
# path='/content/drive/MyDrive/Ginna Tesis/kernelANN.csv'

In [None]:
# random_searchFit(Xtrain,ytrain)

{'svc__kernel': 'rquadratic', 'svc__gamma': 'auto', 'svc__coef0': 3792.690190732246, 'svc__C': 27825.59402207126}
{'svc__kernel': 'rbf', 'svc__gamma': 0.0006951927961775605, 'svc__C': 2154.4346900318847}
{'svc__kernel': 'tru', 'svc__gamma': 4.281332398719396, 'svc__C': 12.91549665014884}
{'svc__kernel': 'can', 'svc__gamma': 0.23357214690901212, 'svc__C': 7742.636826811277}
{'svc__kernel': 'radial_basic', 'svc__gamma': 0.23357214690901212, 'svc__C': 7742.636826811277}
{'svc__kernel': 'triangle', 'svc__gamma': 545.5594781168514, 'svc__C': 46.4158883361278}
{'svc__kernel': 'hyperbolic', 'svc__gamma': 0.0001, 'svc__C': 1.0}


In [None]:
def _scaled_ksvc(**svc_params):
    """Build a StandardScaler -> KSVC pipeline, passing `svc_params` to KSVC."""
    return Pipeline([
        ('sscaler', StandardScaler()),
        ('svc', KSVC(**svc_params)),
    ])


# One pipeline per kernel, configured with the parameter sets listed above this cell.
ksvc_rq = _scaled_ksvc(kernel='rquadratic', gamma='auto', coef0=3792.690190732246, C=27825.59402207126)
ksvc_rbf = _scaled_ksvc(kernel='rbf', gamma=0.0006951927961775605, C=2154.4346900318847)
ksvc_tr = _scaled_ksvc(kernel='tru', gamma=4.281332398719396, C=12.91549665014884)
ksvc_can = _scaled_ksvc(kernel='can', gamma=0.23357214690901212, C=7742.636826811277)
ksvc_rb = _scaled_ksvc(kernel='radial_basic', gamma=0.23357214690901212, C=7742.636826811277)
ksvc_tri = _scaled_ksvc(kernel='triangle', gamma=545.5594781168514, C=46.4158883361278)
ksvc_hp = _scaled_ksvc(kernel='hyperbolic', gamma=0.0001, C=1)

# Kernel names and pipelines in matching order.
knames = ['rquadratic', 'rbf', 'tru', 'can', 'radial_basic', 'triangle', 'hyperbolic']
svc_models = [ksvc_rq, ksvc_rbf, ksvc_tr, ksvc_can, ksvc_rb, ksvc_tri, ksvc_hp]

In [None]:
# Stage 1: keep only the 20% hold-out part of the full data. train_test_split
# returns [X_train, X_test, y_train, y_test]; the slice picks X_test and y_test,
# the 80% part is not used.
X_subset, y_subset = train_test_split(X, y, test_size=1/5, random_state=2023)[1::2]
# Stage 2: split that subset 80/20 into the train/test sets used for fitting
# and scoring the kernel models.
# NOTE(review): only about 16% of all samples end up in Xtrain — confirm that
# this subsampling is intended.
Xtrain, Xtest, ytrain, ytest = train_test_split(X_subset, y_subset, test_size=0.2, random_state=2023)

In [None]:
# Fit every kernel pipeline on the training split and record its score on the
# test split, in the same order as svc_models.
scores=[]
# The loop variable is deliberately not called `model`: that global name holds
# the transformer encoder loaded earlier in the notebook and would be
# overwritten by the last pipeline here.
for svc_pipeline in svc_models:
  svc_pipeline.fit(Xtrain,ytrain)
  scores.append(svc_pipeline.score(Xtest,ytest))

In [None]:
knames,scores