In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

### Train Classification Model
Classes: 0, 1, 2, 3 - relevance score provided by dataset, with 3 being the highest and 0 being the lowest

#### Create dataset to use for training
1. Start with X = cosine similarity and y = relvance score
2. Later add more features to X (code and docstring embeddings, results from static analysis?)

In [10]:
df = pd.read_pickle('data/sim.pickle')
df = df[['split', 'sim','code_embed', 'nl_embed', 'relevance']]
df = df.dropna()
df.head()

Unnamed: 0,split,sim,code_embed,nl_embed,relevance
0,train,0.685977,"[[0.049218524, 0.046266314, 0.05542605, 0.0503...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
1,train,0.469626,"[[0.050279554, 0.048305128, 0.046424236, 0.054...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
2,train,0.49025,"[[0.049495775, 0.04716861, 0.055548936, 0.0603...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
3,train,0.568199,"[[0.04833452, 0.0510515, 0.056628924, 0.054732...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3
4,train,0.166293,"[[0.05180261, 0.053425696, 0.05358086, 0.04882...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3


In [17]:
df['code_avg'] = df.code_embed.apply(lambda e: e.mean())
df['nl_avg'] = df.nl_embed.apply(lambda e: e.mean())

df = df.drop(columns=['code_embed', 'nl_embed'])

df.head()

Unnamed: 0,split,sim,relevance,code_avg,nl_avg
0,train,0.685977,2,0.051551,0.024661
1,train,0.469626,2,0.052926,0.010496
2,train,0.49025,2,0.054815,0.012249
3,train,0.568199,3,0.054895,0.017146
4,train,0.166293,3,0.056132,0.001532


In [20]:
train_df = df[df.split == 'train'].drop('split', axis = 1)
test_df = df[df.split == 'test'].drop('split', axis = 1)

test_df.head()

Unnamed: 0,sim,relevance,code_avg,nl_avg
291,0.384595,1,0.052668,0.007921
292,0.777225,1,0.052795,0.030853
293,0.285888,2,0.051662,0.003478
294,0.171154,0,0.054695,0.001533
296,0.536571,0,0.052693,0.014822


In [24]:
X_train = train_df.drop('relevance', axis=1)
y_train = train_df.relevance.values

X_test = test_df.drop('relevance', axis=1)
y_test = test_df.relevance.values

X_train[:5]

Unnamed: 0,code_avg,nl_avg
0,0.051551,0.024661
1,0.052926,0.010496
2,0.054815,0.012249
3,0.054895,0.017146
4,0.056132,0.001532


#### Sample dataset to balance classes

In [26]:
smote = SMOTE()
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

Counter(Y_smote)

Counter({2: 112, 3: 112, 0: 112, 1: 112})

#### Train model

In [27]:
clf = LogisticRegression(multi_class='ovr')

clf.fit(X_smote, Y_smote)

y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=True))

[[0 0 0 3]
 [0 2 0 5]
 [0 0 0 1]
 [0 1 0 1]]
              precision    recall  f1-score   support

           0       1.00      0.00      0.00         3
           1       0.67      0.29      0.40         7
           2       1.00      0.00      0.00         1
           3       0.10      0.50      0.17         2

    accuracy                           0.23        13
   macro avg       0.69      0.20      0.14        13
weighted avg       0.68      0.23      0.24        13

