In [1]:
import os

import numpy as np
import pandas as pd
import researchpy as rp
from sklearn.model_selection import train_test_split

### Dataset

In [2]:
# Load the labelled samples and derive a binary target:
# severity 'none' -> 0, every other severity -> 1.
df = pd.read_csv("feature_envy.csv")
df["label"] = np.where(df["severity"].ne("none"), 1, 0)
df.head(n=5)

Unnamed: 0,sample_id,severity,method,label
0,4256584,major,"public void send(byte[] data, int length, ...",1
1,8922371,major,public void read(org.apache.thrift.protoco...,1
2,8653310,major,@Override public Iterator<Row> getRows(Ses...,1
3,4734605,major,private void finishRestore(final Timer.Conte...,1
4,4514232,major,public synchronized void start(BundleConte...,1


In [3]:
# Frequency table of the raw severity categories.
rp.summary_cat(df.severity)

Unnamed: 0,Variable,Outcome,Count,Percent
0,severity,none,2176,97.06
1,,minor,45,2.01
2,,major,20,0.89
3,,critical,1,0.04


In [4]:
# Frequency table of the derived binary label (0 = 'none', 1 = any other severity).
rp.summary_cat(df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2176,97.06
1,,1,66,2.94


### Metrics

In [5]:
# Load the pickled metrics dataset and tabulate its label distribution.
data_path = "embedded_datasets/metrics_dataset.pkl"
metrics_df = pd.read_pickle(data_path)
rp.summary_cat(metrics_df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2176,97.06
1,,1,66,2.94


In [6]:
# Preview the first rows of the metrics dataset.
metrics_df.head(n=5)

Unnamed: 0,sample_id,severity,constructor,line,cbo,wmc,rfc,loc,returnsQty,variablesQty,...,maxNestedBlocksQty,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers,logStatementsQty,hasJavaDoc,from_project,label
0,4256584,major,0,65,1,1,5,7,0,0,...,0,0,0,0,32,1,0,1,1,1
1,8922371,major,0,733,3,15,14,71,0,1,...,3,0,0,0,5,1,0,0,1,1
2,8653310,major,0,105,10,7,73,24,1,6,...,3,0,0,0,32,1,0,1,1,1
3,4734605,major,0,929,6,4,28,20,0,6,...,1,0,0,1,39,2,2,0,0,1
4,4514232,major,0,100,15,23,19,65,0,20,...,3,0,0,0,58,33,0,0,1,1


### T5 base

In [7]:
# Load the pickled T5_base dataset and tabulate its label distribution.
# NOTE(review): the recorded output of this cell shows 2230 rows (2167 / 63),
# fewer than the 2242 rows of feature_envy.csv - confirm the missing samples are expected.
data_path = "embedded_datasets/T5_base.pkl"
t5_base_df = pd.read_pickle(data_path)
rp.summary_cat(t5_base_df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2167,97.17
1,,1,63,2.83


### T5 small

In [8]:
# Load the pickled T5_small dataset and tabulate its label distribution.
# NOTE(review): the recorded output of this cell shows 2240 rows (2174 / 66),
# two fewer than feature_envy.csv - confirm the missing samples are expected.
data_path = "embedded_datasets/T5_small.pkl"
t5_small_df = pd.read_pickle(data_path)
rp.summary_cat(t5_small_df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2174,97.05
1,,1,66,2.95


### T5 base - line by line avg

In [10]:
# Load the pickled T5_base_line_avg dataset and tabulate its label distribution.
data_path = "embedded_datasets/T5_base_line_avg.pkl"
t5_base_avg_df = pd.read_pickle(data_path)
rp.summary_cat(t5_base_avg_df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2176,97.06
1,,1,66,2.94


### T5 base - line by line sum

In [11]:
# Load the pickled T5_base_line_sum dataset and tabulate its label distribution.
data_path = "embedded_datasets/T5_base_line_sum.pkl"
t5_base_sum_df = pd.read_pickle(data_path)
rp.summary_cat(t5_base_sum_df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2176,97.06
1,,1,66,2.94


### T5 small - line by line avg

In [12]:
# Load the pickled T5_small_line_avg dataset and tabulate its label distribution.
data_path = "embedded_datasets/T5_small_line_avg.pkl"
t5_small_avg_df = pd.read_pickle(data_path)
rp.summary_cat(t5_small_avg_df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2176,97.06
1,,1,66,2.94


### T5 small - line by line sum

In [13]:
# Load the pickled T5_small_line_sum dataset and tabulate its label distribution.
data_path = "embedded_datasets/T5_small_line_sum.pkl"
t5_small_sum_df = pd.read_pickle(data_path)
rp.summary_cat(t5_small_sum_df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2176,97.06
1,,1,66,2.94


### CuBERT sum

In [14]:
# Load the pickled cubert_embedding_sum dataset and tabulate its label distribution.
data_path = "embedded_datasets/cubert_embedding_sum.pkl"
cubert_sum_df = pd.read_pickle(data_path)
rp.summary_cat(cubert_sum_df.label)

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2176,97.06
1,,1,66,2.94


### CuBERT avg

In [15]:
# Load the pickled cubert_embedding_avg dataset and tabulate its label distribution.
data_path = 'embedded_datasets/cubert_embedding_avg.pkl'
cubert_avg_df = pd.read_pickle(data_path)
# Summarise the frame loaded in this cell. The previous version summarised
# cubert_sum_df (copy-paste slip), so the avg dataset was never actually checked.
rp.summary_cat(cubert_avg_df["label"])

Unnamed: 0,Variable,Outcome,Count,Percent
0,label,0,2176,97.06
1,,1,66,2.94


In [16]:
# Preview the first rows of the CuBERT sum embeddings.
# NOTE(review): this cell sits under the "CuBERT avg" heading but previews
# cubert_sum_df - confirm which frame was meant to be shown.
cubert_sum_df.head(n=5)

Unnamed: 0,sample_id,severity,label,em_1,em_2,em_3,em_4,em_5,em_6,em_7,...,em_1015,em_1016,em_1017,em_1018,em_1019,em_1020,em_1021,em_1022,em_1023,em_1024
0,3698602,none,0,-1.723899,-2.737745,1.469403,-0.548004,2.109866,-1.51092,-3.649564,...,1.193711,-3.885593,-3.076951,-0.482897,-3.487826,1.360949,1.65522,-2.342658,-1.754574,3.59616
1,3698665,none,0,-0.127359,0.191389,0.83627,-0.531719,0.899578,0.199277,-1.636374,...,-0.977939,-4.672865,-2.495437,-1.147227,-1.399549,1.422049,1.570305,-2.45796,-0.160255,1.486048
2,3698860,none,0,-11.290483,-11.93186,6.14151,-4.34774,7.238474,-4.671563,-17.403679,...,3.971375,-18.15435,-14.999578,1.667217,-16.647888,2.801434,4.387309,-10.451984,-5.2058,15.473181
3,3699227,none,0,0.131664,-1.043202,1.858572,0.032362,0.927056,-0.718811,-2.340928,...,-0.227229,-3.45202,-1.747002,-1.834627,-2.616351,1.137131,0.594619,-2.01038,-0.782919,1.65542
4,3699521,none,0,4.627614,0.167773,-1.053386,2.192144,0.712373,-1.742896,4.117256,...,0.530793,-8.905686,-1.238471,-0.262763,0.403701,-7.027378,0.470825,-4.626226,2.534981,-5.569468


### Train - test split

In [17]:
# Write the train/test membership (label + sample_id) of repeated stratified
# 80/20 splits to CSV, one pair of files per random seed - presumably so that
# later experiments can reload identical partitions (confirm against consumers).
SPLITS_DIR = 'data_splits'
N_SPLITS = 51     # seeds 0..50 inclusive, i.e. 51 splits
TEST_SIZE = 0.2

# to_csv does not create missing parent directories; without this the first
# write raises OSError on a fresh checkout.
os.makedirs(SPLITS_DIR, exist_ok=True)

# NOTE(review): the split population is t5_base_df, whose recorded summary shows
# fewer rows than the other datasets; samples absent from it will not appear in
# any split file - confirm this is intended.
data = t5_base_df
y = data.label

for rand_state in range(N_SPLITS):
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
        data,
        y,
        test_size=TEST_SIZE,
        shuffle=True,
        stratify=y,  # keep the class ratio in both partitions
        random_state=rand_state,
    )
    # Only the identifiers and labels are persisted, not the feature columns.
    X_test_df[['label', 'sample_id']].to_csv(f'{SPLITS_DIR}/y_test_{rand_state}.csv', index=False)
    X_train_df[['label', 'sample_id']].to_csv(f'{SPLITS_DIR}/y_train_{rand_state}.csv', index=False)