In [55]:
from operator import itemgetter
from pathlib import Path

import numpy as np
import pandas as pd
import researchpy as rp
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_sample_weight

In [56]:
def cols_with_null(df):
    """Print the NaN count of every column that contains at least one NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is printed as a one-column table labelled ``#NaN``;
        columns without missing values are omitted.
    """
    sum_nan = df.isnull().sum().to_frame()
    sum_nan.columns = ['#NaN']
    sum_nan_thresh = sum_nan[sum_nan['#NaN'] > 0]
    # Lift the row limit only while printing, so calling this helper does not
    # permanently change the notebook-wide pandas display settings.
    with pd.option_context('display.max_rows', None):
        print(sum_nan_thresh)

In [57]:
# pickle5 is a backport of pickle protocol 5 for older interpreters; when it
# is not installed, fall back to the standard-library module, which supports
# protocol 5 natively on Python 3.8+.
try:
    import pickle5 as pickle
except ImportError:
    import pickle

file_path = './T5/df_fe_embeded_by_line_base.pkl'
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open(file_path, "rb") as fh:
    df = pickle.load(fh)

In [58]:
# Show the first rows of df to check its columns and contents.
df.head()

Unnamed: 0,sample_id,severity,method,label,embeded_sequence_sum,embeded_sequence_avg,embeded_sequence
0,4256584,major,"public void send(byte[] data, int length, ...",1,"[0.21787217, -1.0906152, -0.608323, 1.0227113,...","[0.024208019, -0.12117946, -0.06759144, 0.1136...","[0.18041626, -0.12819904, -0.36536545, 0.17805..."
1,8922371,major,public void read(org.apache.thrift.protoco...,1,"[-10.9871435, -7.1662316, -9.392925, 6.0778, -...","[-0.16647187, -0.10857927, -0.14231706, 0.0920...","[0.15779866, 0.1421715, -0.42044684, 0.2931572..."
2,8653310,major,@Override public Iterator<Row> getRows(Ses...,1,"[-3.2137842, -13.560189, -20.947268, 14.573725...","[-0.036520276, -0.15409306, -0.23803712, 0.165...","[0.2592528, 0.17330371, -0.34510717, 0.0470411..."
3,4734605,major,private void finishRestore(final Timer.Conte...,1,"[-2.54196, -4.40375, -6.3736634, 2.5665374, 1....","[-0.08765379, -0.15185344, -0.2197815, 0.08850...","[0.1814259, 0.002315736, -0.36206213, 0.060624..."
4,4514232,major,public synchronized void start(BundleConte...,1,"[-8.150435, -12.652619, -16.6453, 15.762429, 1...","[-0.0768909, -0.119364336, -0.15703113, 0.1487...","[0.22148372, 0.0054409914, -0.40012845, 0.1510..."


In [59]:
# Build two frames that share the identifying columns and differ only in
# which embedding column (sum or average) they carry.
id_cols = ['sample_id', 'severity', 'label']
embeded_sum = df.loc[:, [*id_cols, 'embeded_sequence_sum']]
embeded_avg = df.loc[:, [*id_cols, 'embeded_sequence_avg']]
embeded_sum.head()

Unnamed: 0,sample_id,severity,label,embeded_sequence_sum
0,4256584,major,1,"[0.21787217, -1.0906152, -0.608323, 1.0227113,..."
1,8922371,major,1,"[-10.9871435, -7.1662316, -9.392925, 6.0778, -..."
2,8653310,major,1,"[-3.2137842, -13.560189, -20.947268, 14.573725..."
3,4734605,major,1,"[-2.54196, -4.40375, -6.3736634, 2.5665374, 1...."
4,4514232,major,1,"[-8.150435, -12.652619, -16.6453, 15.762429, 1..."


In [60]:
# Embedding width, taken from the first row by position (.iloc) so the lookup
# does not depend on the index containing the label 0.
n = len(embeded_avg['embeded_sequence_avg'].iloc[0])
n

768

In [61]:
# Expand each summed embedding vector into one em_<k> column per dimension.
columns = [f'em_{i+1}' for i in range(n)]
# Reuse the source index: the metadata columns below are assigned by index
# alignment, so a fresh 0..N-1 index would yield NaN for any row whose source
# label differs from its position.
data = pd.DataFrame(
    embeded_sum["embeded_sequence_sum"].to_list(),
    columns=columns,
    index=embeded_sum.index,
)
data['label'] = embeded_sum['label']
data['sample_id'] = embeded_sum['sample_id']
data['severity'] = embeded_sum['severity']
data.head()

Unnamed: 0,em_1,em_2,em_3,em_4,em_5,em_6,em_7,em_8,em_9,em_10,...,em_762,em_763,em_764,em_765,em_766,em_767,em_768,label,sample_id,severity
0,0.217872,-1.090615,-0.608323,1.022711,0.28768,1.277133,0.177108,0.809881,-0.007368,-0.463337,...,0.278133,1.511911,-1.294661,2.474487,-1.612359,3.926413,-2.790477,1,4256584,major
1,-10.987144,-7.166232,-9.392925,6.0778,-2.039392,8.447851,-1.056991,8.644148,0.712731,-6.017998,...,0.956027,12.716941,-2.651685,8.036318,3.19151,16.33943,-7.412573,1,8922371,major
2,-3.213784,-13.560189,-20.947268,14.573725,-6.044149,19.910698,8.801538,14.653162,1.38727,-7.470222,...,-2.271558,25.379471,-11.244059,11.318339,5.528104,25.563402,-30.205057,1,8653310,major
3,-2.54196,-4.40375,-6.373663,2.566537,1.11694,5.72269,2.058697,3.878052,-0.569363,-2.651576,...,1.587386,6.662918,-4.786423,3.659157,-1.920642,11.262209,-6.217933,1,4734605,major
4,-8.150435,-12.652619,-16.6453,15.762429,1.747681,14.453441,7.659223,11.01408,2.128191,-16.728395,...,-5.639894,30.726349,-15.064228,14.607963,4.813145,30.461773,-13.574982,1,4514232,major


In [62]:
dataset_path = './embedded_datasets/T5_base_line_sum.pkl'
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing.
Path(dataset_path).parent.mkdir(parents=True, exist_ok=True)
pd.to_pickle(data, dataset_path)

In [63]:
# Expand each averaged embedding vector into one em_<k> column per dimension.
columns = [f'em_{i+1}' for i in range(n)]
# Reuse the source index: the metadata columns below are assigned by index
# alignment, so a fresh 0..N-1 index would yield NaN for any row whose source
# label differs from its position.
data = pd.DataFrame(
    embeded_avg["embeded_sequence_avg"].to_list(),
    columns=columns,
    index=embeded_avg.index,
)
data['label'] = embeded_avg['label']
data['sample_id'] = embeded_avg['sample_id']
data['severity'] = embeded_avg['severity']
data.head()

Unnamed: 0,em_1,em_2,em_3,em_4,em_5,em_6,em_7,em_8,em_9,em_10,...,em_762,em_763,em_764,em_765,em_766,em_767,em_768,label,sample_id,severity
0,0.024208,-0.121179,-0.067591,0.113635,0.031964,0.141904,0.019679,0.089987,-0.000819,-0.051482,...,0.030904,0.16799,-0.143851,0.274943,-0.179151,0.436268,-0.310053,1,4256584,major
1,-0.166472,-0.108579,-0.142317,0.092088,-0.0309,0.127998,-0.016015,0.130972,0.010799,-0.091182,...,0.014485,0.192681,-0.040177,0.121762,0.048356,0.247567,-0.112312,1,8922371,major
2,-0.03652,-0.154093,-0.238037,0.165611,-0.068684,0.226258,0.100017,0.166513,0.015764,-0.084889,...,-0.025813,0.288403,-0.127773,0.128617,0.062819,0.290493,-0.343239,1,8653310,major
3,-0.087654,-0.151853,-0.219782,0.088501,0.038515,0.197334,0.07099,0.133726,-0.019633,-0.091434,...,0.054737,0.229756,-0.165049,0.126178,-0.066229,0.388352,-0.214411,1,4734605,major
4,-0.076891,-0.119364,-0.157031,0.148702,0.016488,0.136353,0.072257,0.103906,0.020077,-0.157815,...,-0.053207,0.289871,-0.142115,0.137811,0.045407,0.287375,-0.128066,1,4514232,major


In [64]:
dataset_path = './embedded_datasets/T5_base_line_avg.pkl'
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing.
Path(dataset_path).parent.mkdir(parents=True, exist_ok=True)
pd.to_pickle(data, dataset_path)

### Method embedding

In [65]:
# pickle5 is a backport of pickle protocol 5 for older interpreters; when it
# is not installed, fall back to the standard-library module, which supports
# protocol 5 natively on Python 3.8+.
try:
    import pickle5 as pickle
except ImportError:
    import pickle

file_path = './T5/df_fe_embeded_base.pkl'
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open(file_path, "rb") as fh:
    df = pickle.load(fh)

# Row count first, then a preview of the loaded frame.
print(len(df))
df.head()

2242


Unnamed: 0,sample_id,severity,method,label,embeded_sequence_sum,embeded_sequence_avg,embeded_sequence
0,4256584,major,"public void send(byte[] data, int length, ...",1,"[0.21787217, -1.0906152, -0.608323, 1.0227113,...","[0.024208019, -0.12117946, -0.06759144, 0.1136...","[-0.004100991, -0.03969452, -0.25590736, 0.024..."
1,8922371,major,public void read(org.apache.thrift.protoco...,1,"[-10.9871435, -7.1662316, -9.392925, 6.0778, -...","[-0.16647187, -0.10857927, -0.14231706, 0.0920...","[-0.017931785, -0.3142759, -0.24883649, 0.2399..."
2,8653310,major,@Override public Iterator<Row> getRows(Ses...,1,"[-3.2137842, -13.560189, -20.947268, 14.573725...","[-0.036520276, -0.15409306, -0.23803712, 0.165...","[0.021195134, -0.2461163, -0.284268, 0.1083875..."
3,4734605,major,private void finishRestore(final Timer.Conte...,1,"[-2.54196, -4.40375, -6.3736634, 2.5665374, 1....","[-0.08765379, -0.15185344, -0.2197815, 0.08850...","[-0.10045837, -0.22156432, -0.25885403, 0.1849..."
4,4514232,major,public synchronized void start(BundleConte...,1,"[-8.150435, -12.652619, -16.6453, 15.762429, 1...","[-0.0768909, -0.119364336, -0.15703113, 0.1487...","[-0.039614994, -0.3100464, -0.31415808, 0.2212..."


In [66]:
# Print the NaN count for each column of df that has missing values.
cols_with_null(df)

                  #NaN
embeded_sequence    12


In [67]:
# Discard every row that has a missing value in any column, then report how
# many rows are left.
df = df.dropna(axis=0, how='any')
df.shape[0]

2230

In [69]:
# Read the embedding width from the first remaining row by position (.iloc):
# df was filtered with dropna(), so the index label 0 is not guaranteed to
# exist any more and a label lookup could raise KeyError.
n = len(df["embeded_sequence"].iloc[0])
columns = [f'em_{i+1}' for i in range(n)]
# Spread each embedding vector over one em_<k> column per dimension.
df[columns] = df["embeded_sequence"].to_list()

In [70]:
# Remove the source text and the list-valued embedding columns in place,
# leaving the identifiers, the label and the expanded em_<k> columns.
raw_columns = ['method', 'embeded_sequence', 'embeded_sequence_sum', 'embeded_sequence_avg']
df.drop(raw_columns, axis=1, inplace=True)
df.head()

Unnamed: 0,sample_id,severity,label,em_1,em_2,em_3,em_4,em_5,em_6,em_7,...,em_759,em_760,em_761,em_762,em_763,em_764,em_765,em_766,em_767,em_768
0,4256584,major,1,-0.004101,-0.039695,-0.255907,0.024454,0.090049,0.249496,0.176672,...,-0.036493,0.085672,0.134634,0.119191,0.228669,0.006403,0.387788,0.100221,0.212947,-0.207011
1,8922371,major,1,-0.017932,-0.314276,-0.248836,0.239944,0.106475,0.044212,0.13249,...,-0.12284,0.096871,-0.017994,0.074335,0.080003,-0.022505,0.22396,0.057752,0.231872,0.038653
2,8653310,major,1,0.021195,-0.246116,-0.284268,0.108388,0.055387,0.304153,0.219867,...,-0.107084,-0.085128,0.00371,0.011389,0.38686,-0.113908,0.356603,-0.044271,0.247136,-0.117098
3,4734605,major,1,-0.100458,-0.221564,-0.258854,0.184937,0.143817,0.153768,0.120826,...,-0.076337,-0.091633,0.011502,0.120809,0.143588,-0.156842,0.327236,0.018643,0.280628,-0.086995
4,4514232,major,1,-0.039615,-0.310046,-0.314158,0.221212,0.045569,0.085394,0.118162,...,-0.145679,0.00325,-0.062364,0.139016,0.226858,-0.128519,0.247574,-0.038725,0.336489,-0.025836


In [71]:
dataset_path = './embedded_datasets/T5_base.pkl'
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing.
Path(dataset_path).parent.mkdir(parents=True, exist_ok=True)
pd.to_pickle(df, dataset_path)