In [23]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import researchpy as rp
import numpy as np
import datetime
from sklearn.utils import compute_sample_weight
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [24]:
# Load the pickled CuBERT embedding frame (the file name suggests averaged embeddings).
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load trusted files.
data_path = './CuBERT/fe_cubert_avg.pkl'

cubert_df = (
    pd.read_pickle(data_path)
    # The integer sample id is the part of the 'name' value before the first dot.
    .assign(
        sample_id=lambda frame: frame['name']
        .map(lambda file_name: file_name.split('.')[0])
        .astype(int)
    )
    # The raw name and the label stored in the pickle are discarded here.
    .drop(columns=['name', 'label'])
)

cubert_df.head(10)

Unnamed: 0,embedding,sample_id
0,"[-0.05614383791883786, 0.06216126208504041, 0....",3698323
1,"[0.41810743790119886, 0.1511763632297516, 0.39...",3699849
2,"[-0.04154935077979015, 0.08485619573002964, 0....",3702984
3,"[-0.10684733948594816, 0.06884372122334065, 0....",3705164
4,"[-0.12888911486039928, 0.0778285141022485, 0.5...",3710201
5,"[[-0.10964469941424541, 0.1887121375222675, 0....",3711605
6,"[-0.013053571184476217, 0.1543080136179924, 0....",3717506
7,"[0.03741505186333031, 0.09719843585349497, 0.3...",3721456
8,"[[-0.06871621487962623, 0.032618473389095834, ...",3725605
9,"[[-0.07038389176791192, 0.0691476946381035, 0....",3732429


In [25]:
# Read the per-sample class table and discard its 'method' column right away.
df = pd.read_csv('./data_class.csv').drop(columns=['method'])
df.head(10)

Unnamed: 0,sample_id,severity
0,4432196,major
1,7391055,major
2,4687786,major
3,3797964,major
4,5339993,major
5,7689561,major
6,5589404,major
7,8968865,major
8,6603072,major
9,7134420,major


In [26]:
# Inner join on the columns the two frames share, keeping only rows present in both.
cubert_df = cubert_df.merge(df, how='inner')

# Entries whose length is not 1024 are replaced by their first element
# (the earlier preview shows some embeddings wrapped in an extra list level).
cubert_df['embedding'] = [
    vec if len(vec) == 1024 else vec[0] for vec in cubert_df['embedding']
]
cubert_df.head(10)

Unnamed: 0,embedding,sample_id,severity
0,"[-0.05614383791883786, 0.06216126208504041, 0....",3698323,none
1,"[0.41810743790119886, 0.1511763632297516, 0.39...",3699849,none
2,"[-0.04154935077979015, 0.08485619573002964, 0....",3702984,none
3,"[-0.10684733948594816, 0.06884372122334065, 0....",3705164,none
4,"[-0.12888911486039928, 0.0778285141022485, 0.5...",3710201,none
5,"[-0.10964469941424541, 0.1887121375222675, 0.2...",3711605,none
6,"[-0.013053571184476217, 0.1543080136179924, 0....",3717506,none
7,"[0.03741505186333031, 0.09719843585349497, 0.3...",3721456,none
8,"[-0.06871621487962623, 0.032618473389095834, 0...",3725605,none
9,"[-0.07038389176791192, 0.0691476946381035, 0.3...",3732429,none


In [27]:
# Binary target: 0 when severity is 'none', 1 for every other value.
cubert_df['label'] = (cubert_df['severity'] != 'none').astype(int)

In [28]:
from operator import itemgetter  # not used in this cell; kept in case later cells rely on it

# Embedding width, read from the first row by position rather than by index label.
n = len(cubert_df['embedding'].iloc[0])
print(n)

columns = [f'em_{i+1}' for i in range(n)]

# Build all em_* columns as one frame and attach them with a single concat.
# Assigning this many new columns through cubert_df[columns] = ... inserts them
# one at a time, which fragments the frame and triggers pandas' PerformanceWarning.
embedding_df = pd.DataFrame(
    cubert_df['embedding'].to_list(), columns=columns, index=cubert_df.index
)
cubert_df = pd.concat([cubert_df.drop(columns=['embedding']), embedding_df], axis=1)

# Persist the expanded table (ids, severity, label and one column per embedding dimension).
cubert_df.to_pickle('./embedded_datasets/cubert_embedding_avg.pkl')

1024


In [29]:
# Preview the first five rows of the expanded feature table.
cubert_df.head(n=5)

Unnamed: 0,sample_id,severity,label,em_1,em_2,em_3,em_4,em_5,em_6,em_7,...,em_1015,em_1016,em_1017,em_1018,em_1019,em_1020,em_1021,em_1022,em_1023,em_1024
0,3698323,none,0,-0.056144,0.062161,0.176291,-0.114446,-0.389548,0.059869,0.453178,...,0.107001,-0.050882,-0.259201,-0.054166,0.11771,0.153722,0.031202,-0.219465,0.067949,-0.543244
1,3699849,none,0,0.418107,0.151176,0.391457,-0.083568,-0.662845,0.546049,0.47501,...,0.065242,-0.152,-0.002404,-0.336428,-0.032583,0.269119,-0.252046,-0.483657,0.180706,-0.622255
2,3702984,none,0,-0.041549,0.084856,0.284307,-0.240086,-0.501371,0.433321,0.472064,...,-0.10296,-0.147274,-0.167974,-0.270716,0.077317,0.186185,-0.178721,-0.295359,0.266334,-0.556279
3,3705164,none,0,-0.106847,0.068844,0.393554,-0.294914,-0.477019,0.360863,0.262062,...,-0.114485,-0.2093,-0.029854,-0.236748,-0.176676,0.286778,-0.051387,-0.161994,0.156257,-0.33026
4,3710201,none,0,-0.128889,0.077829,0.509433,-0.430482,-0.350971,0.436374,0.11623,...,-0.093216,-0.334977,0.032514,-0.412093,-0.287623,0.425673,0.024389,-0.414011,0.186636,-0.323789
