## Training Set Balancing

### 1. Importing Tokenized Training-Set CSV

In [3]:
import pandas as pd
import json

# Read the tokenized training split from disk.
opc = pd.read_csv('./split_tt/train.csv')

# The 'opcode' cells are JSON-encoded strings; decode each one back into a Python object.
opc['opcode'] = opc['opcode'].apply(json.loads)

opc

Unnamed: 0,opcode,swc_label
0,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0
1,"[2, 2, 10, 31, 4, 12, 5, 13, 2, 4, 22, 8, 6, 2...",0
2,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0
3,"[2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...",0
4,"[2, 2, 10, 2, 30, 62, 3, 32, 25, 11, 4, 25, 26...",1
...,...,...
13995,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...",0
13996,"[2, 2, 10, 2, 30, 62, 3, 32, 25, 11, 4, 25, 26...",1
13997,"[2, 2, 10, 50, 12, 5, 13, 25, 62, 2, 30, 32, 1...",0
13998,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...",0


### 2. Applying Over-Sampling Techniques on the Training Dataset

### 2.A. SMOTE

In [5]:
from imblearn.over_sampling import SMOTE

# Build the SMOTE resampler, seeded through random_state.
sm = SMOTE(random_state=60)

# Features (X): every token list is expanded into its own row of columns,
# keeping the original row index. Labels (Y): the 'swc_label' column.
X = pd.DataFrame(opc['opcode'].tolist(), index=opc['opcode'].index)
Y = opc['swc_label']

# Resample the training data with SMOTE.
X_sm, Y_sm = sm.fit_resample(X, Y)

# Pack each resampled feature row back into a single list-valued 'opcode' cell.
temp_df = pd.DataFrame()
temp_df['opcode'] = X_sm.values.tolist()
temp_df['swc_label'] = Y_sm.values.tolist()

# Write the resampled training set to CSV, omitting the index column.
temp_df.to_csv(f'./split_ds/opcode_sm_TRAIN.csv', index=False)

print('Done Applying SMOTE')

Done Applying SMOTE


### 2.B. SMOTETomek

In [None]:
from imblearn.combine import SMOTETomek

# Build the SMOTETomek resampler, seeded through random_state.
smt = SMOTETomek(random_state=60)

# Features (X): every token list is expanded into its own row of columns,
# keeping the original row index. Labels (Y): the 'swc_label' column.
# The training frame has no 'label' column, so the previous opc['label']
# lookup raised a KeyError.
X = pd.DataFrame(opc['opcode'].tolist(), index=opc['opcode'].index)
Y = opc['swc_label']

# Resample the training data with SMOTETomek.
# fit_resample is the same entry point the SMOTE cell uses; the older
# fit_sample alias is not available in recent imbalanced-learn releases.
X_smt, Y_smt = smt.fit_resample(X, Y)

# Pack each resampled feature row back into a single list-valued 'opcode' cell.
# The label column is named 'swc_label' so this file has the same schema as the
# other resampled CSVs written by this notebook.
temp_df = pd.DataFrame()
temp_df['opcode'] = X_smt.values.tolist()
temp_df['swc_label'] = Y_smt.values.tolist()

# Write the resampled training set to CSV, omitting the index column.
temp_df.to_csv(f'./split_ds/opcode_smt_TRAIN.csv', index=False)

print('Done Applying SMOTETomek')

: 

### 2.C. ADASYN

In [None]:
from imblearn.over_sampling import ADASYN

# Build the ADASYN resampler, seeded through random_state and configured
# with sampling_strategy='minority'.
ada = ADASYN(random_state=60, sampling_strategy='minority')

# Features (X): every token list is expanded into its own row of columns,
# keeping the original row index. Labels (Y): the 'swc_label' column.
X = pd.DataFrame(opc['opcode'].tolist(), index=opc['opcode'].index)
Y = opc['swc_label']

# Resample the training data with ADASYN.
# fit_resample is the same entry point the SMOTE cell uses; the older
# fit_sample alias is not available in recent imbalanced-learn releases.
X_ada, Y_ada = ada.fit_resample(X, Y)

# Pack each resampled feature row back into a single list-valued 'opcode' cell.
temp_df = pd.DataFrame()
temp_df['opcode'] = X_ada.values.tolist()
temp_df['swc_label'] = Y_ada.values.tolist()

# Write the resampled training set to CSV, omitting the index column.
temp_df.to_csv(f'./split_ds/opcode_ada_TRAIN.csv', index=False)

print('Done Applying ADASYN')

: 

## Test Set Balancing

### 1. Importing Tokenized Testing-Set CSV

In [6]:
# Read the tokenized testing split from disk.
opc_test = pd.read_csv('./split_tt/test.csv')

# The 'opcode' cells are JSON-encoded strings; decode each one back into a Python object.
opc_test['opcode'] = opc_test['opcode'].apply(json.loads)

opc_test

Unnamed: 0,opcode,swc_label
0,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...",0
1,"[2, 2, 10, 50, 12, 5, 13, 25, 2, 2, 23, 2, 30,...",1
2,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...",0
3,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 2, 2, 23, 2, ...",0
4,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 2, 2, 23, 2, ...",0
...,...,...
5995,"[2, 2, 10, 50, 12, 5, 13, 25, 2, 2, 23, 2, 30,...",1
5996,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...",0
5997,"[21, 66, 26, 2, 2, 10, 2, 50, 33, 5, 13, 25, 6...",0
5998,"[2, 2, 10, 2, 50, 33, 5, 13, 25, 62, 2, 30, 32...",1


### 2. Applying Under-Sampling Techniques on the Testing Dataset

### 2.A. RandomUnderSampler

In [7]:
from imblearn.under_sampling import RandomUnderSampler

# Build the RandomUnderSampler, seeded through random_state.
rus = RandomUnderSampler(random_state=60)

# Features (X): every token list is expanded into its own row of columns,
# keeping the original row index. Labels (Y): the 'swc_label' column.
X = pd.DataFrame(opc_test['opcode'].tolist(), index=opc_test['opcode'].index)
Y = opc_test['swc_label']

# Resample the testing data with RandomUnderSampler.
X_rus, Y_rus = rus.fit_resample(X, Y)

# Pack each resampled feature row back into a single list-valued 'opcode' cell.
temp_df = pd.DataFrame()
temp_df['opcode'] = X_rus.values.tolist()
temp_df['swc_label'] = Y_rus.values.tolist()

# Write the resampled testing set to CSV, omitting the index column.
temp_df.to_csv(f'./split_ds/opcode_rus_TEST.csv', index=False)

print('Done Applying RandomUnderSampler')

Done Applying RandomUnderSampler


: 