In [1]:
import csv
import numpy as np
import pandas as pd
import ClassifierModels as cm

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
def standardize_dataset(df, y_column_name):
    '''Standardize every feature column of ``df`` and leave the target as is.

       Input  = df (pandas DataFrame holding the features and the target),
                y_column_name (label of the target column in ``df``)
       Output = tuple ``(result, np_scaled, y)`` where
                result    - DataFrame of the scaled features followed by the
                            unscaled target column,
                np_scaled - array returned by ``StandardScaler.fit_transform``,
                y         - the target column taken from ``df``.
    '''
    y = df[y_column_name]
    X = df.drop([y_column_name], axis=1)

    # standardize
    sc = StandardScaler()
    np_scaled = sc.fit_transform(X)
    # Keep X's index on the scaled frame: pd.concat(axis=1) aligns rows by
    # index label, so a fresh RangeIndex here would misalign the features
    # with y (and introduce NaN rows) whenever df has a non-default index.
    df_normalized = pd.DataFrame(np_scaled, columns=X.columns, index=X.index)

    result = pd.concat([df_normalized, y], axis=1)

    return result, np_scaled, y

In [3]:
#Read files
def file_reader(file_path):
    '''Input = file path (str)
       Output = numpy array of items in files

       Each non-blank line of the file becomes one row; the line is split on
       any run of whitespace and every token is converted to ``int``.
       Raises ``ValueError`` if a token is not a valid integer literal.
    '''

    data = []
    with open(file_path) as f:
        for line in f:
            # str.split() with no argument splits on spaces and tabs alike
            # and never yields empty tokens.
            tokens = line.split()
            if not tokens:
                # Skip blank / whitespace-only lines: an empty row would make
                # the resulting array ragged.
                continue
            data.append([int(token) for token in tokens])
    return np.asarray(data)

In [4]:
## arcene
# Merge the ARCENE train and validation splits into one feature matrix
# (X_data) and one encoded label vector (y).
dbName = 'arcene'

arcene_train_X = file_reader('hd-datasets/ARCENE/arcene_train.data')
arcene_test_X = file_reader('hd-datasets/ARCENE/arcene_valid.data')

# Flatten the label arrays to 1-D so they can be appended as a column.
arcene_train_y = file_reader('hd-datasets/ARCENE/arcene_train.labels')
arcene_train_y = np.ravel(arcene_train_y)
arcene_test_y = file_reader('hd-datasets/ARCENE/arcene_valid.labels')
arcene_test_y = np.ravel(arcene_test_y)

# Labels go in the last column of each split; the splits are then stacked.
arcene_train = np.column_stack( (arcene_train_X,arcene_train_y) )
arcene_test = np.column_stack( (arcene_test_X,arcene_test_y) )
# np.vstack instead of np.row_stack: the latter is a deprecated alias.
arcene = np.vstack( (arcene_train,arcene_test) )

data_df = pd.DataFrame.from_records(arcene)
# The label sits in the last column (positional column labels 0..n-1), so
# derive its label from the array width rather than hard-coding it.
y_column_name = arcene.shape[1] - 1

# Re-encode the raw label values as consecutive integers starting at 0.
le = LabelEncoder()
data_df[y_column_name] = le.fit_transform(data_df[y_column_name])

y = data_df[y_column_name].to_numpy()
X_data = data_df.drop([y_column_name], axis=1).to_numpy()

In [5]:
# Evaluate the classifier selected by `clfmodel` on the currently loaded
# dataset. cm.crossValid is a project helper; judging by its arguments and
# result names it presumably runs 5-fold cross-validation - TODO confirm.
clfmodel = 'GBT'
accuracy_mean, accuracy_std = cm.crossValid(X_data, y, clfmodel, nfolds=5)

# One value per line: dataset name, then the two returned statistics.
print(dbName, accuracy_mean, accuracy_std, sep='\n')

              precision    recall  f1-score   support

           0       0.83      0.91      0.87        22
           1       0.88      0.78      0.82        18

    accuracy                           0.85        40
   macro avg       0.85      0.84      0.85        40
weighted avg       0.85      0.85      0.85        40

              precision    recall  f1-score   support

           0       0.84      0.95      0.89        22
           1       0.93      0.78      0.85        18

    accuracy                           0.88        40
   macro avg       0.89      0.87      0.87        40
weighted avg       0.88      0.88      0.87        40

              precision    recall  f1-score   support

           0       0.73      0.73      0.73        22
           1       0.67      0.67      0.67        18

    accuracy                           0.70        40
   macro avg       0.70      0.70      0.70        40
weighted avg       0.70      0.70      0.70        40

              preci

In [6]:
## gene
dbName = 'gene'

# Every column (target included) is read as float32; 'Class' is the target.
y_column_name = 'Class'
data_df = pd.read_csv('hd-datasets/gene/gene-modified.csv', dtype=np.float32)

# Split the frame into the feature matrix and the label vector.
X_data = data_df.drop(columns=[y_column_name]).to_numpy()
y = data_df[y_column_name].to_numpy()

In [7]:
# Evaluate the classifier selected by `clfmodel` on the currently loaded
# dataset. cm.crossValid is a project helper; judging by its arguments and
# result names it presumably runs 5-fold cross-validation - TODO confirm.
clfmodel = 'GBT'
accuracy_mean, accuracy_std = cm.crossValid(X_data, y, clfmodel, nfolds=5)

# One value per line: dataset name, then the two returned statistics.
print(dbName, accuracy_mean, accuracy_std, sep='\n')

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99        60
         1.0       1.00      1.00      1.00        16
         2.0       1.00      0.97      0.98        29
         3.0       1.00      1.00      1.00        28
         4.0       1.00      1.00      1.00        28

    accuracy                           0.99       161
   macro avg       1.00      0.99      0.99       161
weighted avg       0.99      0.99      0.99       161

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        60
         1.0       1.00      1.00      1.00        15
         2.0       1.00      1.00      1.00        29
         3.0       1.00      1.00      1.00        29
         4.0       1.00      1.00      1.00        27

    accuracy                           1.00       160
   macro avg       1.00      1.00      1.00       160
weighted avg       1.00      1.00      1.00       160

              precisio

In [8]:
# gisette
# Merge the GISETTE train and validation splits into one feature matrix
# (X_data) and one encoded label vector (y).
dbName = 'gisette'

gisette_train_X = file_reader('hd-datasets/gisette/gisette_train.data')
gisette_test_X = file_reader('hd-datasets/gisette/gisette_valid.data')

# Flatten the label arrays to 1-D so they can be appended as a column.
gisette_train_y = file_reader('hd-datasets/gisette/gisette_train.labels')
gisette_train_y = np.ravel(gisette_train_y)
gisette_test_y = file_reader('hd-datasets/gisette/gisette_valid.labels')
gisette_test_y = np.ravel(gisette_test_y)

# Labels go in the last column of each split; the splits are then stacked.
gisette_train = np.column_stack( (gisette_train_X,gisette_train_y) )
gisette_test = np.column_stack( (gisette_test_X,gisette_test_y) )
# np.vstack instead of np.row_stack: the latter is a deprecated alias.
gisette = np.vstack( (gisette_train,gisette_test) )

data_df = pd.DataFrame.from_records(gisette)
# The label sits in the last column (positional column labels 0..n-1), so
# derive its label from the array width rather than hard-coding it.
y_column_name = gisette.shape[1] - 1

# Re-encode the raw label values as consecutive integers starting at 0.
le = LabelEncoder()
data_df[y_column_name] = le.fit_transform(data_df[y_column_name])

y = data_df[y_column_name].to_numpy()
X_data = data_df.drop([y_column_name], axis=1).to_numpy()

In [9]:
# Evaluate the classifier selected by `clfmodel` on the currently loaded
# dataset. cm.crossValid is a project helper; judging by its arguments and
# result names it presumably runs 5-fold cross-validation - TODO confirm.
clfmodel = 'GBT'
accuracy_mean, accuracy_std = cm.crossValid(X_data, y, clfmodel, nfolds=5)

# One value per line: dataset name, then the two returned statistics.
print(dbName, accuracy_mean, accuracy_std, sep='\n')

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       700
           1       0.99      0.97      0.98       700

    accuracy                           0.98      1400
   macro avg       0.98      0.98      0.98      1400
weighted avg       0.98      0.98      0.98      1400

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       700
           1       0.98      0.98      0.98       700

    accuracy                           0.98      1400
   macro avg       0.98      0.98      0.98      1400
weighted avg       0.98      0.98      0.98      1400

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       700
           1       0.97      0.97      0.97       700

    accuracy                           0.97      1400
   macro avg       0.97      0.97      0.97      1400
weighted avg       0.97      0.97      0.97      1400

              preci

In [10]:
## madelon
# Merge the MADELON train and validation splits into one feature matrix
# (X_data) and one encoded label vector (y).
dbName = 'madelon'

madelon_train_X = file_reader('hd-datasets/MADELON/madelon_train.data')
madelon_test_X = file_reader('hd-datasets/MADELON/madelon_valid.data')

# Flatten the label arrays to 1-D so they can be appended as a column.
madelon_train_y = file_reader('hd-datasets/MADELON/madelon_train.labels')
madelon_train_y = np.ravel(madelon_train_y)
madelon_test_y = file_reader('hd-datasets/MADELON/madelon_valid.labels')
madelon_test_y = np.ravel(madelon_test_y)

# Labels go in the last column of each split; the splits are then stacked.
madelon_train = np.column_stack( (madelon_train_X,madelon_train_y) )
madelon_test = np.column_stack( (madelon_test_X,madelon_test_y) )
# np.vstack instead of np.row_stack: the latter is a deprecated alias.
madelon = np.vstack( (madelon_train,madelon_test) )

data_df = pd.DataFrame.from_records(madelon)
# The label sits in the last column (positional column labels 0..n-1), so
# derive its label from the array width rather than hard-coding it.
y_column_name = madelon.shape[1] - 1

# Re-encode the raw label values as consecutive integers starting at 0.
le = LabelEncoder()
data_df[y_column_name] = le.fit_transform(data_df[y_column_name])

y = data_df[y_column_name].to_numpy()
X_data = data_df.drop([y_column_name], axis=1).to_numpy()

In [11]:
# Evaluate the classifier selected by `clfmodel` on the currently loaded
# dataset. cm.crossValid is a project helper; judging by its arguments and
# result names it presumably runs 5-fold cross-validation - TODO confirm.
clfmodel = 'GBT'
accuracy_mean, accuracy_std = cm.crossValid(X_data, y, clfmodel, nfolds=5)

# One value per line: dataset name, then the two returned statistics.
print(dbName, accuracy_mean, accuracy_std, sep='\n')

              precision    recall  f1-score   support

           0       0.82      0.84      0.83       260
           1       0.84      0.81      0.82       260

    accuracy                           0.83       520
   macro avg       0.83      0.83      0.83       520
weighted avg       0.83      0.83      0.83       520

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       260
           1       0.82      0.82      0.82       260

    accuracy                           0.82       520
   macro avg       0.82      0.82      0.82       520
weighted avg       0.82      0.82      0.82       520

              precision    recall  f1-score   support

           0       0.84      0.85      0.85       260
           1       0.85      0.84      0.84       260

    accuracy                           0.84       520
   macro avg       0.84      0.84      0.84       520
weighted avg       0.84      0.84      0.84       520

              preci

In [12]:
## parkinson
dbName = 'parkinson'

# Every column (target included) is read as float32; 'class' is the target.
y_column_name = 'class'
data_df = pd.read_csv('hd-datasets/parkinson/pd_speech_features.csv', dtype=np.float32)

# Split the frame into the feature matrix and the label vector.
X_data = data_df.drop(columns=[y_column_name]).to_numpy()
y = data_df[y_column_name].to_numpy()

In [13]:
# Evaluate the classifier selected by `clfmodel` on the currently loaded
# dataset. cm.crossValid is a project helper; judging by its arguments and
# result names it presumably runs 5-fold cross-validation - TODO confirm.
clfmodel = 'GBT'
accuracy_mean, accuracy_std = cm.crossValid(X_data, y, clfmodel, nfolds=5)

# One value per line: dataset name, then the two returned statistics.
print(dbName, accuracy_mean, accuracy_std, sep='\n')

              precision    recall  f1-score   support

         0.0       0.85      0.56      0.68        39
         1.0       0.87      0.96      0.91       113

    accuracy                           0.86       152
   macro avg       0.86      0.76      0.79       152
weighted avg       0.86      0.86      0.85       152

              precision    recall  f1-score   support

         0.0       0.83      0.63      0.72        38
         1.0       0.89      0.96      0.92       113

    accuracy                           0.87       151
   macro avg       0.86      0.79      0.82       151
weighted avg       0.87      0.87      0.87       151

              precision    recall  f1-score   support

         0.0       0.92      0.61      0.73        38
         1.0       0.88      0.98      0.93       113

    accuracy                           0.89       151
   macro avg       0.90      0.79      0.83       151
weighted avg       0.89      0.89      0.88       151

              preci

In [14]:
## malware
dbName = 'malware'

# The data set is shipped as two CSV files; both are read with every column
# (target included) as float32.
df_1, df_2 = (
    pd.read_csv(csv_path, dtype=np.float32)
    for csv_path in (
        'hd-datasets/VxHeaven/staDynVt2955Lab.csv',
        'hd-datasets/VxHeaven/staDynVxHeaven2698Lab.csv',
    )
)

# Stack the two frames row-wise and renumber the rows 0..n-1.
frames = [df_1, df_2]
data_df = pd.concat(frames, ignore_index=True)

y_column_name = 'label'

# Split the combined frame into the feature matrix and the label vector.
X_data = data_df.drop(columns=[y_column_name]).to_numpy()
y = data_df[y_column_name].to_numpy()

In [15]:
# Evaluate the classifier selected by `clfmodel` on the currently loaded
# dataset. cm.crossValid is a project helper; judging by its arguments and
# result names it presumably runs 5-fold cross-validation - TODO confirm.
clfmodel = 'GBT'
accuracy_mean, accuracy_std = cm.crossValid(X_data, y, clfmodel, nfolds=5)

# One value per line: dataset name, then the two returned statistics.
print(dbName, accuracy_mean, accuracy_std, sep='\n')

              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96       540
         1.0       0.96      0.96      0.96       591

    accuracy                           0.96      1131
   macro avg       0.96      0.96      0.96      1131
weighted avg       0.96      0.96      0.96      1131

              precision    recall  f1-score   support

         0.0       0.95      0.98      0.97       540
         1.0       0.98      0.96      0.97       591

    accuracy                           0.97      1131
   macro avg       0.97      0.97      0.97      1131
weighted avg       0.97      0.97      0.97      1131

              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96       540
         1.0       0.97      0.97      0.97       591

    accuracy                           0.97      1131
   macro avg       0.97      0.97      0.97      1131
weighted avg       0.97      0.97      0.97      1131

              preci