In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf

# CNN metadata:

In [2]:
# Read the four source datasets (cle, vir, hun, swi), one DataFrame per CSV,
# in the same order the files are listed.
cle, vir, hun, swi = map(
    pd.read_csv,
    [
        'cle_metadata_cnn.csv',
        'vir_metadata_cnn.csv',
        'hun_metadata_cnn.csv',
        'swi_metadata_cnn.csv',
    ],
)

In [3]:
# Split every dataset on its own (33% held out) with one shared, fixed seed so
# the partitions are the same on each run.
split_options = {'test_size': 0.33, 'random_state': 42}
cle_train, cle_test = train_test_split(cle, **split_options)
vir_train, vir_test = train_test_split(vir, **split_options)
hun_train, hun_test = train_test_split(hun, **split_options)
swi_train, swi_test = train_test_split(swi, **split_options)

In [4]:
# Write the cle train/test partitions to disk without the index column; the
# per-dataset evaluation cells read these files back by path.
for frame, csv_path in (
    (cle_train, 'cle_metadata_cnn_train.csv'),
    (cle_test, 'cle_metadata_cnn_test.csv'),
):
    frame.to_csv(csv_path, index=False)

In [5]:
# Write the vir train/test partitions to disk without the index column; the
# per-dataset evaluation cells read these files back by path.
for frame, csv_path in (
    (vir_train, 'vir_metadata_cnn_train.csv'),
    (vir_test, 'vir_metadata_cnn_test.csv'),
):
    frame.to_csv(csv_path, index=False)

In [6]:
# Write the hun train/test partitions to disk without the index column; the
# per-dataset evaluation cells read these files back by path.
for frame, csv_path in (
    (hun_train, 'hun_metadata_cnn_train.csv'),
    (hun_test, 'hun_metadata_cnn_test.csv'),
):
    frame.to_csv(csv_path, index=False)

In [7]:
# Write the swi train/test partitions to disk without the index column; the
# per-dataset evaluation cells read these files back by path.
for frame, csv_path in (
    (swi_train, 'swi_metadata_cnn_train.csv'),
    (swi_test, 'swi_metadata_cnn_test.csv'),
):
    frame.to_csv(csv_path, index=False)

In [8]:
# Pool the per-dataset partitions into one combined training frame and one
# combined test frame (rows stacked in cle, vir, hun, swi order).
# NOTE: the name `Test` is rebound to a function in a later cell, so this cell
# and the feature/label cell that follows must run before that definition.
train_parts = [cle_train, vir_train, hun_train, swi_train]
test_parts = [cle_test, vir_test, hun_test, swi_test]
Train = pd.concat(train_parts)
Test = pd.concat(test_parts)

In [9]:
def _is_positive(label):
    """Return 1 when the label is greater than zero, otherwise 0."""
    return 1 if label > 0 else 0


# The last column is the target; every column before it is a feature.
X_train, y_train = Train.iloc[:, :-1], Train.iloc[:, -1]
X_test, y_test = Test.iloc[:, :-1], Test.iloc[:, -1]

# Collapse the target to two classes: 0 stays 0, any positive value becomes 1.
Y_train_binary = y_train.apply(_is_positive)
Y_test_binary = y_test.apply(_is_positive)

In [10]:
# function for deep learning testing
def Test(path_train,path_test,model_name):
    """Evaluate a saved Keras model on one dataset's held-out CSV.

    Prints the confusion matrix, the classification report (4 digits) and the
    0-based row positions of the misclassified test samples.

    Parameters
    ----------
    path_train : str
        Path of the matching training CSV. Accepted so existing calls keep
        working, but not read: evaluation only needs the test split.
    path_test : str
        Path of the test CSV. The last column is the target, all preceding
        columns are features.
    model_name : str
        Path handed to ``tf.keras.models.load_model``.

    Returns
    -------
    None
    """
    test_df = pd.read_csv(path_test)

    X_test = test_df.iloc[:, :-1]
    # binarize the target: any positive label becomes class 1
    Y_test_binary = test_df.iloc[:, -1].apply(lambda x: 1 if x > 0 else 0)

    model = tf.keras.models.load_model(model_name)

    # The model emits one score per class; the predicted class is the index of
    # the highest score.
    Y_pred = model.predict(X_test).argmax(axis=1)

    # scikit-learn expects (y_true, y_pred): rows are true classes, columns are
    # predicted classes. The arguments were previously passed the other way
    # round, which printed the transposed matrix.
    cm = confusion_matrix(Y_test_binary, Y_pred)
    print(cm)
    print(classification_report(Y_test_binary, Y_pred, digits=4))

    # Positions (in test-file row order) where prediction and label disagree.
    mismatch = [i for i, (a,b) in enumerate(zip(Y_pred, Y_test_binary)) if a != b]
    print(mismatch)

# CNN

In [None]:
# Stop training once the training loss has gone 5 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# define the model architecture: two stacked conv layers + pooling, a third
# conv layer + pooling, then a dense head. Inputs have shape (64, 1).
model = Sequential()
model.add(Conv1D(filters=256, kernel_size=3, activation='relu', input_shape=(64,1)))
model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=512, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Flatten())
model.add(Dense(512, activation='relu'))
# Softmax (not sigmoid): the two outputs are mutually exclusive classes scored
# with sparse_categorical_crossentropy, so they should form one probability
# distribution rather than two independent sigmoids.
model.add(Dense(2, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, Y_train_binary, epochs=1000, batch_size=32,callbacks=[callback])
# Predicted class = index of the larger of the two output scores.
Y_pred = model.predict(X_test).argmax(axis=1)

# scikit-learn expects (y_true, y_pred): rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(Y_test_binary, Y_pred)
print(cm)
print(classification_report(Y_test_binary, Y_pred, digits=4))

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000

In [None]:
# Write the trained CNN to an .h5 file so it can be reloaded by path.
cnn_model_path = '../Models/Meta_Only/CNNMeta_CNN.h5'
model.save(cnn_model_path)

# Test on each dataset

In [None]:
# Evaluate the saved CNN on each dataset's own held-out split, one after the
# other, with a separator line between consecutive reports.
model = '../Models/Meta_Only/CNNMeta_CNN.h5'
cnn_eval_runs = [
    ("Cle test", 'cle_metadata_cnn_train.csv', 'cle_metadata_cnn_test.csv'),
    ("vir test", 'vir_metadata_cnn_train.csv', 'vir_metadata_cnn_test.csv'),
    ("hun test", 'hun_metadata_cnn_train.csv', 'hun_metadata_cnn_test.csv'),
    ("swi test", 'swi_metadata_cnn_train.csv', 'swi_metadata_cnn_test.csv'),
]
for run_index, (run_label, path_train, path_test) in enumerate(cnn_eval_runs):
    if run_index > 0:
        print("==================")
    print(run_label)
    Test(path_train, path_test, model)

# DNN

In [None]:
# Stop training once the training loss has gone 5 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# define the model architecture: three widening dense layers over the 64
# input features, followed by a 2-class output layer.
model = Sequential()
model.add(Dense(64, input_shape=(64,), activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
# Softmax (not sigmoid): the two outputs are mutually exclusive classes scored
# with sparse_categorical_crossentropy, so they should form one probability
# distribution rather than two independent sigmoids.
model.add(Dense(2, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, Y_train_binary, epochs=1000, batch_size=32,callbacks=[callback])
# Predicted class = index of the larger of the two output scores.
Y_pred = model.predict(X_test).argmax(axis=1)

# scikit-learn expects (y_true, y_pred): rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(Y_test_binary, Y_pred)
print(cm)
print(classification_report(Y_test_binary, Y_pred, digits=4))

In [None]:
# Write the trained DNN to an .h5 file so it can be reloaded by path.
dnn_model_path = '../Models/Meta_Only/CNNMeta_DNN.h5'
model.save(dnn_model_path)

# Test on each dataset

In [None]:
# Evaluate the saved DNN on each dataset's own held-out split, one after the
# other, with a separator line between consecutive reports.
model = '../Models/Meta_Only/CNNMeta_DNN.h5'
dnn_eval_runs = [
    ("Cle test", 'cle_metadata_cnn_train.csv', 'cle_metadata_cnn_test.csv'),
    ("vir test", 'vir_metadata_cnn_train.csv', 'vir_metadata_cnn_test.csv'),
    ("hun test", 'hun_metadata_cnn_train.csv', 'hun_metadata_cnn_test.csv'),
    ("swi test", 'swi_metadata_cnn_train.csv', 'swi_metadata_cnn_test.csv'),
]
for run_index, (run_label, path_train, path_test) in enumerate(dnn_eval_runs):
    if run_index > 0:
        print("==================")
    print(run_label)
    Test(path_train, path_test, model)

# RNN

In [None]:
from keras.layers import Dense, SimpleRNN

In [None]:
# Stop training once the training loss has gone 5 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# define the model architecture: three stacked SimpleRNN layers over inputs of
# shape (64, 1). The first two return full sequences so the next recurrent
# layer receives one vector per step; the last returns only its final state.
model = Sequential()
model.add(SimpleRNN(units=64, return_sequences=True, input_shape=(64, 1)))
model.add(SimpleRNN(units=32, return_sequences=True))
model.add(SimpleRNN(units=16))
# Softmax (not sigmoid): the two outputs are mutually exclusive classes scored
# with sparse_categorical_crossentropy, so they should form one probability
# distribution rather than two independent sigmoids.
model.add(Dense(units=2, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, Y_train_binary, epochs=1000, batch_size=32,callbacks=[callback])
# Predicted class = index of the larger of the two output scores.
Y_pred = model.predict(X_test).argmax(axis=1)

# scikit-learn expects (y_true, y_pred): rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(Y_test_binary, Y_pred)
print(cm)
print(classification_report(Y_test_binary, Y_pred, digits=4))

In [None]:
# Write the trained RNN to an .h5 file so it can be reloaded by path.
rnn_model_path = '../Models/Meta_Only/CNNMeta_RNN.h5'
model.save(rnn_model_path)

In [None]:
# Evaluate the saved RNN on each dataset's own held-out split, one after the
# other, with a separator line between consecutive reports.
model = '../Models/Meta_Only/CNNMeta_RNN.h5'
rnn_eval_runs = [
    ("Cle test", 'cle_metadata_cnn_train.csv', 'cle_metadata_cnn_test.csv'),
    ("vir test", 'vir_metadata_cnn_train.csv', 'vir_metadata_cnn_test.csv'),
    ("hun test", 'hun_metadata_cnn_train.csv', 'hun_metadata_cnn_test.csv'),
    ("swi test", 'swi_metadata_cnn_train.csv', 'swi_metadata_cnn_test.csv'),
]
for run_index, (run_label, path_train, path_test) in enumerate(rnn_eval_runs):
    if run_index > 0:
        print("==================")
    print(run_label)
    Test(path_train, path_test, model)

# Machine Learning

# Decision Tree

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt

In [None]:
# decision tree
# Splits are chosen by entropy. random_state is pinned (same seed as the
# random forest cell) so the fitted tree is reproducible across runs.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X_train, Y_train_binary)
Y_predictions = clf.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(Y_test_binary, Y_predictions)
print("Confusion Matrix is:")
print(cm)
def accuracy(confusion_matrix):
    """Return the fraction of correct predictions from a confusion matrix.

    The diagonal holds the correctly classified counts, so accuracy is the
    trace divided by the sum of all cells.
    """
    diagonal_sum = confusion_matrix.trace()
    sum_of_all_elements = confusion_matrix.sum()
    return diagonal_sum / sum_of_all_elements
print("Accuracy is : " + str(accuracy(cm)))

print("Report")
print(classification_report(Y_test_binary, Y_predictions))

#Visualization of the tree
plt.figure(figsize=(25,20))
tree.plot_tree(clf, filled=True, fontsize=10, max_depth= None,feature_names = list(X_train.columns), class_names = True)
plt.show()

In [None]:
# Persist the fitted decision tree so it can be reloaded by path.
# NOTE(review): the Keras models in this notebook are saved under
# '../Models/Meta_Only/' (capital O) while this path uses 'Meta_only'; confirm
# both directories are intended on case-sensitive filesystems.
from joblib import dump, load
dt_model_path = '../Models/Meta_only/CNNMeta_dt.joblib'
dump(clf, dt_model_path)

In [None]:
# function for Decision tree, and random forest testing
def Test_DT(path_train,path_test,model_name):
    """Evaluate a joblib-saved estimator on one dataset's held-out CSV.

    Prints the confusion matrix, the classification report (4 digits) and the
    0-based row positions of the misclassified test samples.

    Parameters
    ----------
    path_train : str
        Path of the matching training CSV. Accepted so existing calls keep
        working, but not read: evaluation only needs the test split.
    path_test : str
        Path of the test CSV. The last column is the target, all preceding
        columns are features.
    model_name : str
        Path of the serialized estimator, loaded with joblib's ``load``.
        Only load files you created yourself; joblib files are pickle-based.

    Returns
    -------
    None
    """
    test_df = pd.read_csv(path_test)

    X_test = test_df.iloc[:, :-1]
    # binarize the target: any positive label becomes class 1
    Y_test_binary = test_df.iloc[:, -1].apply(lambda x: 1 if x > 0 else 0)

    model = load(model_name)

    # The estimator's predict() already returns class labels, so no argmax.
    Y_pred = model.predict(X_test)

    # scikit-learn expects (y_true, y_pred): rows are true classes, columns are
    # predicted classes. The arguments were previously passed the other way
    # round, which printed the transposed matrix.
    cm = confusion_matrix(Y_test_binary, Y_pred)
    print(cm)
    print(classification_report(Y_test_binary, Y_pred, digits=4))

    # Positions (in test-file row order) where prediction and label disagree.
    mismatch = [i for i, (a,b) in enumerate(zip(Y_pred, Y_test_binary)) if a != b]
    print(mismatch)

In [None]:
# Evaluate the saved decision tree on each dataset's own held-out split, one
# after the other, with a separator line between consecutive reports.
model = '../Models/Meta_only/CNNMeta_dt.joblib'
dt_eval_runs = [
    ("Cle test", 'cle_metadata_cnn_train.csv', 'cle_metadata_cnn_test.csv'),
    ("vir test", 'vir_metadata_cnn_train.csv', 'vir_metadata_cnn_test.csv'),
    ("hun test", 'hun_metadata_cnn_train.csv', 'hun_metadata_cnn_test.csv'),
    ("swi test", 'swi_metadata_cnn_train.csv', 'swi_metadata_cnn_test.csv'),
]
for run_index, (run_label, path_train, path_test) in enumerate(dt_eval_runs):
    if run_index > 0:
        print("==================")
    print(run_label)
    Test_DT(path_train, path_test, model)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 trees on the pooled training set, with a fixed seed so
# the fitted forest is reproducible.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, Y_train_binary)
Y_pred = classifier.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(Y_test_binary, Y_pred)
print(cm)
print(classification_report(Y_test_binary, Y_pred))

In [None]:
# Persist the fitted random forest so it can be reloaded by path.
rf_model_path = '../Models/Meta_only/CNNMeta_rf.joblib'
dump(classifier, rf_model_path)

In [None]:
# Evaluate the saved random forest on each dataset's own held-out split, one
# after the other, with a separator line between consecutive reports.
model = '../Models/Meta_only/CNNMeta_rf.joblib'
rf_eval_runs = [
    ("Cle test", 'cle_metadata_cnn_train.csv', 'cle_metadata_cnn_test.csv'),
    ("vir test", 'vir_metadata_cnn_train.csv', 'vir_metadata_cnn_test.csv'),
    ("hun test", 'hun_metadata_cnn_train.csv', 'hun_metadata_cnn_test.csv'),
    ("swi test", 'swi_metadata_cnn_train.csv', 'swi_metadata_cnn_test.csv'),
]
for run_index, (run_label, path_train, path_test) in enumerate(rf_eval_runs):
    if run_index > 0:
        print("==================")
    print(run_label)
    Test_DT(path_train, path_test, model)

# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings, fitted on the pooled
# training set. Labels are flattened to a 1-D array before fitting.
svm_train_labels = Y_train_binary.values.ravel()
svc = SVC()
svc.fit(X_train, svm_train_labels)
y_pred = svc.predict(X_test)

# Rows of the matrix are true classes, columns are predicted classes.
svm_confusion = confusion_matrix(Y_test_binary, y_pred)
svm_report = classification_report(Y_test_binary, y_pred)
print(svm_confusion)
print(svm_report)

In [None]:
# Persist the fitted SVM so it can be reloaded by path.
import joblib
svm_model_path = "../Models/Meta_only/CNNMeta_svm.pkl"
joblib.dump(svc, svm_model_path)

In [None]:
# Evaluate the saved SVM on each dataset's own held-out split, one after the
# other, with a separator line between consecutive reports.
model = '../Models/Meta_only/CNNMeta_svm.pkl'
svm_eval_runs = [
    ("Cle test", 'cle_metadata_cnn_train.csv', 'cle_metadata_cnn_test.csv'),
    ("vir test", 'vir_metadata_cnn_train.csv', 'vir_metadata_cnn_test.csv'),
    ("hun test", 'hun_metadata_cnn_train.csv', 'hun_metadata_cnn_test.csv'),
    ("swi test", 'swi_metadata_cnn_train.csv', 'swi_metadata_cnn_test.csv'),
]
for run_index, (run_label, path_train, path_test) in enumerate(svm_eval_runs):
    if run_index > 0:
        print("==================")
    print(run_label)
    Test_DT(path_train, path_test, model)

# Bayesian Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the pooled training set.
classifier = GaussianNB()
classifier.fit(X_train, Y_train_binary)
Y_pred = classifier.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(Y_test_binary, Y_pred)
print(cm)
print(classification_report(Y_test_binary, Y_pred))

# Save the naive Bayes model itself. `clf` is the decision tree fitted in an
# earlier cell; dumping it here would make the NB evaluation silently report
# decision-tree results.
dump(classifier, '../Models/Meta_only/CNNMeta_NB.joblib')

In [None]:
# Evaluate the model stored at CNNMeta_NB.joblib on each dataset's own
# held-out split, with a separator line between consecutive reports.
model = '../Models/Meta_only/CNNMeta_NB.joblib'
nb_eval_runs = [
    ("Cle test", 'cle_metadata_cnn_train.csv', 'cle_metadata_cnn_test.csv'),
    ("vir test", 'vir_metadata_cnn_train.csv', 'vir_metadata_cnn_test.csv'),
    ("hun test", 'hun_metadata_cnn_train.csv', 'hun_metadata_cnn_test.csv'),
    ("swi test", 'swi_metadata_cnn_train.csv', 'swi_metadata_cnn_test.csv'),
]
for run_index, (run_label, path_train, path_test) in enumerate(nb_eval_runs):
    if run_index > 0:
        print("==================")
    print(run_label)
    Test_DT(path_train, path_test, model)