In [7]:
import pickle
import os
import numpy as np
from tqdm import tqdm_notebook
from sklearn import model_selection
from sklearn import decomposition
import networkx as nx
from networkx.readwrite import write_gexf
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

### Loading APT Data

In [8]:
# Load the pickled dataset into `data`.
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file; only
# run this on a data/apt_data.pkl obtained from a trusted source.
# The `with` block guarantees the handle is closed even if unpickling fails.
with open("data/apt_data.pkl", "rb") as f:
    data = pickle.load(f)

In [9]:
# Build three parallel lists with one entry per retained sample:
#   file[n]       -> the sample's key in `data`
#   apt[n]        -> its 'apt' label with newline characters removed
#   embeddings[n] -> array of the 'embedding' values of its functions whose
#                    'lenght' field exceeds 20 ('lenght' is the key spelling read here)
# Samples with no function passing the filter, or labelled 'Lazarus Group', are skipped.
apt, file, embeddings = [], [], []
for sample_key, sample in data.items():
    kept = np.asarray(
        [fn['embedding'] for fn in sample['functions'] if fn['lenght'] > 20]
    )
    if kept.shape[0] == 0 or sample['apt'] == 'Lazarus Group':
        continue
    file.append(sample_key)
    apt.append(sample['apt'].replace("\n", ""))
    embeddings.append(kept)

# The lists are indexed together everywhere below, so they must stay aligned.
assert len(apt) == len(embeddings)

### Preparing folds for cross validation

In [10]:
# Partition the sample indices into 5 stratified folds (stratified on the APT label).
# Only the test indices of each split are kept: fold[n] is the index array of fold n,
# and the cross-validation cell recombines the folds itself.
# A fixed random_state makes the shuffled split, and therefore every reported
# number, reproducible across kernel restarts.
FOLD_SEED = 42
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=FOLD_SEED)
fold = [test_index for _, test_index in skf.split(embeddings, apt)]

In [11]:
def project(sample, space, SIM_THRESHOLD=0.95):
    """Describe `sample` by how strongly it matches each element of `space`.

    For every reference array in `space`, the pairwise dot products between the
    rows of `sample` and the rows of the reference are computed. The score is the
    number of `sample` rows with at least one dot product above `SIM_THRESHOLD`,
    divided by the larger of the two row counts.

    Args:
        sample: array whose rows are compared against each reference
            (assumes 2-D, one vector per row; the axis=1 reduction needs that).
        space: sequence of reference arrays (same assumption as `sample`).
        SIM_THRESHOLD: dot-product value that must be exceeded for a match.
            NOTE(review): this acts as a cosine-similarity cut-off only if the
            vectors are unit-normalised; confirm against the data.

    Returns:
        A list with one score per element of `space`, in the same order.
    """

    def _score(reference):
        similarity = np.dot(sample, reference.T)
        # A sample row counts once, no matter how many reference rows it matches.
        matched_rows = (similarity > SIM_THRESHOLD).any(axis=1).sum()
        return matched_rows / max(sample.shape[0], reference.shape[0])

    return [_score(reference) for reference in space]

In [13]:
# Hyper-parameter grid for the RBF-kernel SVM (each list currently holds one value).
C = [100]
gamma = [0.1]

# Derive the number of folds from `fold` instead of hard-coding it, so this loop
# stays in step with the split cell.
n_folds = len(fold)

# For every ordered pair (validation fold i, projection fold j != i):
#   * the samples of fold j define the projection space,
#   * the remaining folds are the training set,
#   * every training/validation sample is turned into a feature vector with project(),
#   * an SVM is fitted per grid point and the best one (by validation accuracy) is recorded.
# NOTE: the grid point is selected on the same fold whose accuracy is reported, so
# with more than one grid point the recorded accuracy would be optimistic.
result = []
for i in range(n_folds):
    print("Val fold: {}".format(i))
    val_emb = [embeddings[f] for f in fold[i]]
    val_apt = [apt[f] for f in fold[i]]
    for j in range(n_folds):
        if i == j:
            continue
        projection_emb = [embeddings[f] for f in fold[j]]
        train_fold = set(range(n_folds)).difference({i, j})
        train_emb = []
        train_apt = []
        for ff in train_fold:
            train_emb.extend([embeddings[f] for f in fold[ff]])
            train_apt.extend([apt[f] for f in fold[ff]])
        print("Projection fold: {} Train fold: {}".format(j, train_fold))

        print("Projecting training data")
        train_data = [project(emb, projection_emb) for emb in tqdm_notebook(train_emb)]

        print("Projecting validation data")
        val_data = [project(emb, projection_emb) for emb in tqdm_notebook(val_emb)]

        # Reset ALL "best so far" state for this (i, j) pair. Without resetting
        # best_predictions, a run whose accuracy never beats the initial value
        # would record the predictions of a previous pair (or raise NameError).
        max_acc = 0
        best_c = None
        best_gamma = None
        best_predictions = None

        print("Classification")
        for c in tqdm_notebook(C):
            for g in gamma:
                clf = SVC(C=c, gamma=g, kernel="rbf")
                clf.fit(train_data, train_apt)
                predictions = clf.predict(val_data)
                acc = metrics.accuracy_score(val_apt, predictions)
                # Always accept the first grid point, so an accuracy of exactly 0
                # still yields a consistent (C, gamma, predictions) record.
                if best_c is None or acc > max_acc:
                    best_c = c
                    best_gamma = g
                    best_predictions = predictions
                    max_acc = acc

        print("Best C: {}, Best GAMMA: {}, Validation Accuracy {}".format(best_c, best_gamma, max_acc))

        obj = {
            "val_fold": i,
            "projection_fold": j,
            "train_fold": train_fold,
            "C": best_c,
            "gamma": best_gamma,
            "accuracy": max_acc,
            "predictions": best_predictions,
            "label": val_apt
        }

        result.append(obj)


Val fold: 0
Projection fold: 1 Train fold: {2, 3, 4}
Projecting training data


HBox(children=(IntProgress(value=0, max=984), HTML(value='')))


Projecting validation data


HBox(children=(IntProgress(value=0, max=338), HTML(value='')))


Classification


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Best C: 100, Best GAMMA: 0.1, Validation Accuracy 0.878698224852071
Projection fold: 2 Train fold: {1, 3, 4}
Projecting training data


HBox(children=(IntProgress(value=0, max=988), HTML(value='')))


Projecting validation data


HBox(children=(IntProgress(value=0, max=338), HTML(value='')))


Classification


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Best C: 100, Best GAMMA: 0.1, Validation Accuracy 0.8579881656804734
Projection fold: 3 Train fold: {1, 2, 4}
Projecting training data


HBox(children=(IntProgress(value=0, max=990), HTML(value='')))

KeyboardInterrupt: 

In [None]:
import pickle
# Persist the list of per-run result dicts; the `with` block closes the file
# even if pickling raises part-way through.
with open('result_cross_val_svm.pkl', 'wb') as f:
    pickle.dump(result, f)

In [None]:
# NOTE: `i` is the loop variable left over from the cross-validation cell; this
# line fails on a fresh kernel where that cell has not been run.
actual_val_fold = i

# Deterministic label order taken from every run (a bare set() order can change
# between Python sessions, which would silently reorder the matrix axes).
labels = sorted({label for r in result for label in r['label']})

# Sum of the per-run confusion matrices: rows = true label, columns = predicted label.
cnf = np.zeros([len(labels), len(labels)])

# Per-key running sums and counts, so the final figures are true arithmetic means
# over the runs (repeatedly doing (old + new) / 2 would weight later runs more).
report_sums = {}
report_counts = {}
for r in result:
    # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
    report = metrics.classification_report(r['label'], r['predictions'], output_dict=True)
    cnf += metrics.confusion_matrix(r['label'], r['predictions'], labels=labels)
    for k, entry in report.items():
        report_counts[k] = report_counts.get(k, 0) + 1
        if isinstance(entry, dict):
            # Per-class and averaged rows: a dict of precision/recall/f1-score/support.
            sums = report_sums.setdefault(k, {})
            for metric_name, value in entry.items():
                sums[metric_name] = sums.get(metric_name, 0.0) + value
        else:
            # Scalar entries such as 'accuracy'.
            report_sums[k] = report_sums.get(k, 0.0) + entry

# Mean of every metric (including support) over the runs in which the key appeared.
cum_report = {}
for k, entry in report_sums.items():
    if isinstance(entry, dict):
        cum_report[k] = {name: total / report_counts[k] for name, total in entry.items()}
    else:
        cum_report[k] = entry / report_counts[k]

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Replace the display name stored at position 4 of `labels` before plotting.
# NOTE(review): which class sits at index 4 depends entirely on how `labels` was
# ordered when it was built; confirm the index (or rename by value) before
# trusting the axis names in the plot.
labels[4] = 'Sandworm'

In [None]:
# Wrap the aggregated confusion matrix in a labelled frame and divide every row
# by its own total, so each cell is a fraction of that row.
cm_counts = pd.DataFrame(cnf, index=labels, columns=labels)
row_totals = cm_counts.sum(axis=1)
df_cm = cm_counts.div(row_totals, axis=0)

# Draw the annotated heatmap on an explicit 10x10 figure and save it as a PNG.
fig, ax = plt.subplots(figsize=(10, 10))
heatmap = sns.heatmap(df_cm, annot=True, fmt=".2f", square=True, cmap=sns.cm.rocket_r, ax=ax)
fig.savefig("confusion_matrix_cross_val_svm.png")

In [None]:
# Turn the aggregated report dict into a table: the dict's top-level keys become
# rows after the transpose, and all values are rounded to 3 decimals.
# NOTE: this rebinds `report`, which the aggregation loop used for per-run dicts.
report = pd.DataFrame(cum_report).T.round(3)

In [None]:
# Bare expression: the table is rendered as this cell's output.
report

In [None]:
# Export the report table as LaTeX; the `with` block closes the file even if
# to_latex() or the write fails.
with open("classification_result_svm_cross_val.tex", "w") as f:
    f.write(report.to_latex())