In [42]:
import numpy as np
import pandas as pd
import pandasql as pdsql
from datetime import datetime
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.ensemble import *
from sklearn.metrics import accuracy_score, roc_curve, auc



def pysql(q):
    """Run the SQL string `q` through pandasql against this module's globals."""
    return pdsql.sqldf(q, globals())


# Feature export: ordered by dump_id and indexed by it (the dump_id column is kept too).
features = pd.read_csv('data/amico-features-export.csv.gz', compression='gzip').sort_values(by=['dump_id'])
features = features.set_index(features['dump_id'])

# Metadata export: same ordering/indexing, with 'date' parsed to datetimes and rows
# restricted to dump_ids that also appear in the feature export.
metadata = pd.read_csv('data/amico-export.csv.gz', compression='gzip').sort_values(by=['dump_id'])
metadata['date'] = pd.to_datetime(metadata['date'])
has_features = metadata['dump_id'].isin(features['dump_id'])
metadata = metadata[has_features]
metadata = metadata.set_index(metadata['dump_id'])

print ("Number of rows in metadata =", len(metadata))
print ("Number of rows in features =", len(features))

Number of rows in metadata = 121545
Number of rows in features = 121545


In [43]:
# Attach the selected metadata columns to the feature rows. No `on=` is given, so the
# join is index-to-index; metadata columns whose names clash get the '_d' suffix.
data = features.join(metadata[['dump_id','date','md5','host','type','max_tavs','max_avs','score']], how='inner', rsuffix='_d')

# 'dump_id' is both the index name and a column of `data`, so
# sort_values(by=['dump_id']) is ambiguous: older pandas warns ("Defaulting to
# column ...") and newer releases raise. The index was set from that same column,
# so ordering by the index gives the same row order without the ambiguity.
data = data.sort_index()

print ("Number of rows in data after join =", len(data))

Defaulting to column, but this will raise an ambiguity error in a future version
  


Number of rows in data after join = 121545


In [44]:
# Work on a copy and keep only rows that have both an AV count ('max_avs') and a
# score ('score' is missing where the RF is not giving a score).
dataset = data.copy()
for required_col in ('max_avs', 'score'):
    dataset = dataset.loc[dataset[required_col].notna()]
    print ("Number of remaining rows =", len(dataset))

Number of remaining rows = 112472
Number of remaining rows = 112361


In [45]:
def label_downloads(avs_count, avs_count1, threshold):
    """Assign a 'malware' / 'unknown' / 'benign' label to each download.

    Parameters
    ----------
    avs_count : iterable of numbers
        Per-download count compared against `threshold` (the caller passes
        the 'max_tavs' column).
    avs_count1 : iterable of numbers
        Per-download count checked for being positive (the caller passes the
        'max_avs' column). Must have at least as many entries as `avs_count`.
    threshold : number
        Minimum `avs_count` value for a download to be labelled 'malware'.

    Returns
    -------
    list of str
        One label per entry of `avs_count`:
        'malware' when avs_count >= threshold;
        'unknown' when avs_count1 > 0 and avs_count < threshold;
        'benign' otherwise (this includes NaN counts, for which every
        comparison is False).

    Raises
    ------
    IndexError
        If `avs_count1` has fewer entries than `avs_count`.
    """
    trusted_counts = list(avs_count)
    all_counts = list(avs_count1)
    if len(all_counts) < len(trusted_counts):
        raise IndexError("avs_count1 has fewer entries than avs_count")

    labels = []
    for trusted, total in zip(trusted_counts, all_counts):
        if trusted >= threshold:
            labels.append('malware')
        # Compare against `threshold`, not a literal 2: with a hard-coded 2, any
        # threshold above 2 would label partly-detected downloads as 'benign'.
        # The explicit `<` is kept so NaN counts still fall through to 'benign'.
        elif total > 0 and trusted < threshold:
            labels.append('unknown')
        else:
            labels.append('benign')
    return labels

# Label every row from its two AV counts, then discard the rows labelled 'unknown'.
threshold = 2
avs_count, avs_count1 = dataset['max_tavs'], dataset['max_avs']
dataset['avs5'] = label_downloads(avs_count, avs_count1, threshold)

is_labelled = dataset['avs5'].ne('unknown')
dataset = dataset.loc[is_labelled]
print ("Number of remaining rows =", len(dataset))

n_instances, n_columns = dataset.shape
print ("Dataset: instances =",n_instances, "  features =",n_columns)


Number of remaining rows = 94626
Dataset: instances = 94626   features = 80


In [46]:
# Frequency of each label and of each file type in the labelled dataset.
labels_count = Counter(dataset['avs5'].tolist())
type_count = Counter(dataset['type'].tolist())

print ("Labels count:", labels_count)
print ("Type count:", type_count)

Labels count: Counter({'benign': 90077, 'malware': 4549})
Type count: Counter({'EXE': 36779, 'JAR': 31613, 'APK': 16069, 'DMG': 10165})


In [47]:
# One-hot-encoding: each column listed in `cfs` is removed and its indicator columns
# (named '<column>_<value>') are appended on the right, in the order of `cfs`.
cfs = ['extension_class','type']
for fn in cfs:
    ohe_feat = pd.get_dummies(dataset[fn], prefix=fn)
    dataset = pd.concat((dataset.drop(columns=[fn]), ohe_feat), axis=1)

n_instances, n_columns = dataset.shape
print ("Dataset: instances =",n_instances, "  features =",n_columns)

Dataset: instances = 94626   features = 88


In [50]:
# Temporal split: rows dated before the cut-off train the model, rows dated on or
# after it are held out for testing. (The variable name's spelling is kept as-is.)
trainining_end_date = '2017-04-01'

download_date = dataset['date']
training_data = dataset.loc[download_date < trainining_end_date]
test_data = dataset.loc[download_date >= trainining_end_date]

print ("Number of training data rows =", len(training_data))
print ("Number of test data rows =", len(test_data))

# Class balance of each split; `labels_count` ends up holding the test-set counts.
for caption, subset in (("Training labels count:", training_data),
                        ("Test labels count:", test_data)):
    labels_count = Counter(subset['avs5'].tolist())
    print (caption, labels_count)

Number of training data rows = 36379
Number of test data rows = 58247
Training labels count: Counter({'benign': 34373, 'malware': 2006})
Test labels count: Counter({'benign': 55704, 'malware': 2543})


In [57]:
# Columns excluded from the model inputs.
remove_cols = ['hash_daily_dump_rate_per_client', 'url_struct_malware_downloads', 'url_struct_total_downloads', 'url_struct_distinct_sha1s', 'vt_month_shelf', 'score', 'raw_dump_trusted_av_labels', 'raw_dump_num_av_labels', 'dump_id','corrupt','sha1','host','url_struct','dump_id_d','date','md5','host_d','max_tavs','max_avs']

# Target labels are taken from the 'avs5' column of each split.
train_Y = training_data['avs5'].tolist()
test_Y  = test_data['avs5'].tolist()

# drop() returns new frames, so training_data / test_data are left untouched;
# a name in remove_cols that is not a column raises KeyError.
train = training_data.drop(columns=remove_cols)
test = test_data.drop(columns=remove_cols)

for split_name, split in (("Train: instances =", train), ("Test:  instances =", test)):
    print (split_name,split.shape[0], "  features =",split.shape[1])

#train.fillna(-1,inplace=True)
#test.fillna(-1,inplace=True)

#test.to_csv("data/data/tmp_test.csv")
#train.to_csv("data/data/tmp_train.csv")

Train: instances = 36379   features = 69
Test:  instances = 58247   features = 69


In [58]:
def compute_partial_auc(fpr, tpr, fpr_max):
    """Normalised area under the ROC curve restricted to FPR in [0, fpr_max].

    Parameters
    ----------
    fpr, tpr : array-like of float
        ROC curve coordinates of equal length, with `fpr` in non-decreasing
        order (the order produced by `roc_curve`).
    fpr_max : float
        Upper false-positive-rate bound of the region; must be positive.

    Returns
    -------
    float
        Trapezoidal area of the curve over [min(fpr), fpr_max], divided by
        `fpr_max`.

    Raises
    ------
    ValueError
        If `fpr_max` is not positive.
    """
    if fpr_max <= 0:
        raise ValueError("fpr_max must be positive")

    fpr = np.asarray(fpr, dtype=float)
    tpr = np.asarray(tpr, dtype=float)
    inside = fpr <= fpr_max

    # Close the region at fpr_max with the curve's own (linearly interpolated) TPR
    # there. Using tpr[-1], the TPR at the far end of the full curve, would
    # overstate the area whenever the curve has not yet reached that value.
    tpr_at_max = np.interp(fpr_max, fpr, tpr)
    partial_fpr = np.append(fpr[inside], fpr_max)
    partial_tpr = np.append(tpr[inside], tpr_at_max)

    # Trapezoidal rule written out explicitly.
    widths = np.diff(partial_fpr)
    mean_heights = (partial_tpr[1:] + partial_tpr[:-1]) / 2.0
    partial_auc = np.sum(widths * mean_heights) / fpr_max
    return partial_auc

In [59]:
# Random-forest baseline. Remaining missing values are imputed with per-column means
# computed on the training set; the same training means are applied to the test set.
def RF_2(train, test, train_labels=None, test_labels=None):
    """Fit a random forest on `train`, score `test`, and return the ROC curve.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames with the same columns. They are not modified; imputed
        copies are used internally.
    train_labels, test_labels : sequence of str, optional
        Class labels aligned with the rows of `train` / `test`. When omitted,
        the module-level `train_Y` / `test_Y` are used, which is how the
        function is called elsewhere in this notebook.

    Returns
    -------
    (fpr, tpr) : tuple of numpy arrays
        ROC curve of the 'malware' class probability on the test set.

    Raises
    ------
    ValueError
        If 'malware' is not among the classes seen during fitting.
    """
    if train_labels is None:
        train_labels = train_Y
    if test_labels is None:
        test_labels = test_Y

    # Impute both splits from training statistics only, so the test set is
    # transformed exactly like the data the model was fitted on.
    train_means = train.mean()
    train_filled = train.fillna(train_means)
    test_filled = test.fillna(train_means)

    clf = RandomForestClassifier(n_estimators=100,random_state=12345, min_impurity_decrease= 0.000000015)
    clf.fit(train_filled, train_labels)

    Y_hat = clf.predict(test_filled)
    acc = accuracy_score(test_labels, Y_hat)
    # NOTE: `acc` is a fraction in [0, 1], despite the '%' in the message below.
    print('Accuracy:', acc, '%.')

    # predict_proba columns follow clf.classes_, so look the column up by label
    # instead of assuming a fixed ('benign', 'malware') order.
    malware_col = list(clf.classes_).index('malware')
    malware_scores = clf.predict_proba(test_filled)[:, malware_col]
    fpr, tpr, _ = roc_curve(test_labels, malware_scores, pos_label='malware')
    return (fpr,tpr)
    

In [60]:
def plot_ROC(fpr, tpr, perc, fpr_max=0.01, title='ROC curves'):
    """Plot the low-FPR part of a ROC curve, save it as a PDF, and show it.

    Parameters
    ----------
    fpr, tpr : array-like of float
        ROC curve coordinates as fractions in [0, 1]; they are plotted as
        percentages.
    perc : any
        Value shown in the legend and embedded in the output file name
        'data/missing_values/percentage_missing <perc>.pdf'.
    fpr_max : float, default 0.01
        Right edge of the plotted region (as a fraction) and the bound passed
        to `compute_partial_auc`.
    title : str, default 'ROC curves'
        Figure title.
    """
    # Accept plain lists as well as arrays: the curve is scaled by 100 below.
    fpr = np.asarray(fpr, dtype=float)
    tpr = np.asarray(tpr, dtype=float)

    plt.figure(num=None, figsize=(10, 7))
    lw = 3

    # Light dashed guide lines at fixed TP (%) and FP (%) levels.
    for tp_level in (100, 90, 80):
        plt.plot([0, 100], [tp_level, tp_level], color='lightgray', lw=1, linestyle='--')
    for fp_level in (1.0, 0.5, 0.2, 0.1):
        plt.plot([fp_level, fp_level], [50, 105], color='lightgray', lw=1, linestyle='--')

    # Use the curve passed in as arguments; reading the notebook globals fpr1/tpr1
    # here would label every plot with the PAUC of whichever curve was computed last.
    pauc = compute_partial_auc(fpr, tpr, fpr_max)
    plt.plot(fpr*100, tpr*100, color= 'blue', lw=lw, label='Perc = %s, PAUC = %.4f' % (perc, pauc))

    plt.xlim([0.0, fpr_max*100])
    plt.ylim([50, 105])
    plt.xlabel('False Positive (%)')
    plt.ylabel('True Positives (%)')
    plt.title(title)
    plt.legend(loc="lower right")
    # Save before show(): the figure is written to disk while it is still current.
    plt.savefig('data/missing_values/percentage_missing '+ str(perc) +'.pdf')

    plt.show()
   
    
    
#plot_ROC(fpr,tpr)

In [61]:
# Working copies that the later cells extend and impute.
tmp_train = train.copy()
tmp_test = test.copy()

# Columns of the training frame that contain at least one missing value.
null_fraction = train.isnull().mean()
temp_cols = train.columns[null_fraction > 0]
temp_cols

Index(['host_malware_downloads', 'host_suspicious_downloads',
       'host_benign_downloads', 'host_total_downloads', 'host_malware_ratio',
       'host_suspicious_ratio', 'host_benign_ratio', 'host_avg_av_labels',
       'host_avg_trusted_labels', 'host_unknown_hashes', 'host_total_hashes',
       'host_unknown_hash_ratio', 'twold_malware_downloads',
       'twold_suspicious_downloads', 'twold_benign_downloads',
       'twold_total_downloads', 'twold_malware_ratio',
       'twold_suspicious_ratio', 'twold_benign_ratio', 'twold_avg_av_labels',
       'twold_avg_trusted_labels', 'twold_unknown_hashes',
       'twold_total_hashes', 'twold_unknown_hash_ratio',
       'server_ip_malware_downloads', 'server_ip_suspicious_downloads',
       'server_ip_benign_downloads', 'server_ip_total_downloads',
       'server_ip_malware_ratio', 'server_ip_suspicious_ratio',
       'server_ip_benign_ratio', 'server_ip_avg_av_labels',
       'server_ip_avg_trusted_labels', 'server_ip_unknown_hashes',
     

In [63]:
#count = 0
#def cate_isnull (a, b):
#    print(a, ' ', b)
#    if a is None & b == 'benign':
#        return -1
#    elif a is None & b =='malware':
#        return -2
    
    #else:
        #print(row['avs5'])
        #return float(0.0)
    
def column_isnull (row, c):
    """Return a label-dependent placeholder for a missing value in `row[c]`.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide `row[c]` and `row['avs5']`.
    c : hashable
        Key/column to inspect.

    Returns
    -------
    -1.0 if `row[c]` is missing and the row's 'avs5' label is 'benign',
    -2.0 if it is missing and the label is 'malware',
    otherwise `row[c]` unchanged.
    """
    value = row[c]
    if pd.isna(value):
        if row['avs5'] == 'benign':
            return float(-1.0)
        elif row['avs5'] == 'malware':
            return float(-2.0)
    # Pass existing values (and missing values with any other label) through
    # unchanged; falling off the end would return None and, when the result is
    # assigned back to the column, erase every non-missing entry.
    return value


#tmp_train['categorical_isnull_server_ip_malware_ratio'] = np.where(tmp_train['server_ip_malware_ratio'].isnull(), 1.0, 0.0)

# For every column listed in temp_cols, add a float indicator column
# 'categorical_isnull_<column>' to both frames: 1.0 where the value is missing,
# 0.0 where it is present.
for c in temp_cols:
    flag_name = 'categorical_isnull_%s' %c
    tmp_train[flag_name] = tmp_train[c].isnull().astype(float)
    tmp_test[flag_name] = tmp_test[c].isnull().astype(float)

d = 'avs5'

#for c in temp_cols:
#    tmp_train[c] = np.where(((tmp_train[c].isnull()) & (tmp_train.avs5 == 'malware')), tmp_train[c].fillna(-1), inplace=True), tmp_train[c])
#    tmp_train[c] = np.where(((tmp_train[c].isnull()) & (tmp_train.avs5 == 'benign')), tmp_train[c].fillna(-1), inplace=True), tmp_train[c])
    
#for c in temp_cols:
#    del tmp_test[c]
#    del tmp_train[c]

# Replace every remaining missing value with the sentinel -1, then write both frames
# out as CSV (test first, then train).
tmp_train = tmp_train.fillna(-1)
tmp_test = tmp_test.fillna(-1)

for frame, csv_path in ((tmp_test, "data/data/tmp_test.csv"),
                        (tmp_train, "data/data/tmp_train.csv")):
    frame.to_csv(csv_path)

In [33]:
# Value shown in the plot legend and used in the saved figure's file name.
x = 1.0

# The label column must not be fed to the classifier as a feature.
# Like the original `del`, this raises KeyError if the cell is run a second time.
tmp_train = tmp_train.drop(columns=['avs5'])
tmp_test = tmp_test.drop(columns=['avs5'])

# Train/evaluate the forest, then plot the resulting ROC curve.
fpr1, tpr1 = RF_2(tmp_train, tmp_test)

plot_ROC(fpr1, tpr1, x)

Accuracy: 0.9959482891822755 %.
