In [1]:
import numpy as np
import pandas as pd
import pandasql as pdsql
from datetime import datetime

pysql = lambda q: pdsql.sqldf(q, globals())

features = pd.read_csv('data/amico-features-export.csv.gz', compression='gzip')
features.sort_values(by=['dump_id'],inplace=True)
features.set_index(features['dump_id'],inplace=True)

metadata = pd.read_csv('data/amico-export.csv.gz', compression='gzip')
metadata.sort_values(by=['dump_id'],inplace=True)
metadata['date'] = pd.to_datetime(metadata['date'])
metadata = metadata[metadata['dump_id'].isin(features['dump_id'])]
metadata.set_index(metadata['dump_id'],inplace=True)

print ("Number of rows in metadata =", len(metadata))
print ("Number of rows in features =", len(features))


Number of rows in metadata = 121545
Number of rows in features = 121545


In [2]:
# metadata = metadata.loc[metadata['type'] == 'DMG']
print ("Number of rows in metadata =", len(metadata))

Number of rows in metadata = 121545


In [20]:
data = features.join(metadata[['dump_id','date','md5','host','type','max_tavs','max_avs','score']], how='inner', rsuffix='_d')
data.sort_values(by=['dump_id'],inplace=True)

print ("Number of rows in data after join =", len(data))

Defaulting to column, but this will raise an ambiguity error in a future version
  


Number of rows in data after join = 121545


In [5]:
from collections import Counter

def filter_count(count_dict, th=0):
    new_dict = dict()
    for k in count_dict:
        if count_dict[k] > th:
            new_dict[k] = count_dict[k]
    return new_dict

md5_count = Counter(data['md5'])
host_count = Counter(data['host'])

md5_count_filtered = dict()
host_count_filtered = dict()

# md5_count_filtered = filter_count(md5_count,100)
# host_count_filtered = filter_count(host_count,1000)

In [6]:
# filtering out popular md5s and hosts
dataset = data.copy()
print ("Number of rows =", len(dataset))
dataset = dataset.loc[~dataset['md5'].isin(md5_count_filtered.keys())]
print ("Number of remaining rows =", len(dataset))
dataset = dataset.loc[~dataset['host'].isin(host_count_filtered.keys())]
print ("Number of remaining rows =", len(dataset))
dataset = dataset.loc[~dataset['max_tavs'].isnull()]
print ("Number of remaining rows =", len(dataset))
# Removing those values where RF is not giving score
dataset = dataset.loc[~dataset['score'].isnull()]
print ("Number of remaining rows =", len(dataset))

Number of rows = 121545
Number of remaining rows = 121545
Number of remaining rows = 121545
Number of remaining rows = 112472
Number of remaining rows = 112361


In [12]:
def label_downloads(avs_count, avs_count1, threshold):
    avs_count = list(avs_count)
    avs_count1 = list(avs_count1)
    labels = ['benign']*len(avs_count)
    for i in range(len(avs_count)):
        if avs_count[i] >= threshold : 
            labels[i] = 'malware'
        elif ((avs_count1[i] > 0) and (avs_count[i] < 2)) :
            labels[i] = 'unknown'
    return labels

threshold = 2
avs_count = dataset['max_tavs']
avs_count1 = dataset['max_avs']
dataset['avs5'] = label_downloads(avs_count, avs_count1, threshold)

dataset = dataset.loc[dataset['avs5'] != 'unknown']
print ("Number of remaining rows =", len(dataset))

print ("Dataset: instances =",dataset.shape[0], "  features =",dataset.shape[1])




Number of remaining rows = 94626
Dataset: instances = 94626   features = 80


In [13]:
labels_count = Counter(list(dataset['avs5']))
print ("Labels count:", labels_count)

type_count = Counter(list(dataset['type']))
print ("Type count:", type_count)

Labels count: Counter({'benign': 90077, 'malware': 4549})
Type count: Counter({'EXE': 36779, 'JAR': 31613, 'APK': 16069, 'DMG': 10165})


In [14]:
# One-hot-encoding
cfs = ['extension_class','type']
for fn in cfs:
    ohe_feat = pd.get_dummies(dataset[fn], prefix=fn)
    dataset = dataset.drop([fn], axis=1)
    dataset = pd.concat((dataset, ohe_feat), axis=1)

print ("Dataset: instances =",dataset.shape[0], "  features =",dataset.shape[1])

Dataset: instances = 94626   features = 88


In [23]:
trainining_end_date = '2017-04-01'

training_data = dataset[dataset['date']<trainining_end_date]
test_data = dataset[dataset['date']>=trainining_end_date]

print ("Number of test data rows =", len(training_data))
print ("Number of training data rows =", len(test_data))

labels_count = Counter(list(training_data['avs5']))
print ("Training labels count:", labels_count)

labels_count = Counter(list(test_data['avs5']))
print ("Test labels count:", labels_count)

#dataset

Number of test data rows = 36379
Number of training data rows = 58247
Training labels count: Counter({'benign': 34373, 'malware': 2006})
Test labels count: Counter({'benign': 55704, 'malware': 2543})


In [30]:
print (dataset.dtypes)

dump_id                                  int64
raw_dump_num_av_labels                 float64
raw_dump_trusted_av_labels             float64
vt_month_shelf                         float64
corrupt                                 object
host_malware_downloads                 float64
host_suspicious_downloads              float64
host_benign_downloads                  float64
host_total_downloads                   float64
host_malware_ratio                     float64
host_suspicious_ratio                  float64
host_benign_ratio                      float64
host_avg_av_labels                     float64
host_avg_trusted_labels                float64
host_unknown_hashes                    float64
host_total_hashes                      float64
host_unknown_hash_ratio                float64
twold_malware_downloads                float64
twold_suspicious_downloads             float64
twold_benign_downloads                 float64
twold_total_downloads                  float64
twold_malware

In [35]:
train = training_data.copy()
test = test_data.copy()

train_Y = list(train['avs5'])
test_Y  = list(test['avs5'])
remove_cols = ['vt_month_shelf', 'raw_dump_trusted_av_labels', 'raw_dump_num_av_labels', 'dump_id','score','corrupt','sha1','host','url_struct','dump_id_d','date','md5','host_d','max_tavs','max_avs','avs5']
#remove_cols = remove_cols + ['type_APK','type_DMG','type_JAR','type_EXE']
#remove_cols = remove_cols + ['extension_class_common_ext', 'extension_class_common_fake', 'extension_class_no_ext', 'extension_class_no_url','extension_class_other_ext','extension_class_unknown_ext']
remove_cols = remove_cols + ['url_struct_malware_downloads','url_struct_total_downloads','url_struct_distinct_sha1s','server_ip_avg_av_labels','server_ip_avg_trusted_labels','hash_daily_dump_rate_per_client']
#remove_cols = remove_cols + ['server_ip_malware_ratio','server_ip_suspicious_ratio','server_ip_benign_ratio','server_ip_unknown_hash_ratio','host_unknown_hashes','host_total_hashes']
#remove_cols = remove_cols + ['host_unknown_hash_ratio','twold_total_downloads','url_malware_downloads','bgp_total_downloads']


for c in remove_cols:
    del train[c]
    del test[c]

#temp_cols = train.columns[train.isnull().mean() > 0]
#temp_cols
#mal_type = train
#for d in temp_cols:
#    sql_query = """SELECT * FROM mal_type Where {c} IS NULL;""".format(c = d)
#    mal_type = pdsql.sqldf(sql_query, locals())
#    print(mal_type)
#    break
train.to_csv("data/train.csv")
    
#all_cols = list(dataset.columns[0:])

#remove_cols = ['host_malware_downloads','host_suspicious_downloads','host_malware_ratio','host_suspicious_ratio','twold_malware_ratio','twold_suspicious_ratio','server_ip_malware_ratio','server_ip_benign_ratio','bgp_unknown_hash_ratio','estimated_clients_with_same_hash','referer_exists','url_length','url_malware_downloads','directory_depth']

#partial_cols = [x for x in all_cols if x not in remove_cols]

#for c in partial_cols:
#    del train[c]
#    del test[c]

#print ("Train: instances =",train.shape[0], "  features =",train.shape[1])
#print ("Test:  instances =",test.shape[0], "  features =",test.shape[1]) 


In [18]:
# train[100:110]

In [36]:
from sklearn.ensemble import *

# imputation of missing values by substituting -1 for compatibility with RF implementation
train.fillna(-1,inplace=True)
clf = RandomForestClassifier(n_estimators=100, random_state=12345)
clf.fit(train, train_Y)
print (clf.feature_importances_)

[1.13874922e-01 6.35464314e-04 4.70113577e-02 3.91689662e-04
 5.83793992e-02 1.67741735e-02 1.57846062e-02 6.35630446e-02
 1.04583586e-01 3.11366332e-04 1.72432546e-04 5.48815335e-04
 2.58400011e-02 1.55522589e-02 4.16173743e-02 8.48878984e-04
 1.17487438e-01 2.80451050e-02 4.54364702e-02 1.09134516e-01
 9.77418041e-02 1.17065746e-04 8.03316389e-04 1.80082753e-04
 2.17397225e-03 7.23092023e-04 1.56965265e-04 3.58049952e-04
 6.51616790e-04 1.01323580e-03 4.82436067e-04 2.26818313e-04
 2.48678429e-04 2.92580924e-04 6.08695364e-04 6.69979925e-04
 1.48093046e-03 6.54341681e-04 2.67566250e-03 3.57095693e-03
 1.52421210e-03 7.21659372e-03 6.21080540e-03 5.46077291e-04
 7.59012279e-04 6.05378517e-04 2.07205505e-04 3.84932575e-04
 3.37523022e-04 9.79281668e-04 9.79613307e-05 3.93617891e-02
 5.93900927e-03 1.73181370e-03 3.55325317e-04 2.04687645e-04
 3.04399593e-04 0.00000000e+00 6.77129506e-04 1.90528209e-03
 4.56492454e-04 1.88268646e-04 2.59422316e-04 6.77067287e-03
 1.50741387e-03 6.461279

In [37]:
from sklearn.metrics import accuracy_score, roc_curve, auc


    

# imputation of missing values by substituting -1 for compatibility with RF implementation
test.fillna(-1,inplace=True)

Y_hat = clf.predict(test)
acc = accuracy_score(test_Y,Y_hat)

scores = pd.DataFrame(clf.predict_proba(test))
scores.columns = ['benign','malware']

#print(len(scores['malware']))
fpr, tpr, th = roc_curve(test_Y, scores['malware'], pos_label='malware')

In [39]:
def compute_partial_auc(fpr, tpr, fpr_max):
    print(type(fpr))
    partial_fpr = [fpr[i] for i in range(len(fpr)) if fpr[i] <= fpr_max]
    partial_tpr = [tpr[i] for i in range(len(fpr)) if fpr[i] <= fpr_max]
    partial_fpr = partial_fpr + [fpr_max]
    partial_tpr = partial_tpr + [tpr[-1]]
    partial_auc = (auc(partial_fpr, partial_tpr) / (fpr_max))
    #print(partial_fpr)
    return partial_auc

In [40]:
import matplotlib.pyplot as plt

def plot_ROC(fpr, tpr, fpr_max=0.01, title='ROC curves'):
    plt.figure()
    lw = 3

    plt.plot([0, 100], [100, 100], color='lightgray', lw=1, linestyle='--')
    plt.plot([0, 100], [90, 90], color='lightgray', lw=1, linestyle='--')
    plt.plot([0, 100], [80, 80], color='lightgray', lw=1, linestyle='--')
    plt.plot([1.0, 1.0], [50, 105], color='lightgray', lw=1, linestyle='--')
    plt.plot([0.5, 0.5], [50, 105], color='lightgray', lw=1, linestyle='--')
    plt.plot([0.2, 0.2], [50, 105], color='lightgray', lw=1, linestyle='--')
    plt.plot([0.1, 0.1], [50, 105], color='lightgray', lw=1, linestyle='--')

    pauc = compute_partial_auc(fpr, tpr, fpr_max)
    plt.plot(fpr*100, tpr*100, color='blue', lw=lw, label='AUC = %.2f, PAUC = %.4f' % (auc(fpr, tpr), pauc))

    plt.xlim([0.0, fpr_max*100])
    plt.ylim([50, 105])
    plt.xlabel('False Positive (%)')
    plt.ylabel('True Positives (%)')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.savefig('data/no_type-rm_missing-rm_total.pdf')
    #plt.savefig('data/all_features.pdf')

    plt.show()
   
    
print(fpr)    
plot_ROC(fpr,tpr)

[0.00000000e+00 5.38560965e-05 7.18081287e-05 8.97601609e-05
 8.97601609e-05 1.61568290e-04 1.61568290e-04 1.61568290e-04
 1.61568290e-04 1.61568290e-04 1.61568290e-04 1.79520322e-04
 1.79520322e-04 1.97472354e-04 1.97472354e-04 1.97472354e-04
 2.15424386e-04 2.51328450e-04 2.69280483e-04 2.87232515e-04
 3.41088611e-04 3.59040643e-04 3.76992676e-04 3.94944708e-04
 4.12896740e-04 4.30848772e-04 4.30848772e-04 4.30848772e-04
 4.30848772e-04 4.84704869e-04 5.02656901e-04 5.56512997e-04
 5.92417062e-04 6.10369094e-04 6.10369094e-04 6.28321126e-04
 6.46273158e-04 7.36033319e-04 7.36033319e-04 7.53985351e-04
 8.25793480e-04 8.61697544e-04 8.79649576e-04 9.51457705e-04
 9.87361769e-04 1.00531380e-03 1.04121787e-03 1.05916990e-03
 1.07712193e-03 1.09507396e-03 1.11302599e-03 1.13097803e-03
 1.18483412e-03 1.20278616e-03 1.20278616e-03 1.20278616e-03
 1.23869022e-03 1.27459428e-03 1.29254632e-03 1.29254632e-03
 1.32845038e-03 1.38230648e-03 1.40025851e-03 1.47206664e-03
 1.49001867e-03 1.543874