In [1]:
import numpy as np
import pandas as pd
import pandasql as pdsql
from datetime import datetime
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.ensemble import *
from sklearn.metrics import accuracy_score, roc_curve, auc



def pysql(q):
    """Run the SQL query `q` through pandasql, resolving table names in this module's globals."""
    return pdsql.sqldf(q, globals())


# Feature table: ordered by dump_id and indexed by it; the dump_id column is
# kept alongside the index (drop=False), so both remain available.
features = (
    pd.read_csv('data/amico-features-export.csv.gz', compression='gzip')
    .sort_values(by=['dump_id'])
    .set_index('dump_id', drop=False)
)

# Metadata table: same ordering and indexing, with `date` parsed to datetimes
# and rows restricted to dump_ids that also appear in `features`.
metadata = (
    pd.read_csv('data/amico-export.csv.gz', compression='gzip')
    .sort_values(by=['dump_id'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .loc[lambda frame: frame['dump_id'].isin(features['dump_id'])]
    .set_index('dump_id', drop=False)
)

print("Number of rows in metadata =", metadata.shape[0])
print("Number of rows in features =", features.shape[0])

Number of rows in metadata = 121545
Number of rows in features = 121545


In [2]:
# Combine the feature rows with selected metadata columns. Both frames are
# indexed by dump_id, so this is an index-on-index inner join; the overlapping
# 'dump_id' column coming from metadata is renamed to 'dump_id_d' by rsuffix.
data = features.join(
    metadata[['dump_id','date','md5','host','type','max_tavs','max_avs','score']],
    how='inner',
    rsuffix='_d',
)

# 'dump_id' is both the index name and a column label in `data`, so
# sort_values(by=['dump_id']) is ambiguous and raises ValueError on current
# pandas. The index carries the same dump_id values, so sort on it instead.
data = data.sort_index()

print ("Number of rows in data after join =", len(data))

Number of rows in data after join = 121545


In [3]:
# Work on an independent deep copy so later cells cannot modify `data`.
dataset = data.copy(deep=True)

In [19]:
# Chronological split point: rows dated strictly before it form the training
# set, rows dated on or after it form the test set.
# (The variable name's spelling is kept as-is; other cells may reference it.)
trainining_end_date = '2017-04-01'

training_data = dataset.loc[lambda frame: frame['date'] < trainining_end_date]
test_data = dataset.loc[lambda frame: frame['date'] >= trainining_end_date]

In [20]:
def label_downloads(avs_count, avs_count1, threshold):
    """Assign a 'malware' / 'unknown' / 'benign' label to each download.

    Parameters
    ----------
    avs_count : iterable of numbers
        Per-download detection count that decides the 'malware' label
        (callers in this notebook pass the `max_tavs` column).
    avs_count1 : iterable of numbers
        Per-download detection count used to spot 'unknown' cases
        (callers in this notebook pass the `max_avs` column). Must have at
        least as many elements as `avs_count`.
    threshold : number
        Minimum `avs_count` value for a download to be labelled 'malware'.

    Returns
    -------
    list of str
        One label per element of `avs_count`, in order:
        'malware' when avs_count >= threshold;
        'unknown' when avs_count < threshold but avs_count1 > 0;
        'benign' otherwise.
    """
    primary_counts = list(avs_count)
    secondary_counts = list(avs_count1)

    labels = []
    for i, primary in enumerate(primary_counts):
        if primary >= threshold:
            labels.append('malware')
        # Compare against `threshold` rather than a hard-coded 2, so that with
        # a higher threshold a below-threshold count is 'unknown', not 'benign'.
        # The explicit `<` test is kept (not folded into a bare `else`) because
        # a NaN count fails both comparisons and must fall through to 'benign'.
        elif secondary_counts[i] > 0 and primary < threshold:
            labels.append('unknown')
        else:
            labels.append('benign')
    return labels

In [21]:
# Label the training and test splits from their max_tavs / max_avs columns.
# For each split: report its size, drop rows whose max_avs is missing, add an
# 'avs5' label column via label_downloads, and derive a "final" frame that
# excludes rows labelled 'unknown'.
#
# NOTE: this cell rebinds training_data / test_data to their filtered, labelled
# versions, so re-running it without re-running the split cell reports zero
# missing labels.

# Minimum max_tavs value for a download to be labelled 'malware' (both splits).
threshold = 2

#Training data information
print ("Number of training data rows =", len(training_data))
missing_train = training_data.max_avs.isnull().sum()
print("Missing AV labels for training data = ",missing_train)

# .copy() makes the filtered frame independent of `dataset`, so the column
# assignment below does not write into a slice (no SettingWithCopyWarning).
training_data = training_data.loc[~training_data['max_avs'].isnull()].copy()
print ("Number of remaining rows after removing missing labels =", len(training_data))

avs_count = training_data['max_tavs']
avs_count1 = training_data['max_avs']
training_data['avs5'] = label_downloads(avs_count, avs_count1, threshold)

training_data1 = training_data.loc[training_data['avs5'] != 'unknown']
print ("Number of remaining row without unknown (final training set) =", len(training_data1))

# Counts are taken before the 'unknown' rows are removed, so they include them.
labels_count = Counter(list(training_data['avs5']))
print ("Training labels count:", labels_count)


#Test data information
print ("\n\nNumber of test data rows =", len(test_data))
missing_test = test_data.max_avs.isnull().sum()
print("Missing AV labels for test data = ",missing_test)

# Same as above: copy before adding the label column.
test_data = test_data.loc[~test_data['max_avs'].isnull()].copy()
print ("Number of remaining rows after removing missing labels =", len(test_data))

avs_count = test_data['max_tavs']
avs_count1 = test_data['max_avs']
test_data['avs5'] = label_downloads(avs_count, avs_count1, threshold)

test_data1 = test_data.loc[test_data['avs5'] != 'unknown']
print ("Number of remaining rows without unknown (final test set) =", len(test_data1))

# Counts are taken before the 'unknown' rows are removed, so they include them.
labels_count = Counter(list(test_data['avs5']))
print ("Test labels count:", labels_count)

Number of training data rows = 44305
Missing AV labels for training data =  4157
Number of remaining rows after removing missing labels = 40148
Number of remaining row without unknown (final training set) = 36451
Training labels count: Counter({'benign': 34443, 'unknown': 3697, 'malware': 2008})


Number of test data rows = 77240
Missing AV labels for test data =  4916
Number of remaining rows after removing missing labels = 72324
Number of remaining rows without unknown (final test set) = 58248
Test labels count: Counter({'benign': 55705, 'unknown': 14076, 'malware': 2543})
