In [14]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.grid_search import GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
def read_files(dirname):
    """Parse every XML file in ``dirname`` and recover id/class from its name.

    File names are expected to look like ``<id>.<class>.<anything>``: the
    first two dot-separated fields become the ``(id, class)`` pair.

    Parameters
    ----------
    dirname : str
        Directory to scan (not recursive).

    Returns
    -------
    ids_classes : list of (str, str)
        One ``(id, class)`` tuple per parsed file.
    trees : list
        Parsed ``ElementTree`` objects, aligned index-for-index with
        ``ids_classes``.

    Raises
    ------
    ValueError
        If a visible file name has fewer than two dot-separated fields.
    """
    ids_classes = []
    trees = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any index-based split) stable between runs.
    for fname in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, fname)
        # Skip hidden entries (.DS_Store, .ipynb_checkpoints, ...) and
        # subdirectories: neither is a sample and ET.parse would fail on them.
        if fname.startswith('.') or not os.path.isfile(path):
            continue
        id_str, clazz = fname.split('.')[:2]
        ids_classes.append((id_str, clazz))
        trees.append(ET.parse(path))
    return ids_classes, trees

In [3]:
# Class labels; a label's position in this list is the integer written to the
# submission file (see the `malware_classes.index(...)` lookup further down).
malware_classes = [
    "Agent",
    "AutoRun",
    "FraudLoad",
    "FraudPack",
    "Hupigon",
    "Krap",
    "Lipler",
    "Magania",
    "None",
    "Poison",
    "Swizzor",
    "Tdss",
    "VB",
    "Virut",
    "Zbot",
]

In [4]:
# Parse both data directories; each call yields (id, class) pairs plus the
# parsed XML trees in matching order.
train_ids_classes, train_trees = read_files('train')
test_ids_classes, test_trees = read_files('test')

# Tabulate the training ids/labels and keep the label column as an array.
train_df = pd.DataFrame.from_records(
    train_ids_classes,
    columns=['id', 'class'],
)
y_train = train_df['class'].values

## Features

In [39]:
def get_features(tree, small=True):
    """Flatten an XML tree into a list of string tokens.

    Every element whose tag is not one of the structural wrappers
    (``processes``, ``all_section``, ``thread``, ``process``) contributes:

    * its tag name;
    * for each of the ``filename`` / ``srcfile`` attributes that is present,
      the last backslash-separated component with ``.`` replaced by ``_``
      (skipped when that component is empty);
    * when ``small`` is false, the value of each present ``desiredaccess``,
      ``shareaccess``, ``flags`` and ``apifunction`` attribute with ``.`` and
      spaces replaced by ``_`` (emitted even when empty).

    Parameters
    ----------
    tree : object exposing ``iter()`` over XML elements
        An ``ElementTree`` or ``Element``.
    small : bool, default True
        If true, only tags and file names are emitted.

    Returns
    -------
    list of str
        Tokens in document order.
    """
    not_calls = ('processes', 'all_section', 'thread', 'process')
    path_attrs = ('filename', 'srcfile')
    detail_attrs = ('desiredaccess', 'shareaccess', 'flags', 'apifunction')
    feats = []

    for ele in tree.iter():
        if ele.tag in not_calls:
            continue
        feats.append(ele.tag)

        # A missing attribute is the only expected "failure"; look it up
        # explicitly instead of hiding every exception behind a bare except.
        for attr in path_attrs:
            value = ele.attrib.get(attr)
            if value is None:
                continue
            filename = value.split('\\')[-1].replace('.', '_')
            if filename:
                feats.append(filename)

        if not small:
            for attr in detail_attrs:
                value = ele.attrib.get(attr)
                if value is not None:
                    feats.append(value.replace('.', '_').replace(' ', '_'))
    return feats

In [40]:
# One whitespace-joined token string per training tree, using the extended
# (small=False) feature set.
train_features = [
    ' '.join(get_features(tree, small=False))
    for tree in train_trees
]

In [41]:
# TF-IDF over 1- to 4-grams of the token strings. fit_transform learns the
# vocabulary and produces the matrix in a single pass, instead of tokenising
# the whole corpus once for fit() and again for transform().
vectorizer = TfidfVectorizer(ngram_range=(1, 4))
X_train = vectorizer.fit_transform(train_features)

In [42]:
def create_split_mask(n_samples, train_size=0.8, random_state=None):
    """Build a boolean mask selecting the training rows of a random split.

    Parameters
    ----------
    n_samples : int
        Total number of rows to split.
    train_size : float or int, default 0.8
        Forwarded to ``train_test_split``.
    random_state : int, RandomState or None, default None
        Forwarded to ``train_test_split``. Pass a fixed value to get the same
        split on every run; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray of bool, shape (n_samples,)
        ``True`` for rows kept for training, ``False`` for held-out rows.
    """
    _, held_out = train_test_split(
        range(n_samples), train_size=train_size, random_state=random_state)
    # Start from all-True and knock out the held-out indices, so every row
    # that is not explicitly held out counts as training.
    mask = np.ones(n_samples, dtype=bool)
    mask[held_out] = False
    return mask


mask = create_split_mask(X_train.shape[0])

## Functions

In [43]:
def classify_and_score(clf, X, y, mask):
    """Fit ``clf`` on the masked rows and print train/validation accuracy.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``
        Fitted in place on ``X[mask]`` / ``y[mask]``.
    X : indexable feature matrix
        Must support boolean row indexing.
    y : array of labels
        Aligned with the rows of ``X``.
    mask : boolean array
        ``True`` rows are used for fitting, the complement (``~mask``) for
        validation.

    Returns
    -------
    None
        Both accuracies are printed rather than returned.
    """
    X_fit, y_fit = X[mask], y[mask]
    X_val, y_val = X[~mask], y[~mask]

    clf.fit(X_fit, y_fit)

    # Parenthesised single-argument print emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    train_preds = clf.predict(X_fit)
    print('train accuracy: ' + str(metrics.accuracy_score(y_fit, train_preds)))
    val_preds = clf.predict(X_val)
    print('validation accuracy: ' + str(metrics.accuracy_score(y_val, val_preds)))

In [44]:
def cv_optimize(clf, params, X, y, n_folds=3):
    """Grid-search ``params`` for ``clf`` with ``n_folds``-fold cross-validation.

    Returns a 4-tuple: the best estimator, its parameter dict, its mean
    cross-validated score, and the full per-candidate score listing
    (``grid_scores_``).
    """
    search = GridSearchCV(estimator=clf, param_grid=params, cv=n_folds)
    search.fit(X, y)
    best_model = search.best_estimator_
    best_params = search.best_params_
    best_score = search.best_score_
    all_scores = search.grid_scores_
    return best_model, best_params, best_score, all_scores

## Random Forest

### Feature Selection

In [58]:
# Baseline forest on the full TF-IDF matrix; its feature importances drive
# the selection steps below.
rfc_selector = RFC(n_estimators=100)
classify_and_score(clf=rfc_selector, X=X_train, y=y_train, mask=mask)

train accuracy: 1.0
validation accuracy: 0.894822006472


In [70]:
from sklearn.feature_selection import SelectFromModel

# Keep only the columns whose importance in the already-fitted forest
# reaches 1e-4, then score a fresh forest on the reduced matrix.
feature_selector = SelectFromModel(
    estimator=rfc_selector,
    threshold=1e-4,
    prefit=True,
)
X_trans = feature_selector.transform(X_train)

rfc = RFC(n_estimators=100)
classify_and_score(clf=rfc, X=X_trans, y=y_train, mask=mask)

train accuracy: 0.997974068071
validation accuracy: 0.902912621359


In [61]:
# Show the dimensions of the matrix produced by feature_selector.transform.
X_trans.shape

(3086, 1650)

In [62]:
# Looser importance cut-off (1e-5) than the one used for X_trans.
feature_selector = SelectFromModel(
    estimator=rfc_selector,
    threshold=1e-5,
    prefit=True,
)
X_trans2 = feature_selector.transform(X_train)
classify_and_score(clf=rfc, X=X_trans2, y=y_train, mask=mask)

train accuracy: 1.0
validation accuracy: 0.906148867314


In [63]:
# Stricter importance cut-off (1e-3) than the one used for X_trans.
feature_selector = SelectFromModel(
    estimator=rfc_selector,
    threshold=1e-3,
    prefit=True,
)
X_trans3 = feature_selector.transform(X_train)
classify_and_score(clf=rfc, X=X_trans3, y=y_train, mask=mask)

train accuracy: 0.9943273906
validation accuracy: 0.902912621359


### Gridsearch

In [64]:
# Cross-validated search over forest size on the selected features.
rfc = RFC()
params = {
    'n_estimators': [50, 100, 250, 500],
}
results = cv_optimize(clf=rfc, params=params, X=X_trans, y=y_train)
results

(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 {'n_estimators': 100},
 0.89695398574206087,
 [mean: 0.89404, std: 0.00592, params: {'n_estimators': 50},
  mean: 0.89695, std: 0.00651, params: {'n_estimators': 100},
  mean: 0.89598, std: 0.00598, params: {'n_estimators': 250},
  mean: 0.89533, std: 0.00727, params: {'n_estimators': 500}])

## MLP

In [87]:
# Default-architecture MLP on the selected features; verbose=True prints the
# loss per iteration.
mlp = MLPClassifier(verbose=True, max_iter=1000)
classify_and_score(clf=mlp, X=X_trans, y=y_train, mask=mask)

Iteration 1, loss = 2.61591312
Iteration 2, loss = 2.26723898
Iteration 3, loss = 1.82939342
Iteration 4, loss = 1.42971106
Iteration 5, loss = 1.20282373
Iteration 6, loss = 1.08648334
Iteration 7, loss = 1.00860293
Iteration 8, loss = 0.94909420
Iteration 9, loss = 0.90094043
Iteration 10, loss = 0.85999568
Iteration 11, loss = 0.82476623
Iteration 12, loss = 0.79306711
Iteration 13, loss = 0.76562616
Iteration 14, loss = 0.74058305
Iteration 15, loss = 0.71899897
Iteration 16, loss = 0.69807836
Iteration 17, loss = 0.67913751
Iteration 18, loss = 0.66092971
Iteration 19, loss = 0.64364995
Iteration 20, loss = 0.62805008
Iteration 21, loss = 0.61273354
Iteration 22, loss = 0.59853544
Iteration 23, loss = 0.58523834
Iteration 24, loss = 0.57215127
Iteration 25, loss = 0.55989105
Iteration 26, loss = 0.54814922
Iteration 27, loss = 0.53746210
Iteration 28, loss = 0.52636727
Iteration 29, loss = 0.51627551
Iteration 30, loss = 0.50616252
Iteration 31, loss = 0.49658510
Iteration 32, los

## Ensemble

In [74]:
# Refit the forest on the training rows of the selected-feature matrix.
rfc = RFC(n_estimators=100, n_jobs=-1)
# Labels live in `y_train`; the previous `y_trans` is not defined anywhere in
# this notebook and raised NameError on a fresh kernel.
rfc.fit(X_trans[mask], y_train[mask])

# NOTE(review): `mlp` is not refitted here, so this uses whichever
# MLPClassifier was fitted last; presumably the one from the MLP section,
# fitted on X_trans[mask]. Confirm when running cells in order.
trainpreds_rfc = rfc.predict_proba(X_trans[mask])
trainpreds_mlp = mlp.predict_proba(X_trans[mask])
valpreds_rfc = rfc.predict_proba(X_trans[~mask])
valpreds_mlp = mlp.predict_proba(X_trans[~mask])

In [75]:
# Stack the two models' class-probability columns side by side and fit the
# logistic-regression meta-learner on them.
X_trainpreds = np.hstack((trainpreds_rfc, trainpreds_mlp))
stacked = LogisticRegression()
stacked.fit(X_trainpreds, y_train[mask])

In [123]:
# Accuracy of the stacked model on the held-out rows.
X_valpreds = np.hstack((valpreds_rfc, valpreds_mlp))
stacked.score(X_valpreds, y_train[~mask])

## Submission

In [64]:
# Writes predictions in the required submission format.
def write_predictions(predictions, ids, outfile):
    """
    Write an ``Id,Prediction`` CSV to ``outfile``, replacing any existing file.

    ``predictions[i]`` must be the integer index (into the malware_classes
    list above) of the class predicted for the executable ``ids[i]``; the two
    sequences are assumed to have the same length.
    """
    with open(outfile, "w+") as out:
        out.write("Id,Prediction\n")
        # One row per id, in the order the ids are given.
        out.writelines(
            "%s,%d\n" % (sample_id, predictions[pos])
            for pos, sample_id in enumerate(ids)
        )

In [55]:
# Final forest, trained on every training row of the full TF-IDF matrix.
rfc = RFC(n_estimators=200)
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [56]:
# Final MLP, trained on every training row of the full TF-IDF matrix.
mlp = MLPClassifier(max_iter=1000)
mlp.fit(X=X_train, y=y_train)

MLPClassifier(activation='relu', algorithm='adam', alpha=0.0001,
       batch_size=200, beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [57]:
# Fit the meta-learner on the base models' class probabilities for the full
# training set.
preds_rfc = rfc.predict_proba(X_train)
preds_mlp = mlp.predict_proba(X_train)
X_preds = np.hstack((preds_rfc, preds_mlp))
stacked = LogisticRegression()
stacked.fit(X_preds, y_train)

In [58]:
# Tokenise the test trees exactly as the training trees were tokenised
# (small=False), so the fitted vectorizer's vocabulary applies. The previous
# `only_calls=True` keyword is not a parameter of get_features and raised
# TypeError.
test_features = [
    ' '.join(get_features(tree, small=False))
    for tree in test_trees
]

In [59]:
# Project the test token strings with the already-fitted vectorizer
# (transform only; no refit).
X_test = vectorizer.transform(test_features)

In [60]:
# Base-model class probabilities for the test set, stacked column-wise in the
# same (forest, MLP) order used when fitting the meta-learner.
testpreds_rfc = rfc.predict_proba(X_test)
testpreds_mlp = mlp.predict_proba(X_test)
X_testpreds = np.hstack((testpreds_rfc, testpreds_mlp))

In [61]:
# Translate each predicted class name into its position in malware_classes.
testpreds = list(map(malware_classes.index, stacked.predict(X_testpreds)))

In [62]:
# Ids only, in the same order as the test trees / predictions.
testids = [sample_id for sample_id, _ in test_ids_classes]

In [65]:
# Write the submission file (overwrites 'predictions2' if it already exists).
write_predictions(testpreds, testids, 'predictions2')