# Main Romain

## Ideas

- A lot of unsuccessful task executions might indicate that the sample is 'trying to find a breach'

## Library Loading

In [1]:
## Classical libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import Counter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC

## Loading the data

In [2]:
# Parse one report from the training directory; `root` is the root element
# of that document and is what the exploratory cells below query.
path = 'Data/train/0a8e69f80f39b18a78ca7B778a4efb029e7b42fbf.Zbot.xml'
tree = ET.parse(path)
root = tree.getroot()

## Feature Engineering

In [3]:
def XML_get(root, element, attrib_name=''):
    """Collect attribute data from every ``element`` tag found under ``root``.

    Parameters
    ----------
    root : object exposing ``iter(tag)`` (e.g. an ElementTree ``Element``)
        Starting point of the search; ``root.iter(element)`` is walked.
    element : str
        Tag name to look for.
    attrib_name : str, optional
        Name of the attribute to extract. When left as the empty string,
        the whole ``attrib`` mapping of each matching element is returned
        instead of a single value.

    Returns
    -------
    list
        With ``attrib_name`` given: the attribute values, in iteration
        order, of the matching elements that carry that attribute (elements
        without it are skipped). Without ``attrib_name``: one ``attrib``
        mapping per matching element.
    """
    if attrib_name == '':
        return [node.attrib for node in root.iter(element)]

    # An explicit membership test replaces the former bare ``except:``, which
    # also hid unrelated errors (and KeyboardInterrupt) instead of only the
    # "attribute is missing" case.
    return [node.attrib[attrib_name]
            for node in root.iter(element)
            if attrib_name in node.attrib]

### Get some potential features

    # Load DLL
load_dll_files              = XML_get(root, 'load_dll',   'filename')

    # VM_Protect
vm_protect_target           = XML_get(root, 'vm_protect', 'target')
vm_protect_protect          = XML_get(root, 'vm_protect', 'protect')
vm_protect_behavior         = XML_get(root, 'vm_protect', 'behavior')

    # Open Key
open_key_key                = XML_get(root, 'open_key', 'key')

    # Process
process_filename            = XML_get(root, 'process', 'filename')
process_filesize            = XML_get(root, 'process', 'filesize')
process_username            = XML_get(root, 'process', 'username')
process_applicationtype     = XML_get(root, 'process', 'applicationtype')
process_terminationreason   = XML_get(root, 'process', 'terminationreason')

    # Set Files Attributes
set_file_attributes_srcfile = XML_get(root, 'set_file_attributes', 'srcfile')

    # Open File
open_file_filetype      = XML_get(root, 'open_file', 'filetype')
open_file_srcfile       = XML_get(root, 'open_file', 'srcfile')
open_file_desiredaccess = XML_get(root, 'open_file', 'desiredaccess')

    # Find the successes
successes = []
for t in root.findall(".//*[@successful]"):
    successes.append(int(t.attrib['successful']))
success_ratio = sum(successes) / float(len(successes))

    # Kill Process
kill_process = XML_get(root, 'kill_process', 'apifunction')


def createFeatures(root):
    """Build the extra token sequence for one report.

    Gathers, in this order: the ``filename`` of every ``load_dll`` element
    (with backslashes replaced by spaces), the ``target``, ``protect`` and
    ``behavior`` attributes of every ``vm_protect`` element, and the ``key``
    of every ``open_key`` element.

    Parameters
    ----------
    root : element passed straight to ``XML_get``

    Returns
    -------
    The result of ``np.concatenate`` over the five value lists.
    """
    dll_names = [name.replace('\\', ' ')
                 for name in XML_get(root, 'load_dll', 'filename')]

    # One list per vm_protect attribute, kept in the documented order.
    protect_fields = [XML_get(root, 'vm_protect', field)
                      for field in ('target', 'protect', 'behavior')]

    registry_keys = XML_get(root, 'open_key', 'key')

    return np.concatenate([dll_names] + protect_fields + [registry_keys])

##### Find all the tags 

In [4]:
# Tag name of every direct child of each <all_section> element.
# Iterating an Element yields its children; Element.getchildren() is
# deprecated and no longer exists in Python 3.9+.
all_tags = [child.tag
            for section in root.iter('all_section')
            for child in section]


In [None]:
# A set has no .head(); show the first two distinct tag names instead
# (sorted so the preview is stable between runs).
sorted(set(all_tags))[:2]

## Adding Features to the New Model

In [5]:
# Parse one sample report again so that `tree` and `root` refer to it.
path = 'Data/train/0a8e69f80f39b18a78ca7B778a4efb029e7b42fbf.Zbot.xml'
tree = ET.parse(path)
root = tree.getroot()

##### Loop on the data and create the features

In [18]:
# Parse a subset of the training reports and record (id, class) per file.
ids_classes = []
trees = []
# os.listdir() returns names in arbitrary order, so slicing it directly
# yields a different subset from one machine/run to the next. Sorting first
# makes the 300-entry slice reproducible.
for fname in sorted(os.listdir('Data/train'))[:300]:
    if fname == '.DS_Store':
        continue
    # The first two dot-separated parts of the file name are the id and the
    # class label.
    id_str, clazz = fname.split('.')[:2]
    ids_classes.append((id_str, clazz))
    tree = ET.parse(os.path.join('Data/train', fname))
    trees.append(tree)

# One row per parsed file, in the same order as `trees`.
train_df = pd.DataFrame.from_records(ids_classes, columns=['id','class'])

In [19]:
# Turn every parsed report into a "document": the sequence of element tags
# met while walking the tree, minus the tags listed in `not_calls`.
not_calls = ['processes','all_section','thread','process']
docs = []
for tree in trees:
    calls = [node.tag for node in tree.iter() if node.tag not in not_calls]
    docs.append(calls)

##### Adding the new features

In [20]:
# Append the attribute-based tokens of each report to its tag document
# (docs[i] corresponds to trees[i]).
for i, parsed in enumerate(trees):
    extra_tokens = createFeatures(parsed.getroot())
    docs[i].extend(extra_tokens)

##### y management

In [21]:
# Target vector: the class label recorded for each parsed training file.
y = train_df['class'].values
def to_2class(classes):
    """Collapse class labels into two classes.

    Every label equal to ``'None'`` is kept as ``'None'``; any other label
    becomes ``'Mal'``. Returns a new list with one entry per input label.
    """
    binary = []
    for label in classes:
        if label == 'None':
            binary.append('None')
        else:
            binary.append('Mal')
    return binary

##### TFIDF Computation

In [22]:
# Each document becomes one space-separated string; TF-IDF is computed over
# 1- to 10-grams of those strings.
corpus = [' '.join(doc) for doc in docs]
vectorizer = TfidfVectorizer(ngram_range=(1,10))
tfidf = vectorizer.fit_transform(corpus)

##### Construct X

In [23]:
# Dense feature table: one row per document, one column per TF-IDF term.
feature_names = vectorizer.get_feature_names()
X = pd.DataFrame(tfidf.toarray(), columns=feature_names)

##### Fit Function

In [24]:
def classify_and_score(clf, X, y, random_state=None):
    """Fit ``clf`` on a random split of ``(X, y)`` and print its accuracy.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training part of the split.
    X, y : features and labels, forwarded to ``train_test_split``.
    random_state : optional
        Forwarded to ``train_test_split`` so the split can be made
        reproducible. The default (``None``) keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by the split.

    Side effects: prints the train and validation accuracy.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    clf.fit(X_train, y_train)

    # print(...) with one parenthesised argument gives the same output under
    # Python 2 and Python 3, unlike the print statement.
    train_preds = clf.predict(X_train)
    print('train accuracy: ' + str(metrics.accuracy_score(y_train, train_preds)))
    test_preds = clf.predict(X_test)
    print('validation accuracy: ' + str(metrics.accuracy_score(y_test, test_preds)))
    return X_train, X_test, y_train, y_test

##### Logistic Regression

In [25]:
# Logistic regression with default settings, fitted and scored on a random
# split of (X, y); the returned split is discarded.
lr = LogisticRegression()
_ = classify_and_score(lr, X, y)

train accuracy: 0.773333333333
validation accuracy: 0.786666666667


##### Random Forest

In [26]:
# Random forest with 50 trees, fitted and scored on a random split of
# (X, y); the returned split is discarded.
rfc = RFC(n_estimators=50)
_ = classify_and_score(rfc, X, y)

train accuracy: 0.995555555556
validation accuracy: 0.786666666667


## Including Prediction

In [87]:
# Parse every report in Data/test plus a subset of Data/train, so that both
# can later be vectorised together. (id, class) is taken from the file name.
ids_classes = []
trees = []
# (directory, max number of directory entries to consider); None = no limit.
for directory, limit in (('Data/test', None), ('Data/train', 150)):
    # os.listdir() returns names in arbitrary order; sorting makes the
    # limited slice reproducible between runs and machines.
    for fname in sorted(os.listdir(directory))[:limit]:
        if fname == '.DS_Store':
            continue
        # The first two dot-separated parts of the file name are the id and
        # the class label.
        id_str, clazz = fname.split('.')[:2]
        ids_classes.append((id_str, clazz))
        tree = ET.parse(os.path.join(directory, fname))
        trees.append(tree)


# One row per parsed file, in the same order as `trees`.
df = pd.DataFrame.from_records(ids_classes, columns=['id','class'])

In [88]:
# One "document" per parsed report: the tags met while walking the tree,
# minus the tags listed in `not_calls`.
not_calls = ['processes','all_section','thread','process']
docs = []
for tree in trees:
    calls = [node.tag for node in tree.iter() if node.tag not in not_calls]
    docs.append(calls)

In [None]:
# Append the attribute-based tokens of each report to its tag document
# (docs[i] corresponds to trees[i]).
for i, parsed in enumerate(trees):
    extra_tokens = createFeatures(parsed.getroot())
    docs[i].extend(extra_tokens)

In [None]:
# TF-IDF over 1- to 10-grams of the space-joined documents, then a dense
# table with one row per document and one column per term.
corpus = [' '.join(doc) for doc in docs]
vectorizer = TfidfVectorizer(ngram_range=(1,10))
tfidf = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
X = pd.DataFrame(tfidf.toarray(), columns=feature_names)

In [None]:
# Boolean row masks over df: rows whose class is 'X' form the test set,
# every other row is a training row.
test_ix = df['class'] == 'X'
train_ix = ~test_ix

##### Compute y

In [None]:
# Label array for every row of df; it is indexed with train_ix before fitting.
y = df['class'].values

##### RFC

In [None]:
# Random forest with 50 trees, fitted on the rows selected by train_ix.
# classify_and_score splits those rows again, so the model is trained on
# the training part of that split only.
rfc = RFC(n_estimators=50)
_ = classify_and_score(rfc, X.loc[train_ix], y[train_ix.values])

In [None]:
# Predict a class for every row selected by test_ix (class == 'X').
temp = rfc.predict(X.loc[test_ix])

In [None]:
# `temp` is the raw output of rfc.predict(), which is an array without a
# to_csv() method. Pair each prediction with the id of its test row (X rows
# follow the row order of df) and write the table with pandas.
pd.DataFrame({'id': df.loc[test_ix, 'id'].values, 'class': temp},
             columns=['id', 'class']).to_csv('temp.csv', index=False)