# Main Romain

## Ideas

- A lot of unsuccesfull execution of task, might indicates 'trying to find a breach'

## Library Loading

In [1]:
## Classical libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.cross_validation import cross_val_score
# from sklearn.cross_validation import train_test_split
# from sklearn.metrics import mean_squared_error

In [2]:
## Specific libraries

In [3]:
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

## Loading the data

In [4]:
path = 'Data/train/0a8e69f80f39b18a78ca7B778a4efb029e7b42fbf.Zbot.xml'
tree = ET.parse(path)
root = tree.getroot()

## Features Engineering

In [5]:
def XML_get(root, element, attrib_name=''):
    res = []
    if attrib_name != '':
        for t in root.iter(element):
            try: 
                res.append(t.attrib[attrib_name])
            except:
                res.append(None)
    else:
        for t in root.iter(element):
            try:
                res.append(t.attrib)
            except:
                res.append(None)
    return res

### Get some potential features

    # Load DLL
load_dll_files              = XML_get(root, 'load_dll',   'filename')

    # VM_Protect
vm_protect_target           = XML_get(root, 'vm_protect', 'target')
vm_protect_protect          = XML_get(root, 'vm_protect', 'protect')
vm_protect_behavior         = XML_get(root, 'vm_protect', 'behavior')

    # Open Key
open_key_key                = XML_get(root, 'open_key', 'key')

    # Process
process_filename            = XML_get(root, 'process', 'filename')
process_filesize            = XML_get(root, 'process', 'filesize')
process_username            = XML_get(root, 'process', 'username')
process_applicationtype     = XML_get(root, 'process', 'applicationtype')
process_terminationreason   = XML_get(root, 'process', 'terminationreason')

    # Set Files Attributes
set_file_attributes_srcfile = XML_get(root, 'set_file_attributes', 'srcfile')

    # Open File
open_file_filetype      = XML_get(root, 'open_file', 'filetype')
open_file_srcfile       = XML_get(root, 'open_file', 'srcfile')
open_file_desiredaccess = XML_get(root, 'open_file', 'desiredaccess')

    # Find the successes
successes = []
for t in root.findall(".//*[@successful]"):
    successes.append(int(t.attrib['successful']))
success_ratio = sum(successes) / float(len(successes))

    # Kill Process
kill_process = XML_get(root, 'kill_process', 'apifunction')




In [6]:
kill_process

['NtTerminateProcess']

##### Find all the tags 

In [7]:
all_tags = []
for t in root.iter('all_section'):
    for child in t.getchildren():
        all_tags.append(child.tag)


In [8]:
set(all_tags)

{'bind_socket',
 'check_for_debugger',
 'com_create_instance',
 'connect',
 'copy_file',
 'create_interface',
 'create_key',
 'create_mutex',
 'create_open_file',
 'create_socket',
 'create_thread',
 'create_thread_remote',
 'delete_file',
 'enum_keys',
 'enum_modules',
 'enum_processes',
 'enum_types',
 'enum_values',
 'find_file',
 'get_computer_name',
 'get_file_attributes',
 'get_host_by_name',
 'get_system_directory',
 'get_system_time',
 'get_username',
 'impersonate_user',
 'kill_process',
 'listen_socket',
 'load_dll',
 'load_image',
 'move_file',
 'open_file',
 'open_key',
 'open_mutex',
 'open_process',
 'open_scmanager',
 'open_service',
 'open_url',
 'query_value',
 'revert_to_self',
 'set_file_attributes',
 'set_file_time',
 'set_value',
 'set_windows_hook',
 'sleep',
 'vm_allocate',
 'vm_protect',
 'vm_write'}

## Trying to add a new single feature into Mark's Model

In [None]:


path = 'Data/train/0a8e69f80f39b18a78ca7B778a4efb029e7b42fbf.Zbot.xml'
tree = ET.parse(path)
root = tree.getroot()

## Mark EXAMPLE

In [8]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# these are the fifteen malware classes we're looking for
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
           "Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
           "VB", "Virut", "Zbot"]

# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
    """
    assumes len(predictions) == len(ids), and that predictions[i] is the
    index of the predicted class with the malware_classes list above for 
    the executable corresponding to ids[i].
    outfile will be overwritten
    """
    with open(outfile,"w+") as f:
        # write header
        f.write("Id,Prediction\n")
        for i, history_id in enumerate(ids):
            f.write("%s,%d\n" % (history_id, predictions[i]))

In [12]:
import pandas as pd

ids_classes = []
trees = []
for fname in os.listdir('Data/train')[:100]:
    if fname == '.DS_Store':
        continue
    id_str, clazz = fname.split('.')[:2]
    ids_classes.append((id_str, clazz))
    tree = ET.parse(os.path.join('Data/train', fname))
    trees.append(tree)

train_df = pd.DataFrame.from_records(ids_classes, columns=['id','class']) 

In [17]:
y = train_df['class'].values

In [16]:
def to_2class(classes):
    return ['None' if label == 'None' else 'Mal' for label in classes] 

In [18]:
y

array(['None', 'Lipler', 'VB', 'None', 'Swizzor', 'Zbot', 'None', 'None',
       'None', 'None', 'None', 'None', 'None', 'Zbot', 'Swizzor', 'Virut',
       'None', 'None', 'None', 'Virut', 'Swizzor', 'VB', 'Agent', 'None',
       'Swizzor', 'Swizzor', 'None', 'None', 'Agent', 'Swizzor', 'Tdss',
       'Agent', 'None', 'Magania', 'VB', 'None', 'None', 'None', 'None',
       'None', 'VB', 'AutoRun', 'Swizzor', 'VB', 'None', 'AutoRun',
       'Agent', 'None', 'None', 'Swizzor', 'None', 'Agent', 'None', 'Krap',
       'Swizzor', 'FraudPack', 'None', 'Zbot', 'None', 'None', 'None',
       'VB', 'FraudPack', 'None', 'None', 'None', 'Agent', 'None', 'None',
       'None', 'Virut', 'None', 'VB', 'None', 'None', 'None', 'Swizzor',
       'Swizzor', 'None', 'Swizzor', 'None', 'None', 'VB', 'None', 'None',
       'VB', 'Swizzor', 'None', 'None', 'FraudLoad', 'None', 'None',
       'None', 'Swizzor', 'None', 'Swizzor', 'Swizzor', 'Swizzor', 'None',
       'None'], dtype=object)

In [19]:
not_calls = ['processes','all_section','thread','process']
docs = []
for tree in trees:
    calls = []
    for ele in tree.iter():
        if ele.tag not in not_calls:
            calls.append(ele.tag)   
    docs.append(calls)

In [21]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))
tfidf = vectorizer.fit_transform([' '.join(doc) for doc in docs])

In [24]:
X = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names())

In [26]:
lr = LogisticRegression()

In [27]:
def classify_and_score(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    clf.fit(X_train, y_train)
    train_preds = clf.predict(X_train)
    print 'train accuracy: ' + str(metrics.accuracy_score(y_train, train_preds))
    test_preds = clf.predict(X_test)
    print 'validation accuracy: ' + str(metrics.accuracy_score(y_test, test_preds))
    return X_train, X_test, y_train, y_test

In [28]:
_ = classify_and_score(lr, X, y)

train accuracy: 0.76
validation accuracy: 0.68
