In [205]:
import numpy as np

In [207]:
## This file provides starter code for extracting features from the xml files and
## for doing some learning.
##
## The basic set-up: 
## ----------------
## main() will run code to extract features, learn, and make predictions.
## 
## extract_feats() is called by main(), and it will iterate through the 
## train/test directories and parse each xml file into an xml.etree.ElementTree, 
## which is a standard python object used to represent an xml file in memory.
## (More information about xml.etree.ElementTree objects can be found here:
## http://docs.python.org/2/library/xml.etree.elementtree.html
## and here: http://eli.thegreenplace.net/2012/03/15/processing-xml-in-python-with-elementtree/)
## It will then use a series of "feature-functions" that you will write/modify
## in order to extract dictionaries of features from each ElementTree object.
## Finally, it will produce an N x D sparse design matrix containing the union
## of the features contained in the dictionaries produced by your "feature-functions."
## This matrix can then be plugged into your learning algorithm.
##
## The learning and prediction parts of main() are largely left to you, though
## it does contain code that randomly picks class-specific weights and predicts
## the class with the weights that give the highest score. If your prediction
## algorithm involves class-specific weights, you should, of course, learn 
## these class-specific weights in a more intelligent way.
##
## Feature-functions:
## --------------------
## "feature-functions" are functions that take an ElementTree object representing
## an xml file (which contains, among other things, the sequence of system calls a
## piece of potential malware has made), and returns a dictionary mapping feature names to 
## their respective numeric values. 
## For instance, a simple feature-function might map a system call history to the
## dictionary {'first_call-load_image': 1}. This is a boolean feature indicating
## whether the first system call made by the executable was 'load_image'. 
## Real-valued or count-based features can of course also be defined in this way. 
## Because this feature-function will be run over ElementTree objects for each 
## software execution history instance, we will have the (different)
## feature values of this feature for each history, and these values will make up 
## one of the columns in our final design matrix.
## Of course, multiple features can be defined within a single dictionary, and in
## the end all the dictionaries returned by feature functions (for a particular
## training example) will be unioned, so we can collect all the feature values 
## associated with that particular instance.
##
## Two example feature-functions, first_last_system_call_feats() and 
## system_call_count_feats(), are defined below.
## The first of these functions indicates what the first and last system-calls 
## made by an executable are, and the second records the total number of system
## calls made by an executable.
##
## What you need to do:
## --------------------
## 1. Write new feature-functions (or modify the example feature-functions) to
## extract useful features for this prediction task.
## 2. Implement an algorithm to learn from the design matrix produced, and to
## make predictions on unseen data. Naive code for these two steps is provided
## below, and marked by TODOs.
##
## Computational Caveat
## --------------------
## Because the biggest of any of the xml files is only around 35MB, the code below 
## will parse an entire xml file and store it in memory, compute features, and
## then get rid of it before parsing the next one. Storing the biggest of the files 
## in memory should require at most 200MB or so, which should be no problem for
## reasonably modern laptops. If this is too much, however, you can lower the
## memory requirement by using ElementTree.iterparse(), which does parsing in
## a streaming way. See http://eli.thegreenplace.net/2012/03/15/processing-xml-in-python-with-elementtree/
## for an example. 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util
import re
import feature

def extract_feats(ffs, direc="train", global_feat_dict=None):
    """
    arguments:
      ffs are a list of feature-functions.
      direc is a directory containing xml files (expected to be train or test).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.

    returns: 
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target classes, and a list of system-call-history ids in order 
      of their rows in the design matrix.
      
      Note: the vector of target classes returned will contain the true indices of the
      target classes on the training data, but will contain only -1's on the test
      data
    """
    fds = [] # list of feature dicts
    classes = []
    ids = [] 
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        #print(datafile.split('.')[:3])
        id_str,clazz = datafile.split('.')[:2]
        if(id_str == ''):
            #_,clazz, id_str = datafile.split('.')[:3]
            #clazz = re.sub('^[^A-Za-z]*', '', clazz)
            #print(clazz)
            #print(id_str)
            #print(clazz)
            print("dropped {0}".format(clazz))
            pass
        else:
            ids.append(id_str)
            # add target class if this is training data
            try:
                classes.append(util.malware_classes.index(clazz))
            except ValueError:
                # we should only fail to find the label in our list of malware classes
                # if this is test data, which always has an "X" label
                if clazz == "X":
                    classes.append(-1)
                else:
                    pass
            rowfd = {}
            # parse file as an xml document
            tree = ET.parse(os.path.join(direc,datafile))
            # accumulate features
            [rowfd.update(ff(tree)) for ff in ffs]
            fds.append(rowfd)
        
    X,feat_dict = make_design_mat(fds,global_feat_dict)
    return X, feat_dict, np.array(classes), ids


def make_design_mat(fds, global_feat_dict=None):
    """
    arguments:
      fds is a list of feature dicts (one for each row).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.
       
    returns: 
        a sparse NxD design matrix, where N == len(fds) and D is the number of
        the union of features defined in any of the fds 
    """
    if global_feat_dict is None:
        all_feats = set()
        [all_feats.update(fd.keys()) for fd in fds]
        feat_dict = dict([(feat, i) for i, feat in enumerate(sorted(all_feats))])
    else:
        feat_dict = global_feat_dict
        
    cols = []
    rows = []
    data = []        
    for i in range(len(fds)):
        temp_cols = []
        temp_data = []
        for feat,val in fds[i].items():  #changed for python3
            try:
                # update temp_cols iff update temp_data
                temp_cols.append(feat_dict[feat])
                temp_data.append(val)
            except KeyError as ex:
                if global_feat_dict is not None:
                    pass  # new feature in test data; nbd
                else:
                    raise ex

        # all fd's features in the same row
        k = len(temp_cols)
        cols.extend(temp_cols)
        data.extend(temp_data)
        rows.extend([i]*k)

    assert len(cols) == len(rows) and len(rows) == len(data)
   

    X = sparse.csr_matrix((np.array(data),
                   (np.array(rows), np.array(cols))),
                   shape=(len(fds), len(feat_dict)))
    return X, feat_dict
    

## Here are two example feature-functions. They each take an xml.etree.ElementTree object, 
# (i.e., the result of parsing an xml file) and returns a dictionary mapping 
# feature-names to numeric values.
## TODO: modify these functions, and/or add new ones.
def first_last_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made. 
      (in other words, it returns a dictionary indicating what the first and 
      last system calls made by an executable were.)
    """
    c = Counter()
    in_all_section = False
    first = True # is this the first system call
    last_call = None # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            if first:
                c["first_call-"+el.tag] = 1
                first = False
            last_call = el.tag  # update last call seen
            
    # finally, mark last call seen
    c["last_call-"+last_call] = 1
    return c

def system_call_count_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c['num_system_calls'] += 1
    return c

## The following function does the feature extraction, learning, and prediction
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument
    
    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats]
    
    # extract features
    print("extracting training features...")
    X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
    print ("done extracting training features")
    print()
    
    # TODO train here, and learn your classification parameters
    print ("learning...")
    learned_W = np.random.random((len(global_feat_dict),len(util.malware_classes)))
    print("done learning")
    
    print()
    
    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print("extracting test features...")
    X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print ("done extracting test features")
    print()

    # TODO make predictions on text data and write them out
    print ("making predictions...")
    preds = np.argmax(X_test.dot(learned_W),axis=1)
    print ("done making predictions")
    print()
    
    print ("writing predictions...")
    util.write_predictions(preds, test_ids, outputfile)
    print ("done!")

if __name__ == "__main__":
    #main()
    pass


In [208]:
clazz = '_ffdba6079b981688512353cF89ca7e1b8f4868263'

In [209]:
a = re.sub('^[^A-Za-z]*', '', clazz)

In [210]:
preds

array([10,  0, 12, ...,  8,  8, 10])

In [211]:
def system_call_feats(tree, num_fl=20):
    """
    arguments:
      tree is an xml.etree.ElementTree object
      num_fl is the number of first and last calls ot keep track of
    returns:
      a dictionary mapping features to counts
    """
    c = Counter()
    in_all_section = False
    last_calls = [None] * num_fl  # list initted with nones of the length of first and last calls to keep track of
    call_count = 0  # counts calls made in all_section
    out_of_all_count = 0  # counts call made out of all_section
    all_section_count = 0  # countsn umber of all_sections
    keywords = ['successful=\"0\"', 'successful=\"1\"', 'SECURITY_ANONYMOUS', 'FILE_ATTRIBUTE_NORMAL', 'error',
                'warning', 'FILE_ANY_ACCESS', 'FILE_READ_ACCESS', 'FILE_READ_DATA', 'FILE_LIST_DIRECTORY',
                'FILE_WRITE_ACCESS', 'FILE_WRITE_DATA', 'FILE_ADD_FILE']  # keywords to count inside attributes
    keyword_counts = {} #  a dict to store counts of keyword instances
    for k in keywords:  # fill in dict
        keyword_counts[k] = 0
    for el in tree.iter():
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
            all_section_count += 1  # update count for all_sections
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            call_count += 1  # update call count
            c[el.tag + '_call_count'] += 1  # update the count of this call
            if call_count <= num_fl:  # if one of the first, keep track of this call
                c["call_"+str(call_count)+'_' + el.tag] = 1
            last_calls.pop()  # keep track of last calls like a queue
            last_calls.insert(0, el.tag)  # keep track of last calls like a queue
            for k in keywords:  # check each attrib for keywords
                if k in el.attrib:  # if found, update keyword count
                    keyword_counts[k] += 1
        else:
            out_of_all_count += 1  # update count of calls out of all

    for i in range(num_fl):  # fill in counter values for last_calls from qeueu
        if last_calls[i] != None:  # omit nones
            c["call_-" + str(i) + "_" + last_calls[i]] = 1
    c['all_sections'] = all_section_count  # put in all_section count
    c["out_of_all_section_calls"] = out_of_all_count  # put in out of all section count
    for k in keywords:  # gill in all of the keyword counts 
        c[k + '_counts'] = keyword_counts[k]
    return c

In [212]:
train_dir = "train"
test_dir = "test"
outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [system_call_feats]

# extract features
print("extracting training features...")
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
print ("done extracting training features")
print()


extracting training features...
dropped _ffdba6079b981688512353cF89ca7e1b8f4868263
done extracting training features



In [None]:
global_feat_d

In [19]:
global_feat_dict

{'FILE_ADD_FILE_counts': 0,
 'FILE_ANY_ACCESS_counts': 1,
 'FILE_ATTRIBUTE_NORMAL_counts': 2,
 'FILE_LIST_DIRECTORY_counts': 3,
 'FILE_READ_ACCESS_counts': 4,
 'FILE_READ_DATA_counts': 5,
 'FILE_WRITE_ACCESS_counts': 6,
 'FILE_WRITE_DATA_counts': 7,
 'SECURITY_ANONYMOUS_counts': 8,
 'accept_socket_call_count': 9,
 'add_netjob_call_count': 10,
 'all_sections': 11,
 'bind_socket_call_count': 12,
 'call_-0_check_for_debugger': 13,
 'call_-0_create_mutex': 14,
 'call_-0_create_open_file': 15,
 'call_-0_create_thread': 16,
 'call_-0_create_window': 17,
 'call_-0_dump_line': 18,
 'call_-0_enum_window': 19,
 'call_-0_find_file': 20,
 'call_-0_get_file_attributes': 21,
 'call_-0_get_host_by_name': 22,
 'call_-0_get_system_directory': 23,
 'call_-0_get_system_time': 24,
 'call_-0_kill_process': 25,
 'call_-0_listen_socket': 26,
 'call_-0_load_dll': 27,
 'call_-0_load_driver': 28,
 'call_-0_open_file': 29,
 'call_-0_open_key': 30,
 'call_-0_open_process': 31,
 'call_-0_process': 32,
 'call_-0_qu

In [20]:
train_ids

['39a173fd60fc333e4ded8fa48b6ee92745e6f341a',
 '8d8bc32466b748d9c370ccf8ae0f9cf3b62279d32',
 '93a41f30e598e589f7c35668f17678b8630d75729',
 '7fe038ab0f8069d05abc2a5475c2d6a14def3f0ff',
 'c743e0bb40ebe97049c69d9fb108376Cc0e2667df',
 'a9595b920e3e39df465c0858b33be18de52969cb0',
 'c75a7faf9771ecd8f27dea372c654a6c1977aec9b',
 '979d4d7357c0e1947c8ca0912497bcf1cb4e8f38d',
 'a21a72945883651ee6d6ad656d15b5e289a841f9d',
 'be2a6706a62a206d60c4d699de178dfec191e5d0e',
 '22178c97cc9b217aa9cd8aa37c834244cc97588ba',
 '38fb014f1a712c6b13dba8050d2111de14eb2298e',
 'd5beda2266ccc97771ef233d6503c4d0823e850ea',
 'bc27db08619573aea8e2dc2945ff9b32cfdb6d35b',
 'd5d52142a64c2f713e04666d1d9cdb5239e606b97',
 '27af3871fdf1e573233299ad7c3b3915406689A37',
 'd2c7b0a4e94cbdaee46cb6ef466eaee9f20025Eb1',
 '6e1b97e35ac45c74a5f57fa7da135153ea8df840d',
 'ba2b7bfcbce9eeBe784616df4f0cd81be198e1719',
 '863aa7645066bd2fb96ad8d8E49256880bbb6b7a1',
 'cf48fcceb087933c5337f76ba25fdbcb75eaec35e',
 'a683fd7C4e41888e7697b20b6e8096b8

In [21]:
t_train

array([ 8,  8,  8, ...,  8, 10,  5])

In [22]:
global_feat_dict

{'FILE_ADD_FILE_counts': 0,
 'FILE_ANY_ACCESS_counts': 1,
 'FILE_ATTRIBUTE_NORMAL_counts': 2,
 'FILE_LIST_DIRECTORY_counts': 3,
 'FILE_READ_ACCESS_counts': 4,
 'FILE_READ_DATA_counts': 5,
 'FILE_WRITE_ACCESS_counts': 6,
 'FILE_WRITE_DATA_counts': 7,
 'SECURITY_ANONYMOUS_counts': 8,
 'accept_socket_call_count': 9,
 'add_netjob_call_count': 10,
 'all_sections': 11,
 'bind_socket_call_count': 12,
 'call_-0_check_for_debugger': 13,
 'call_-0_create_mutex': 14,
 'call_-0_create_open_file': 15,
 'call_-0_create_thread': 16,
 'call_-0_create_window': 17,
 'call_-0_dump_line': 18,
 'call_-0_enum_window': 19,
 'call_-0_find_file': 20,
 'call_-0_get_file_attributes': 21,
 'call_-0_get_host_by_name': 22,
 'call_-0_get_system_directory': 23,
 'call_-0_get_system_time': 24,
 'call_-0_kill_process': 25,
 'call_-0_listen_socket': 26,
 'call_-0_load_dll': 27,
 'call_-0_load_driver': 28,
 'call_-0_open_file': 29,
 'call_-0_open_key': 30,
 'call_-0_open_process': 31,
 'call_-0_process': 32,
 'call_-0_qu

In [16]:
# TODO train here, and learn your classification parameters
#print ("learning...")
#learned_W = np.random.random((len(global_feat_dict),len(util.malware_classes)))
#print("done learning")

#print()

# get rid of training data and load test data
#del X_train
#del t_train
#del train_ids
#print("extracting test features...")
#X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
#print ("done extracting test features")
#print()

# TODO make predictions on text data and write them out
print ("making predictions...")
preds = np.argmax(X_test.dot(learned_W),axis=1)
print ("done making predictions")
print()

print ("writing predictions...")
util.write_predictions(preds, test_ids, outputfile)
print ("done!")

learning...
done learning

extracting test features...
dropped _00ab51519cdf57cf9a53aa4e0caeada0753a36bd4
done extracting test features

making predictions...
done making predictions

writing predictions...
done!


In [61]:
import torch
import torchvision
import pandas as pd
from sklearn.preprocessing import StandardScaler 

In [71]:
scaler = StandardScaler()
X_train_array = X_train.toarray()

X_test_array = X_test.toarray()

In [118]:
#X_tr = pd.DataFrame(X_train_array)
scaled_data = scaler.fit_transform(X_train.toarray())
df = pd.DataFrame(scaled_data)
df["class"] = t_train

#X_tr = pd.DataFrame(X_train_array)
scaled_data = scaler.fit_transform(X_test.toarray())
test_df = pd.DataFrame(scaled_data)



In [124]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,631,632,633,634,635,636,637,638,639,class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.021916,-0.018154,-0.178492,0.0,-0.052593,12
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.145945,-0.036026,0.149896,-0.025457,-0.086539,-0.018154,0.128136,0.0,-0.052593,10
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.103216,-0.018154,-0.178492,0.0,-0.052593,12
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8


In [123]:
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,630,631,632,633,634,635,636,637,638,639
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,1.043273,0.393061,-0.016389,1.082617,-0.040143,1.071744,-0.032503,0.980491,0.0,-0.056456
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,-0.381044,-0.069541,-0.016389,-0.144293,-0.040143,-0.075733,-0.032503,-0.042041,0.0,-0.056456
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,1.043273,-0.069541,-0.016389,-0.144293,-0.040143,-0.208826,-0.032503,-0.155656,0.0,-0.056456
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,-0.510528,-0.069541,-0.016389,-0.144293,-0.040143,-0.244797,-0.032503,-0.155656,0.0,-0.056456
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,-0.510528,-0.069541,-0.016389,-0.144293,-0.040143,-0.244797,-0.032503,-0.155656,0.0,-0.056456
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,-0.251561,-0.069541,-0.016389,-0.144293,-0.040143,1.755194,-0.032503,-0.155656,0.0,-0.056456
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,-0.510528,-0.069541,-0.016389,-0.144293,-0.040143,-0.244797,-0.032503,-0.155656,0.0,-0.056456
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,0.784306,-0.066852,-0.016389,-0.144293,-0.040143,0.837932,-0.032503,-0.155656,0.0,-0.056456
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,-0.381044,-0.069541,-0.016389,-0.144293,-0.040143,-0.244797,-0.032503,-0.155656,0.0,-0.056456
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016389,...,2.726558,0.242446,-0.016389,-0.144293,-0.040143,0.578940,-0.032503,-0.155656,0.0,-0.056456


In [119]:
df["class"]

0        8
1        8
2        8
3       12
4        8
5       10
6       12
7        8
8        8
9        8
10      10
11      10
12       8
13       8
14       8
15      10
16      10
17       8
18       8
19       3
20       8
21       8
22       8
23      12
24       8
25       8
26      13
27      12
28      12
29       8
        ..
3056    13
3057     8
3058     8
3059     0
3060     8
3061     8
3062    10
3063     8
3064    10
3065     8
3066    10
3067    13
3068     8
3069    10
3070     8
3071    14
3072    12
3073    12
3074     0
3075     8
3076    13
3077     8
3078     8
3079     8
3080     8
3081     8
3082     8
3083     8
3084    10
3085     5
Name: class, Length: 3086, dtype: int64

In [121]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,631,632,633,634,635,636,637,638,639,class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.021916,-0.018154,-0.178492,0.0,-0.052593,12
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.145945,-0.036026,0.149896,-0.025457,-0.086539,-0.018154,0.128136,0.0,-0.052593,10
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.103216,-0.018154,-0.178492,0.0,-0.052593,12
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02037,...,-0.182417,-0.036026,-0.163240,-0.025457,-0.065693,-0.018154,-0.178492,0.0,-0.052593,8


In [122]:
t_train

array([ 8,  8,  8, ...,  8, 10,  5])

In [59]:
X_tr.dtypes[X_tr.dtypes.values != 'int64']

Series([], dtype: object)

In [91]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [101]:
df["class"]

0        8
1        8
2        8
3       12
4        8
5       10
6       12
7        8
8        8
9        8
10      10
11      10
12       8
13       8
14       8
15      10
16      10
17       8
18       8
19       3
20       8
21       8
22       8
23      12
24       8
25       8
26      13
27      12
28      12
29       8
        ..
3056    13
3057     8
3058     8
3059     0
3060     8
3061     8
3062    10
3063     8
3064    10
3065     8
3066    10
3067    13
3068     8
3069    10
3070     8
3071    14
3072    12
3073    12
3074     0
3075     8
3076    13
3077     8
3078     8
3079     8
3080     8
3081     8
3082     8
3083     8
3084    10
3085     5
Name: class, Length: 3086, dtype: int64

In [151]:
from sklearn.model_selection import GridSearchCV
rfl = RandomForestClassifier()
parameters = {'n_estimators': [5, 10, 20, 30, 40, 50], 'max_depth': [2, 3, 4,5, 6, 7, 8,  9, 10],
'max_features' : [0.1, 0.2, 0.4, 0.6, 0.8, 1.], 'bootstrap': [True]}

rfl_grid = GridSearchCV(rfl, parameters, cv=6)
rfl_grid.fit(x_trains, y_trains)
rfl_grid.predict(x_vals)



array([ 8,  8, 10, 12, 12,  8,  8,  8, 12,  8,  8,  8,  8,  8,  8,  8,  3,
        8,  8,  8,  8, 12,  8,  8,  8,  8, 10, 10,  8,  8, 12, 10,  8, 10,
        8, 12, 10, 10,  6,  8, 12,  8,  7, 12, 12, 12, 10,  8,  5,  8,  8,
       14,  0, 10,  0,  0,  8,  8,  8,  8,  8,  8,  8,  8,  1, 13,  8,  8,
        8,  8, 12, 12,  8,  8, 10,  8,  8,  8,  8, 10,  8,  8, 10, 10,  8,
        8,  0,  9,  8,  8,  8, 10, 10,  8,  8,  8,  6,  0,  8,  8,  8, 10,
        8,  8, 12, 10, 12,  8,  3,  8,  8,  6,  8,  6,  8,  8, 12,  8,  8,
        8, 12,  8,  8, 12, 10,  8,  8,  8, 10,  8,  8,  8,  8,  8,  4,  8,
       10,  8,  8,  8,  8,  8,  8,  1,  1,  8,  8, 12, 10,  8, 12,  8, 10,
        8,  8, 10,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 12,  8,
       10, 12, 10,  8, 12,  8, 10, 10,  8,  5,  8,  8, 12,  8,  8,  8,  6,
        8,  6, 10,  8, 10,  8,  0, 10, 10,  1,  8,  8, 10,  8,  8,  8,  1,
       12, 10,  8,  8,  6, 10, 12, 10,  8,  0,  8, 12,  8,  8,  8,  8,  8,
        0, 10, 10,  8, 10

In [173]:
# FROM GRID SEARCH
'''<bound method BaseEstimator.get_params of GridSearchCV(cv=6, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [5, 10, 20, 30, 40, 50], 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'max_features': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0], 'bootstrap': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)>'''

#Best Estimator
'''RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=0.1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)'''

rfl_grid.predict(x_vals)
print("\033[1mTest:\033[0m {0: .4f}".format(np.mean(rfl_grid.predict(x_vals) == y_vals)))

rfl_grid.best_estimator_

[1mTest:[0m  0.8666


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=0.1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [108]:
x_trains, x_vals, y_trains, y_vals = train_test_split(df.copy().drop(columns="class").values, df["class"])
# Create the model and fit the training data on it
#rf_model = RandomForestClassifier(n_estimators=25, max_depth=5).fit(X_train, y_train)

n_estimators = [5, 10, 20, 30, 40, 50]
max_depth = [2, 3, 4,5, 6, 7, 8,  9, 10]
max_features = [0.1, 0.2, 0.4, 0.6, 0.8, 1.]
rf_model = RandomForestClassifier(n_estimators=25, max_depth=5).fit(x_trains, y_trains, bootstrap=True)
# Print the accuracy scores:
print("\033[1mAccuracy Score\033[0m")
print("\033[1mTest:\033[0m {0: .4f}".format(np.mean(rf_model.predict(x_vals) == y_vals)))

[1mAccuracy Score[0m
[1mTest:[0m  0.8446


In [204]:
df.shape

(3086, 641)

In [127]:
preds = rf_model.predict(test_df)
print ("writing predictions...")
util.write_predictions(preds, test_ids, outputfile)
print ("done!")

writing predictions...
done!


In [128]:
preds

array([10,  8,  8, ...,  8,  8, 10])

In [115]:
X_test

<3724x640 sparse matrix of type '<class 'numpy.int64'>'
	with 244452 stored elements in Compressed Sparse Row format>

In [161]:
preds2 = rf_model.predict(df.copy().drop(columns="class"))
preds3 = rfl_grid.predict(x_vals)

In [166]:
for i in range(15):
    print("{0}: {1}".format(i, (sum(preds3 == i)/len(preds3))))

0: 0.022020725388601035
1: 0.011658031088082901
2: 0.0038860103626943004
3: 0.009067357512953367
4: 0.0025906735751295338
5: 0.0051813471502590676
6: 0.022020725388601035
7: 0.009067357512953367
8: 0.6139896373056994
9: 0.0012953367875647669
10: 0.16839378238341968
11: 0.0038860103626943004
12: 0.11398963730569948
13: 0.0025906735751295338
14: 0.010362694300518135


In [170]:
preds = rfl_grid.predict(test_df)
print ("writing predictions...")
util.write_predictions(preds, test_ids, "grid_search_random_forest.csv")
print ("done!")

writing predictions...
done!


In [169]:
preds

array([10,  8, 12, ...,  8, 12, 10])

In [183]:

parameters_improved = {'n_estimators': [50, 100], 'max_depth': [10, 15, 20],
'max_features' : [0.01, 0.05, 0.08, 0.1], 'bootstrap': [True]}

#rename!!
rfl_grid = GridSearchCV(rfl, parameters_improved, cv=6)
rfl_grid.fit(x_trains, y_trains)

GridSearchCV(cv=6, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100], 'max_depth': [10, 15, 20], 'max_features': [0.01, 0.05, 0.08, 0.1], 'bootstrap': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [185]:
# BEST ESTIMATOR
# N_FL 20
rfl_grid.best_estimator_
'''RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=0.08, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)'''

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=0.08, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [189]:
parameters_improved3 = {'n_estimators': [50, 60, 70], 'max_depth': [20, 25, 30],
'max_features' : [0.07, 0.08, 0.09], 'bootstrap': [True]}
rfl_grid3 = GridSearchCV(rfl, parameters_improved3, cv=6)
rfl_grid3.fit(x_trains, y_trains)

GridSearchCV(cv=6, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 60, 70], 'max_depth': [20, 25, 30], 'max_features': [0.07, 0.08, 0.09], 'bootstrap': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [190]:
rfl_grid3.best_estimator_
'''RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=0.08, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)'''

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=0.08, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [202]:
print("\033[1mTrain:\033[0m {0: .4f}".format(np.mean(rfl_grid.predict(x_trains) == y_trains)))
print("\033[1mTest:\033[0m {0: .4f}".format(np.mean(rfl_grid.predict(x_vals) == y_vals)))

[1mTrain:[0m  0.9909
[1mTest:[0m  0.8679


In [203]:
print("\033[1mTrain:\033[0m {0: .4f}".format(np.mean(rfl_grid3.predict(x_trains) == y_trains)))
print("\033[1mTest:\033[0m {0: .4f}".format(np.mean(rfl_grid3.predict(x_vals) == y_vals)))

[1mTrain:[0m  0.9909
[1mTest:[0m  0.8795


In [200]:
preds4 = rfl_grid3.predict(x_vals)
for i in range(15):
    print("{0}: {1}".format(i, (sum(preds4 == i)/len(preds4))))

0: 0.03367875647668394
1: 0.014248704663212436
2: 0.0051813471502590676
3: 0.011658031088082901
4: 0.0038860103626943004
5: 0.0051813471502590676
6: 0.022020725388601035
7: 0.012953367875647668
8: 0.5764248704663213
9: 0.0025906735751295338
10: 0.16968911917098445
11: 0.0051813471502590676
12: 0.11658031088082901
13: 0.006476683937823834
14: 0.014248704663212436


In [201]:
preds = rfl_grid3.predict(test_df)
print ("writing predictions...")
util.write_predictions(preds, test_ids, "grid_search_random_forest_updated.csv")
print ("done!")

writing predictions...
done!


In [199]:
OLSInfluence.summary_frame()

NameError: name 'OLSInfluence' is not defined

In [198]:
## COOKS DISTANCE
from statsmodels.regression import linear_model

#c_data = cooks_data.sort_values(by="cooks_d", ascending=False).copy()


#fig, ax = plt.subplots(1,1, figsize=(10,6))
                       

#ax.set_title("Cooks Distance vs Row")
#ax.scatter(cooks_data.index.values, cooks_data.cooks_d.values);

#fig.savefig("cooks_distance_pre.png")