# Microsoft Kaggle Classification Challenge

[Link](https://www.kaggle.com/c/malware-classification)

## 1. Read and Examine Data Characteristics


In [2]:
### Using pandas as Python Data Analysis Tools
import pandas as pd

# Load the sample submission CSV with pandas' default header handling.
submission_sample = pd.read_csv("./sampleSubmission/sampleSubmission.csv")

# Load the training labels. header=None makes pandas name the columns with
# integers (0, 1, ...) instead of taking names from the first row of the file.
train_labels = pd.read_csv("./trainLabels/trainLabels.csv", header=None)


In [11]:
# submission_sample

In [21]:
# Boolean-mask filter: keep only the rows whose column 1 equals 1.
train_labels[train_labels[1] == 1]

Unnamed: 0,0,1
0,01kcPWA9K2BOxQeS5Rju,1
1,04EjIdbPV5e1XroFOpiN,1
2,05EeG39MTRrI6VY21DPd,1
3,05rJTUWYAKNegBk2wE8X,1
4,0AnoOZDNbPXIr2MRBSCJ,1
5,0AwWs42SUQ19mI7eDcTC,1
6,0cH8YeO15ZywEhPrJvmj,1
7,0DNVFKwYlcjO7bTfJ5p1,1
8,0DqUX5rkg3IbMY6BLGCE,1
9,0eaNKwluUmkYdIvZ923c,1



## 2. Feature Engineering

In [8]:
import re

# Utility functions
def get_fileID_from_file_path(path):
    '''Extract the 20-character file id from the path of a .asm or .bytes file.

       The id is the 20 characters that immediately precede the first
       '.asm' or '.bytes' occurrence in the path.

       Parameters: path -- string holding the path (or URI) of the file
       Returns:    the 20-character id as a string
       Raises:     AttributeError when the path contains no such occurrence

       E.g: get_fileID_from_file_path('home/livetolove128/01azqd4InC7m9JpocGv5.asm'): '01azqd4InC7m9JpocGv5'
    '''
    # Raw string: '\.' must reach the regex engine as an escaped dot instead of
    # being an invalid escape sequence in a normal string literal. The capture
    # group yields the id directly, so no second search is needed.
    match = re.search(r'(.{20})\.(asm|bytes)', path)
    return match.group(1)

def count_new_line_character(string):
    '''Count the new line characters contained in a string.

       Takes the text to inspect and returns how many new line
       characters occur in it (0 when there are none).

       E.g: count_new_line_character('nguyen \n heelllo \n'): 2
    '''
    # Walk the matches lazily instead of materialising a list of them
    return sum(1 for _ in re.finditer('\\n', string))


In [9]:
# Helper functions
def get_number_lines(file):
    '''Map one (path, content) record to (file id, {file type: line count}).

       Parameters: file -- indexable pair; file[0] is the file path and
                   file[1] is the whole file content as a string
       Returns:    tuple (id, {type_key: n}) where n is the number of new line
                   characters in the content and type_key is '.asm: ' for .asm
                   files, '.bytes' for .bytes files and '' otherwise
    '''
    path, content = file[0], file[1]
    num_lines = count_new_line_character(content)

    # Decide on the real extension. The previous regex patterns '.asm' and
    # '.bytes' had an unescaped dot (matches any character) and were searched
    # anywhere in the path, so a directory name could decide the file type.
    # The dictionary keys are kept exactly as before because later steps parse
    # the text produced from them.
    if path.endswith('.asm'):
        file_type = '.asm: '
    elif path.endswith('.bytes'):
        file_type = '.bytes'
    else:
        file_type = ''

    return (get_fileID_from_file_path(path), {file_type: num_lines})


# Testing out with Spark
# Read the files under dataSample/; get_number_lines indexes each record as
# [0] = path and [1] = content.
files = sc.wholeTextFiles('dataSample/')

# Ask Spark to keep this RDD around once computed, since it is reused below
files.persist()

# One (file id, {file type: new line count}) record per input file
test = files.map(get_number_lines)

# collect() brings every mapped record back to the driver as a Python list
# (no filtering happens here)
test.collect()

[('01SuzwMJEIXsK7A8dQbl', {'.asm: ': 15282}),
 ('01kcPWA9K2BOxQeS5Rju', {'.bytes': 12288}),
 ('0qPGt4cRVk9NoiJgubf2', {'.bytes': 14848}),
 ('0qPGt4cRVk9NoiJgubf2', {'.asm: ': 127320}),
 ('01azqd4InC7m9JpocGv5', {'.bytes': 90624}),
 ('04BfoQRA6XEshiNuI7pF', {'.bytes': 195328}),
 ('0qjuDC7Rhx9rHkLlItAp', {'.bytes': 45056}),
 ('0ACDbR5M3ZhBJajygTuf', {'.asm: ': 274975}),
 ('04BfoQRA6XEshiNuI7pF', {'.asm: ': 4799}),
 ('02IOCvYEy8mjiuAQHax3', {'.asm: ': 78758}),
 ('0qjuDC7Rhx9rHkLlItAp', {'.asm: ': 235532}),
 ('01IsoiSMh5gxyDYTl4CB', {'.asm: ': 161528}),
 ('01azqd4InC7m9JpocGv5', {'.asm: ': 1392154}),
 ('0ACDbR5M3ZhBJajygTuf', {'.bytes': 98816}),
 ('01SuzwMJEIXsK7A8dQbl', {'.bytes': 7936}),
 ('01IsoiSMh5gxyDYTl4CB', {'.bytes': 118528}),
 ('02IOCvYEy8mjiuAQHax3', {'.bytes': 14336}),
 ('01kcPWA9K2BOxQeS5Rju', {'.asm: ': 1276})]

In [41]:
# Work with processed data returned from cloud Spark cluster
import csv

train_processed = sc.textFile('train_processed.csv')
test_processed = sc.textFile('test_processed.csv')

# Transform to the desired form: strip every single quote from each line
train_processed = train_processed.map(lambda x: re.sub('\'', '', x))

train_transfromed = train_processed.collect()

test_processed = test_processed.map(lambda x: re.sub('\'', '', x))

test_transfromed = test_processed.collect()

# Output to csv files.
# `with` closes (and therefore flushes) each file as soon as it is written;
# the files are read back right afterwards, so leaving them open could hand
# truncated data to the next step. The loop variables are deliberately not
# called `train` / `test`, so they do not overwrite the notebook-level names.
with open('train_transformed.csv', 'w') as file1:
    for train_line in train_transfromed:
        file1.write('%s\n' % train_line)

with open('test_transformed.csv', 'w') as file2:
    for test_line in test_transfromed:
        file2.write('%s\n' % test_line)

In [102]:
# Transformation
# Read the two transformed CSV files as RDDs with one element per text line
train = sc.textFile('train_transformed.csv')
test = sc.textFile('test_transformed.csv')

# Helper functions
def get_key_value(x):
    '''Turn one comma-separated line into a (key, value) pair.

       The first field is the key and the second field is the value;
       any further fields are ignored.
    '''
    fields = x.split(',')
    key, value = fields[0], fields[1]
    return key, value

def get_csv_like_string(x):
    '''Build the csv-like line "id,asm_count,bytes_count" for one file id.

       Parameters: x -- pair (id, entries); every entry is a string such as
                   ' {.asm: : 212561}' or ' {.bytes: 160256}', in any order
       Returns:    string 'id,<asm digits>,<bytes digits>'; a count whose entry
                   is missing is left empty instead of raising IndexError
    '''
    file_id = x[0]
    asm_count = ''
    bytes_count = ''
    for entry in x[1]:
        # Raw strings: '\D' and '\.' are regex escapes, not string escapes.
        # Keeping only the digits of the entry leaves the line count.
        digits = re.sub(r'\D', '', entry)
        if re.search(r'\.asm', entry) is not None:
            asm_count = digits
        elif re.search(r'\.bytes', entry) is not None:
            bytes_count = digits

    return file_id + ',' + asm_count + ',' + bytes_count

# Transform
# Key every line by its first field, then gather all values that share a key
# into one Python list per key
train = train.map(get_key_value).groupByKey().mapValues(list)
test = test.map(get_key_value).groupByKey().mapValues(list)


('zKER6PbI2rafDWCpkZm8', [' {.asm: : 212561}', ' {.bytes: 160256}'])

In [185]:
def make_class_labels_key_value_pair(x):
    '''Split one comma-separated label line into a pair.

       Returns (first field, second field); extra fields are dropped.
    '''
    pieces = x.split(',')
    return pieces[0], pieces[1]

def make_train_data_into_correct_shape(x):
    '''Build the csv-like line "id,asm_count,bytes_count,class" for one id.

       Parameters: x -- pair (id, (entries, labels)); every entry is a string
                   such as ' {.asm: : 212561}' or ' {.bytes: 160256}' (in any
                   order) and labels is a sequence whose first element is the
                   class label as a string
       Returns:    string 'id,<asm digits>,<bytes digits>,<class>'; a count
                   whose entry is missing is left empty instead of raising
       Raises:     IndexError when labels is empty
    '''
    file_id = x[0]
    entries, labels = x[1][0], x[1][1]

    asm_count = ''
    bytes_count = ''
    for entry in entries:
        # Raw strings: '\D' and '\.' are regex escapes, not string escapes.
        digits = re.sub(r'\D', '', entry)
        if re.search(r'\.asm', entry) is not None:
            asm_count = digits
        elif re.search(r'\.bytes', entry) is not None:
            bytes_count = digits

    return file_id + ',' + asm_count + ',' + bytes_count + ',' + labels[0]

# Transformation
# Get class labels into the correct form
class_labels = sc.textFile('trainLabels/trainLabels.csv')
class_labels = class_labels.map(lambda x: re.sub('"', '', x)) # Remove double quote
class_labels = class_labels.map(make_class_labels_key_value_pair)

new_test = test.map(get_csv_like_string).collect()
new_test.insert(0, 'id, .asm, .bytes')

# cogroup pairs, per id, the grouped count entries with the class labels.
# NOTE(review): an id that appears only in class_labels (for example a header
# row in trainLabels.csv, if the file has one) makes list(x[0])[0] raise
# IndexError -- confirm the label file has no unmatched keys.
new_train = train.cogroup(class_labels).mapValues(list)
new_train = new_train.mapValues(lambda x: (list(x[0])[0], list(x[1])))
new_train = new_train.map(make_train_data_into_correct_shape).collect()

# The header keeps the blanks after the commas: readers of this file look the
# columns up under names such as ' class'.
new_train.insert(0, 'id, .asm, .bytes, class')


# Output to file. `with` guarantees each file is flushed and closed, so the
# csv files are complete when they are read back later.
with open('train_transformed_1.csv', 'w') as file1:
    for i in new_train:
        file1.write('%s\n' % i)

with open('test_transformed_1.csv', 'w') as file2:
    for j in new_test:
        file2.write('%s\n' % j)


## 2.1. Extracting n-gram opcode

In [11]:
## Utilities functions
import re

def get_op_code_from_asm_string(string):
    '''Return the opcode mnemonic of one .asm line, or '' when there is none.

       A line is only inspected when is_containing_op_code accepts it; the
       mnemonic is the first run of lowercase letters preceded by a blank,
       searched from column 20 onwards.
    '''
    if not is_containing_op_code(string):
        return ''

    first_word = re.search(' [a-z]+', string[20:])
    if first_word is None:
        return ''

    # Drop the blank that the pattern requires in front of the word
    return first_word.group(0).replace(" ", "")

def is_containing_op_code(string):
    '''Tell whether an .asm line carries instruction bytes.

       Returns True when the first 35 characters contain ':' followed by eight
       uppercase hex digits, one to three blanks and two more uppercase hex
       digits; returns False otherwise.
    '''
    line_start = string[0:35]
    return re.search(':[0-9A-F]{8} {1,3}[0-9A-F]{2}', line_start) is not None

def get_all_op_codes_from_asm_content(string):
    '''Extract the sequence of opcode mnemonics from the content of an .asm file.

       Parameters: string -- whole file content; lines are separated by a new
                   line character
       Returns:    the mnemonics in file order, joined by single blanks, with
                   no leading or trailing blank ('' when none is found)
    '''
    # get_op_code_from_asm_string already returns '' for lines without
    # instruction bytes, so no separate is_containing_op_code check is needed.
    op_codes = (get_op_code_from_asm_string(line) for line in string.split('\n'))

    # Joining only the non-empty mnemonics avoids both the repeated string
    # concatenation and the trailing blank that used to survive when the last
    # instruction lines had no mnemonic.
    return ' '.join(op_code for op_code in op_codes if op_code)

In [12]:
## Extracting
# (path, content) records for the .asm files under dataSample/
text = sc.wholeTextFiles('dataSample/*.asm')

# Replace each path by the file id extracted from it
text = text.map(lambda x: (get_fileID_from_file_path(x[0]), x[1]))

# Replace each file content by its blank-separated opcode sequence
text = text.map(lambda x: (x[0], get_all_op_codes_from_asm_content(x[1])))

# Bring every (id, opcode string) pair back to the driver
text.collect()

[('01SuzwMJEIXsK7A8dQbl',
  'dd dd dd dd dd dd dd dd dd dd dd dd db byte dd db db dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd dd push mov sub mov mov mov movzx mov movzx sub mov cmp jnz mov movsx cmp jz mov add mov mov add mov jmp mov mov pop retn dd dd dd dd dd dd dd dd dd db push mov sub mov mov mov mov mov mov mov mov cmp jnz mov jmp mov movzx cmp jz cmp jz sub mov jmp mov mov add mov mov cmp jz mov jmp mov add mov mov add mov mov movzx cmp jz or cmp jz mov movzx cmp jz mov jmp mov add mov cmp jz mov mov mov cmp jz mov mov cmp jz mov cmp jz mov mov mov mov mov pop retn push mov sub mov mov mov mov mov mov mov mov mov and cmp jz mov jmp mov push lea xor push push mov push call mov add mov cmp jnz mov jmp mov cmp jnz mov jmp mov mov add mov mov mov add mov mov jmp mov add mov mov add add mov mov mov cmp jnb mov mov mov or add cmp jz and mov and lea xor mov mov add c

In [7]:
import re
# Scratch check: collapse every run of blanks into a single blank
test_str = 'df   df fb df dr ds'
test = re.sub(' +', ' ', test_str)
# Slicing from index 1 drops the first character whatever it is -- here the
# 'd' of the first 'df', not a blank
test[1:]

'f df fb df dr ds'

In [32]:

import re

# Scratch input; this cell only binds a name and evaluates no expression.
string = 'dv   '

['dsa 3', ' 3', ' 3']

## 3. Training  

In [245]:
# Using mock data to test xgboost library
import xgboost as xgb
import numpy as np
import pandas as pd

# Set up mock data to test xgboost
train_data = pd.read_csv('train_transformed_1.csv')

# Get the label, shifted by one so that the smallest class id becomes 0.
# `.values` replaces DataFrame/Series.as_matrix(), which newer pandas releases
# no longer provide; it returns the same NumPy array.
label = train_data[' class'] - 1
label = label.values

file_id = train_data['id']

# Convert DataFrame to equivalent Numpy-array representation
# (axis is passed by keyword: the positional form is gone in newer pandas)
data = train_data.drop(['id', ' class'], axis=1).values

# Get the data in shape of DMatrix data structure to fit xgboost
dtrain = xgb.DMatrix(data, label=label)

# Set up hyper parameters
params = {'max_depth':2, 'eta':1.0, 'silent':1, 'colsample_bytree': 1, 'objective':'multi:softprob',
          'min_child_weight': 2, 'num_class': 9}

num_round = 10

# Train the model
bst = xgb.train(params, dtrain, num_round)

# Make the predictions (on the training data itself: this is only a smoke test)
predictions = bst.predict(dtrain)

# Transform the result: one row per sample, one probability column per class
pd.DataFrame(data=predictions)



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.133574,0.016546,0.000676,4.513572e-03,0.007290,0.195532,0.005236,0.624921,0.011712
1,0.000200,0.992169,0.007279,1.823519e-06,0.000185,0.000010,0.000025,0.000067,0.000062
2,0.018838,0.072467,0.000074,1.492753e-03,0.000159,0.002931,0.000186,0.035148,0.868705
3,0.159718,0.062583,0.000244,9.394404e-03,0.003281,0.742157,0.002002,0.013374,0.007247
4,0.000027,0.000544,0.998949,2.817925e-07,0.000072,0.000160,0.000035,0.000195,0.000017
5,0.002242,0.008992,0.988051,6.861882e-06,0.000281,0.000047,0.000062,0.000089,0.000229
6,0.011519,0.024899,0.000053,9.281724e-02,0.000094,0.230859,0.000133,0.015777,0.623848
7,0.014712,0.048989,0.000639,1.001038e-05,0.000072,0.000068,0.005266,0.000130,0.930115
8,0.000048,0.000042,0.999260,4.960502e-07,0.000127,0.000282,0.000061,0.000149,0.000030
9,0.000048,0.000042,0.999260,4.960502e-07,0.000127,0.000282,0.000061,0.000149,0.000030


## 4. Model Evaluation - Using Multi Class Log Loss Metrics
[Link](https://www.kaggle.com/c/malware-classification/details/evaluation)

In [2]:
import numpy as np

### Refer to file test_multi_class_loss.py with pytest 2.8.5 to run tests

# Make multi class log loss function
def multi_class_log_loss(actuals, predictions, eps=1e-15):
    """ Implementation of multiclass log loss:
    https://www.kaggle.com/wiki/MultiClassLogLoss.

    Parameters
    -------------
    ** actuals = array-like, shape = [n_samples] or [n_samples, 1],
              each entry is the true class: integer from [0, n_classes - 1]

    ** predictions = array-like, shape = [n_samples, n_classes],
                     each row contains predicted probabilities: float in [0,1]

    ** eps = float, probabilities are clipped to [eps, 1 - eps] before the
             logarithm is taken so that the loss always stays finite

    Return Type
    -------------
    ** loss = float

    Raises
    -------------
    ** ValueError when there are no samples
    """
    probabilities = np.asarray(predictions, dtype=float)
    # Flatten so that a column vector of labels indexes one column per row
    # instead of broadcasting into an n_samples x n_samples selection.
    true_classes = np.asarray(actuals).astype(int).ravel()

    n_samples = true_classes.shape[0]
    if n_samples == 0:
        raise ValueError('multi_class_log_loss needs at least one sample')

    # Clip before the logarithm: log(0) is -inf, and the previous
    # mask * log(p) product turned every zero probability (even one of a
    # non-true class) into NaN.
    clipped = np.clip(probabilities, eps, 1 - eps)

    # Only the probability assigned to the true class contributes to the loss
    true_class_probabilities = clipped[np.arange(n_samples), true_classes]

    loss = -np.mean(np.log(true_class_probabilities))
    return loss



In [14]:
# Using Cross Validation
# KFold lives in sklearn.model_selection; the sklearn.cross_validation module
# has been removed from scikit-learn.
from sklearn.model_selection import KFold
import xgboost as xgb
import pandas as pd

# Set up mock data to test xgboost
train_data = pd.read_csv('train_transformed_1.csv')

# Get the label as a NumPy array, shifted by one so the smallest class id is 0
label = (train_data[' class'] - 1).values

file_id = train_data['id']

# Convert DataFrame to equivalent Numpy-array representation
# (`.values` replaces the removed as_matrix(); axis is passed by keyword)
data = train_data.drop(['id', ' class'], axis=1).values

# Set up KFold: 10 contiguous, unshuffled folds over the rows of `data`
kf = KFold(n_splits=10, shuffle=False)

# Set up hyper parameters
params = {'max_depth': 4, 'eta':1.0, 'silent':1, 'colsample_bytree': 1, 'objective':'multi:softprob',
          'min_child_weight': 2, 'num_class': 9}

num_round = 10

# One log loss value per fold
fold_scores = []

# Evaluate using log loss function for multi-class classification.
# The index arrays get their own names so that they do not overwrite the
# notebook-level `train` / `test` variables.
for train_idx, test_idx in kf.split(data):
    label_test = label[test_idx]

    # Transform into the proper data shape
    train_set = xgb.DMatrix(data[train_idx], label=label[train_idx])
    test_set = xgb.DMatrix(data[test_idx])

    # Train the model
    bst = xgb.train(params, train_set, num_round)

    # Predict
    predictions = bst.predict(test_set)

    # Calculate log loss
    fold_scores.append(multi_class_log_loss(label_test, predictions))

scores = np.array(fold_scores)

# Print the mean multi class log loss

scores.mean()

0.25985209445044843

1.3333333333333333

## 5. Final Submission

In [284]:
# import xgboost (and pandas, so this cell does not depend on an earlier one)
import pandas as pd
import xgboost as xgb

### Train the model into training data
# Set up mock data to test xgboost
train_data = pd.read_csv('train_transformed_1.csv')
test_data = pd.read_csv('test_transformed_1.csv')

# Get the label, shifted by one so that the smallest class id becomes 0
train_label = train_data[' class'] - 1

# File id for test_set
file_ids = test_data['id']

# Convert DataFrame to equivalent Numpy-array representation
# (`.values` replaces the removed as_matrix(); axis is passed by keyword)
train_data = train_data.drop(['id', ' class'], axis=1).values
test_data = test_data.drop(['id'], axis=1).values
train_data = xgb.DMatrix(train_data, label=train_label)
test_data = xgb.DMatrix(test_data)

# Set up hyper parameters
params = {'max_depth': 4, 'eta':1.0, 'silent':1, 'colsample_bytree': 1, 'objective':'multi:softprob',
          'min_child_weight': 2, 'num_class': 9}

num_round = 10

# Training
bst = xgb.train(params, train_data, num_round)

# Predict the test set
predictions = bst.predict(test_data)

predictions = pd.DataFrame(data=predictions)

# Transform the result into expected form: column k of the predictions becomes
# 'Prediction<k+1>' and the file ids go in front as 'Id'. Deriving the names
# from the frame keeps the column order explicit and follows the number of
# predicted classes.
submissions = predictions.rename(columns=lambda k: 'Prediction%d' % (k + 1))
submissions.insert(0, 'Id', file_ids)

# Get submitted csv file
submissions.to_csv('submissions.csv', index=False)