# Microsoft Kaggle Classification Challenge

[Kaggle competition page](https://www.kaggle.com/c/malware-classification)

## 1. Read and Examine Data Characteristics


In [2]:
### pandas supplies the DataFrame tooling used to inspect the competition files
import pandas as pd

# Training labels: one (Id, Class) pair per row
train_labels = pd.read_csv("./trainLabels/trainLabels.csv")

# Sample submission file: an Id column followed by Prediction1..Prediction9
submission_sample = pd.read_csv("./sampleSubmission/sampleSubmission.csv")


In [2]:
# Size of the sample submission as (rows, columns)
submission_sample.shape

(10873, 10)

In [3]:
# Preview the first rows of the sample submission to see its column layout
submission_sample.head()

Unnamed: 0,Id,Prediction1,Prediction2,Prediction3,Prediction4,Prediction5,Prediction6,Prediction7,Prediction8,Prediction9
0,RYzecbHASsni7N51DrgB,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
1,y0iSXI1lwemrq39buQds,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
2,OCsctgA5MWGXHP1vo0qx,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
3,kGW27noyvwBUJXeMQzgI,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
4,rHiws5yCIjSvcz1M0U8Q,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111


In [4]:
# Size of the training labels as (rows, columns)
train_labels.shape

(10868, 2)

In [5]:
# Preview the first 10 training labels
train_labels.head(10)

Unnamed: 0,Id,Class
0,01kcPWA9K2BOxQeS5Rju,1
1,04EjIdbPV5e1XroFOpiN,1
2,05EeG39MTRrI6VY21DPd,1
3,05rJTUWYAKNegBk2wE8X,1
4,0AnoOZDNbPXIr2MRBSCJ,1
5,0AwWs42SUQ19mI7eDcTC,1
6,0cH8YeO15ZywEhPrJvmj,1
7,0DNVFKwYlcjO7bTfJ5p1,1
8,0DqUX5rkg3IbMY6BLGCE,1
9,0eaNKwluUmkYdIvZ923c,1


## 2. Feature Engineering

## 3. Mock data 

In [70]:
# Smoke-test the xgboost library end to end on a small mock dataset
import xgboost as xgb
import numpy as np
import pandas as pd

# Load the mock data; the file has no header row, so columns are numbered 0..N
mock_data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)

# Column 0 is used as the class label
label = mock_data[0]

# Feature matrix: every column except the label, as a NumPy array.
# `.values` and the explicit `axis=` keyword are used because newer pandas
# releases no longer provide `as_matrix()` or accept a positional axis in `drop`.
data = mock_data.drop(0, axis=1).values

# Wrap features and labels in xgboost's DMatrix container
dtrain = xgb.DMatrix(data, label=label)

# Set up hyper parameters.
# NOTE(review): num_class is 4 although the labels presumably run 1..3, so
# probability column 0 would be unused -- confirm before reusing these settings.
params = {'max_depth':2, 'eta':1.0, 'silent':1, 'colsample_bytree': 1, 'objective':'multi:softprob',
          'min_child_weight': 2, 'num_class': 4}

num_round = 10

# Train the model
bst = xgb.train(params, dtrain, num_round)

# Predict on the training data itself (this only checks that the pipeline runs)
predictions = bst.predict(dtrain)

# Transform the result: one row per sample, one probability column per class
pd.DataFrame(data=predictions)



Unnamed: 0,0,1,2,3
0,0.001424,0.995531,0.001612,0.001433
1,0.001824,0.994200,0.002142,0.001835
2,0.001824,0.994277,0.002064,0.001835
3,0.001423,0.994965,0.001611,0.002000
4,0.035030,0.890084,0.039638,0.035247
5,0.001423,0.994965,0.001611,0.002000
6,0.001824,0.994277,0.002064,0.001835
7,0.001425,0.995909,0.001232,0.001434
8,0.001820,0.992385,0.003963,0.001832
9,0.001813,0.988179,0.007462,0.002547


## 4. Model Evaluation - Using the Multi-Class Log Loss Metric
[Competition evaluation details](https://www.kaggle.com/c/malware-classification/details/evaluation)

In [31]:
import numpy as np

### Refer to file test_multi_class_loss.py with pytest 2.8.5 to run tests

# Make multi class log loss function
def multi_class_log_loss(actuals, predictions):
    """ Implementation of multiclass log loss:
    https://www.kaggle.com/wiki/MultiClassLogLoss.

    The loss is the mean, over all samples, of the negative log of the
    probability predicted for the sample's true class.

    Parameters
    -------------
    ** actuals = array_like, shape = [n_samples] or [n_samples, 1],
              each_entry_contains_true_class: integer from [0, n_classes - 1]

    ** predictions = array_like, shape = [n_samples, n_classes],
                     each_row_contains_predicted_probabilities: float in [0,1] and sum approximately to 1

    Return Type
    -------------
    ** loss = float
    """
    # Flatten the labels so that a column vector of shape [n_samples, 1] selects
    # exactly one probability per row instead of broadcasting against the row
    # indices into an [n_samples, n_samples] selection.
    true_classes = np.asarray(actuals).astype(int).ravel()
    probabilities = np.asarray(predictions, dtype=float)

    n_samples = true_classes.shape[0]

    # Probability assigned to the true class of each sample. Only these entries
    # contribute to the loss, so the other columns are never passed to log();
    # this avoids 0 * log(0) = nan when a non-true class is predicted as exactly 0.
    true_class_probabilities = probabilities[np.arange(n_samples), true_classes]

    # Floor at 1e-15 (the lower bound used by the Kaggle metric) so that a zero
    # probability on the true class gives a large finite penalty instead of inf.
    true_class_probabilities = np.maximum(true_class_probabilities, 1e-15)

    # Sum log
    sum_log = np.sum(np.log(true_class_probabilities))

    loss = (-1.0)/(n_samples)*sum_log
    return loss



In [66]:
# Using Cross Validation: score xgboost on the mock data with 5-fold CV
# `KFold` is imported from sklearn.model_selection because the older
# sklearn.cross_validation module is not available in newer scikit-learn releases.
import numpy as np
from sklearn.model_selection import KFold
import xgboost as xgb
import pandas as pd

# Set up mock data to test xgboost
mock_data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)

# Convert DataFrame to equivalent Numpy-array representation (all columns but the label).
# `.values` / `axis=` replace `as_matrix()` and the positional axis, which newer
# pandas releases no longer accept.
data = mock_data.drop(0, axis=1).values

# Set up labels (column 0)
label = mock_data[0]

# Set up KFold: 5 consecutive, unshuffled folds.
# NOTE(review): if the rows of this file are ordered by class, unshuffled folds
# will be dominated by a single class -- consider shuffle=True with a fixed
# random_state (or StratifiedKFold) and re-check the score.
kf = KFold(n_splits=5, shuffle=False)

# Set up hyper parameters
params = {'max_depth': 4, 'eta':1.0, 'silent':1, 'colsample_bytree': 1, 'objective':'multi:softprob',
          'min_child_weight': 2, 'num_class': 4}

num_round = 10

# Collect one loss per fold in a list; converted to an array once after the loop
fold_losses = []

# Evaluate using log loss function for multi-class classification
for train, test in kf.split(data):
    # `train` / `test` are positional row indices, hence `.iloc` on the label Series
    train_set = data[train]
    label_train = label.iloc[train]
    test_set = data[test]
    label_test = label.iloc[test].values

    # Transform into the proper data shape
    train_set = xgb.DMatrix(train_set, label=label_train)
    test_set = xgb.DMatrix(test_set)

    # Train the model on this fold's training rows
    bst = xgb.train(params, train_set, num_round)

    # Predict class probabilities for the held-out rows
    predictions = bst.predict(test_set)

    # Calculate log loss on the held-out rows
    fold_losses.append(multi_class_log_loss(label_test, predictions))

scores = np.array(fold_losses)

# Mean multi class log loss across the folds
scores.mean()

0.22731585854657052

In [50]:
# Sanity check: values appended one at a time to an empty array average as expected
a = np.array([])
for value in (1, 2, 1):
    a = np.append(a, value)
a.mean()

1.3333333333333333

## 5. Final Submission

In [17]:
# import xgboost
import xgboost as xgb

# Train the model on the full training data

# Predict class probabilities for the test set

# Reshape the predictions into the expected submission format

# Write the submission CSV file