# Summary of the models that work well

## Step 0: Import libraries and define utilities

In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
from sklearn.mixture import GMM
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV
%matplotlib inline

In [6]:
# prediction and saving
def test_save(estimator, test):
    num = test.shape[0]
    test_label = pd.DataFrame({"Id": range(1,num+1), "Solution": estimator.predict(test)}, index=None)
    test_label.to_csv('./test_label-'+str(datetime.date.today())+'.csv', index = None)

In [36]:
# gridsearchcv wrapper
def gridsearchcv_wrapper(model, parameters, x, y, scores = ['accuracy']):
    """Run one grid search per scoring metric and print a report for each.

    The data is split once into a development part and a 10% evaluation part
    (``random_state = 0``). For every entry in ``scores`` a ``GridSearchCV``
    with ``cv=3`` is fitted on the development part, its per-candidate scores
    are printed, and a ``classification_report`` is printed for its
    predictions on the evaluation part.

    model: model to be optimized.
    parameters: parameter space for ``GridSearchCV``.
    x, y: training set (features and targets).
    scores: iterable of scoring functions/names; the default list is only
        read, never mutated.

    Returns the ``best_estimator_`` of the search run for the LAST entry of
    ``scores``.

    Raises ValueError if ``scores`` is empty (previously this surfaced as an
    unbound-variable error on the final ``return``).
    """
    # Materialize once so generators are supported and emptiness can be checked
    # before any expensive work is started.
    scores = list(scores)
    if not scores:
        raise ValueError("scores must contain at least one scoring function")

    # get train and test data
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        x, y, random_state = 0, test_size = 0.1)

    for score in scores:
        # print("") emits a blank line on both Python 2 and Python 3; a bare
        # `print` is a no-op on Python 3 and `print()` prints "()" on Python 2.
        print("# Tuning hyper-parameters for %s" % score)
        print("")

        clf = GridSearchCV(model, parameters, cv=3,
                           scoring=score)
        clf.fit(x_train, y_train)

        print("Best parameters set found on development set:")
        print("")
        print(clf.best_params_)
        print("")
        print("Grid scores on development set:")
        print("")
        # `fold_scores` deliberately does not reuse the name `scores`, which
        # is the parameter driving the outer loop.
        for params, mean_score, fold_scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, fold_scores.std() * 2, params))
        print("")

        print("Detailed classification report:")
        print("")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print("")
        y_true, y_pred = y_test, clf.predict(x_test)
        print(classification_report(y_true, y_pred))
        print("")
    return clf.best_estimator_

## Step 1: Get the data

In [2]:
# Read the three CSV files; header=None means no row is parsed as a header.
train = pd.read_csv('train.csv', header=None)
label = pd.read_csv('trainLabels.csv', header=None)
test = pd.read_csv('test.csv', header=None)

# Both feature tables get the same 40 column names, numbered 1..40;
# the label table gets a single column named 'label'.
train.columns = test.columns = range(1, 41)
label.columns = ['label']

## Step 2: GMM + Random Forest
### Step 2.1: GMM

In [4]:
# Select a GMM on the combined train + test features (labels are not used).
# The loop is copied from the scikit-learn example below, except that AIC is
# used as the selection criterion (Wiki suggests that AIC is better than BIC):
# http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_selection.html#example-mixture-plot-gmm-selection-py
# pd.concat is equivalent to train.append(test) and also works on pandas
# versions where DataFrame.append no longer exists.
x_all = pd.concat([train, test]).values

lowest_aic = np.inf  # np.infty is only an alias of np.inf and is gone in newer NumPy
aic = []
n_components_range = range(2, 20)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM. The seed is fixed so that the
        # selected model is the same on every run, in line with the seeded
        # train/test split and random forest used elsewhere in this notebook.
        gmm = GMM(n_components=n_components, covariance_type=cv_type, random_state=0)
        gmm.fit(x_all)
        aic.append(gmm.aic(x_all))
        # Keep the candidate with the lowest AIC seen so far.
        if aic[-1] < lowest_aic:
            lowest_aic = aic[-1]
            best_gmm = gmm

### Step 2.2: Random Forest

In [63]:
# Fit a random forest on the GMM posterior probabilities (best_gmm.predict_proba)
# rather than on the raw features, holding out part of the labelled data.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(train, label, random_state = 0)
x_train_gmm = best_gmm.predict_proba(x_train)
# both classifiers seem to work, but the first one is slightly better. (from Elena Cuoco)
clf=RandomForestClassifier(n_estimators=1000, criterion='entropy', max_depth=5, min_samples_split=1,
  min_samples_leaf=3, max_features='auto',    bootstrap=False, oob_score=False, n_jobs=1, random_state=33,
  verbose=0)
# clf=RandomForestClassifier(n_estimators=1000, criterion='entropy', max_depth=5)

clf.fit(x_train_gmm, y_train.values.ravel())
# The hold-out accuracy used to be computed and then dropped, because it was
# not the last expression of the cell; print it so the result is visible.
# y_test is flattened the same way as y_train above.
print("Hold-out accuracy: %0.5f"
      % clf.score(best_gmm.predict_proba(x_test), y_test.values.ravel()))
test_save(clf, best_gmm.predict_proba(test)) # 0.99161 final score

### Step 2.3: PCA + GMM + SVM

In [59]:
# PCA -> GMM -> SVM: project with PCA, turn the projection into GMM posterior
# probabilities, then grid-search an RBF SVM on those probabilities.
pca = PCA(n_components=12)  # This should be optimized by gridsearchcv
pca.fit(x_all)  # x_all (train + test features) is built in the GMM selection cell
raw_pca = pca.transform(train)
tuned_parameters = {'C': np.arange(0.1, 5, 0.5), 'gamma': np.arange(0.1, 1, 0.1), 'kernel': ['rbf']}
# n_components / covariance_type should be optimized by gridsearchcv as well.
# The seed is fixed so that repeated runs give the same mixture, in line with
# the seeded split inside gridsearchcv_wrapper.
gmm = GMM(n_components= 4, covariance_type= 'full', random_state=0)
gmm.fit(raw_pca)
x_train_gmm = gmm.predict_proba(raw_pca)
# Note: the targets must be passed as a 1-D Series (label['label']);
# passing the whole `label` DataFrame does not work here.
clf = gridsearchcv_wrapper(svm.SVC(), tuned_parameters, x_train_gmm, label['label'])
test_save(clf, gmm.predict_proba(pca.transform(test))) # 0.99113 final score (recorded from an earlier, unseeded run)

# Tuning hyper-parameters for accuracy
()
Best parameters set found on development set:

{'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.10000000000000001}
()
Grid scores on development set:
()
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.10000000000000001}
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.20000000000000001}
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.30000000000000004}
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.40000000000000002}
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.5}
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.60000000000000009}
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.70000000000000007}
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.80000000000000004}
0.996 (+/-0.008) for {'kernel': 'rbf', 'C': 0.100000000