In [26]:
import pandas as pd
import pandas_ml as pdml
import numpy as np
from sklearn import cross_validation, metrics   #Additional scklearn functions

from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.metrics import roc_auc_score as AUC

import timeit

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

# Keep DataFrame previews short. This option used to be set twice in a row
# (8, then 6); only the last value ever took effect, so it is set once here.
pd.set_option('display.max_rows', 6)

# train_df = pd.read_csv('data.csv',header=0, index_col=0, nrows=20000)
# Load the training and quiz (test) sets; the first CSV column becomes the index.
train_df = pd.read_csv('data.csv', header=0, index_col=0)
test_df = pd.read_csv('quiz.csv', header=0, index_col=0)
print(train_df.shape)  # shape before rows with missing values are dropped
# convert to pdml.ModelFrame

#train_df = pdml.ModelFrame(train_df,target='label')
train_df = train_df.dropna()
#test_df = pdml.ModelFrame(test_df)
# NOTE(review): any quiz row containing a missing value is removed here and so
# never receives a prediction -- confirm that is intended.
test_df = test_df.dropna()

print(test_df.shape)  # shape after rows with missing values are dropped

(126837, 52)
(31709, 51)


In [27]:
from sklearn.preprocessing import PolynomialFeatures

# The polynomial feature is added after the impute step, so that it is built
# from the filled-in medians rather than from missing values.
def add_polynomial_feauture(A):
    """Add the interaction column '68' = column '59' * column '60'.

    Parameters
    ----------
    A : pandas.DataFrame
        Frame that contains the columns '59' and '60'.

    Returns
    -------
    pandas.DataFrame
        The same object ``A``; the new column is written onto it in place
        (no copy is made).

    Raises
    ------
    KeyError
        If column '59' or '60' is missing.
    """
    A['68'] = A['59'] * A['60']
    return A



In [28]:
# Generate new features here ...
# from sklearn.preprocessing import FunctionTransformer

def new_bigram(A):
    """Add two "bigram" columns that join pairs of existing columns as text.

    Both sides of each pair are converted with ``str`` before joining, so the
    source columns do not have to hold strings already.

    Parameters
    ----------
    A : pandas.DataFrame
        Frame that contains the columns '7', '8', '16' and '17'.

    Returns
    -------
    pandas.DataFrame
        The same object ``A``, with columns '65' and '66' written in place.
    """
    # column_key = 65 is a bigram of feature 7 and 8
    A['65'] = A['7'].map(str) + '-' + A['8'].map(str)
    # column_key = 66 is a bigram of feature 16 and 17
    A['66'] = A['16'].map(str) + '-' + A['17'].map(str)

    return A

# Attach the bigram columns to the training frame first, then to the quiz frame.
train_df = train_df.pipe(new_bigram)
test_df = test_df.pipe(new_bigram)


In [31]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Fill missing values column by column.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its median. The per-column fill values are
    computed in ``fit`` and stored on ``self.fill``.
    """

    def fit(self, X, y=None):
        """Compute one fill value per column of ``X``; ``y`` is ignored.

        Returns ``self`` so that calls can be chained.
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the fitted fill values."""
        return X.fillna(self.fill)

    @staticmethod
    def _fill_value(column):
        """Return the most frequent value (object dtype) or the median."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() skips missing values, so a column with nothing
            # but missing values has no mode; leave such a column unfilled
            # instead of failing on index[0].
            return counts.index[0] if len(counts) else np.nan
        return column.median()

    
feature_columns_to_use = list(train_df.columns)
print(feature_columns_to_use)

# when adding new features above, the labels won't be at the last column
# so I drop it by name

feature_columns_to_use = [f for f in feature_columns_to_use if f != 'label']
print(feature_columns_to_use)


numeric_cols = ['59', '60']
# Everything that is not numeric is treated as categorical. Selecting by name
# (rather than by list position) keeps this correct if columns are added,
# removed or reordered.
nonnumeric_columns = [f for f in feature_columns_to_use if f not in numeric_cols]


# Join the features from train and test together before imputing missing values,
# in case their distribution is slightly different
# We'll impute missing values using the median for numeric columns and the most
# common value for string columns.
# pd.concat stacks the train rows first and the test rows after them, exactly as
# DataFrame.append did; append itself no longer exists in pandas 2.x.
big_X = pd.concat([train_df[feature_columns_to_use],
                   test_df[feature_columns_to_use]])
big_X_imputed = DataFrameImputer().fit_transform(big_X)


['2', '5', '7', '8', '9', '11', '14', '16', '17', '18', '20', '23', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '62', '63', '64', 'label', '65', '66']
['2', '5', '7', '8', '9', '11', '14', '16', '17', '18', '20', '23', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '62', '63', '64', 'label', '65', '66']


In [33]:
# To handle categorical features, we need to change
# them to columns of integer values.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# The encoder is re-fitted for every column, so each column gets its own
# independent integer coding.
for feature in nonnumeric_columns:
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])


# Split the stacked frame back into its two parts by row position: the train
# rows come first, the test rows follow. Explicit copies are taken so that the
# new column added below is written to independent frames rather than to
# slices of the stacked frame.
n_train_rows = train_df.shape[0]
big_X_imputed_test = big_X_imputed.iloc[n_train_rows:].copy()
big_X_imputed = big_X_imputed.iloc[:n_train_rows].copy()

# feature expansion polynomial
big_X_imputed = add_polynomial_feauture(big_X_imputed)
big_X_imputed_test = add_polynomial_feauture(big_X_imputed_test)
# print(big_X_imputed)
# print(big_X_imputed)


In [23]:
# Prepare the inputs for the model
# `.values` yields the same NumPy array that DataFrame.as_matrix() returned;
# as_matrix() is deprecated and absent from pandas 1.0 and later.
train_X = big_X_imputed.values
test_X = big_X_imputed_test.values
train_y = train_df['label']

In [34]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation



# Create a random forest classifier. The fixed seed makes the fitted model and
# the cross-validation scores repeatable from run to run.
RANDOM_SEED = 42
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

# Alternative that was tried: AdaBoost wrapped around the forest.
# bdt = AdaBoostClassifier(forest,
#                          algorithm="SAMME.R", 
#                          n_estimators=200,
#                          learning_rate=1)

n = train_X.shape[0]
start_time = timeit.default_timer()


# forest = forest.fit(train_X[:int(n*9/10)], train_y[:int(n*9/10)])
# Fit on the full training set; this fitted model is the one used for the
# final predictions.
forest = forest.fit(train_X, train_y)
# bdt = bdt.fit(train_X, train_y)
# scores = cross_validation.cross_val_score(bdt, train_X, train_y, cv=3)

# 3-fold cross-validation, used only to estimate accuracy.
scores = cross_validation.cross_val_score(forest, train_X, train_y, cv=3)

# The elapsed time covers both the full fit and the cross-validation above.
elapsed = timeit.default_timer() - start_time
print('Classifier Trained!')
print('Time: ' + str(elapsed))

print(scores)
print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))


Classifier Trained!
Time: 86.44309179000265
[ 0.94077578  0.94597791  0.94082028]
Accuracy: 0.9425 (+/- 0.005)


<h2>Records</h2>
<h4>Only Random Forest</h4>

**n = 25**<br>
[ 0.93912015, 0.94198065, 0.93838403] -- Accuracy: 0.9398 (+/- 0.003)

**n = 50**<br>
[ 0.94157994, 0.94564677, 0.9409622 ] -- Accuracy: 0.9427 (+/- 0.004)

**n = 80**<br>
[ 0.94162725  0.94541025  0.94103316] -- Accuracy: 0.9427 (+/- 0.004)

**n = 200** <br>
[ 0.94077578, 0.94517373, 0.9409149 ] -- Accuracy: 0.94 (+/- 0.00)

**n = 230**<br>
[ 0.94139073, 0.94567043, 0.94108047] -- Accuracy: 0.94 (+/- 0.00)

**n = 250**<br>
[ 0.94127247  0.94609617  0.941317  ] -- Accuracy: 0.94 (+/- 0.00)

**n = 300**<br>
[ 0.94139073, 0.9464273, 0.94105681] -- Accuracy: 0.9430 (+/- 0.005)

**n = 300**<br>
[ 0.94117786  0.94604887  0.94115143] -- Accuracy: 0.9428 (+/- 0.005)


<h4>Adaboost + Random Forest </h4>

**tree = 3, n = 100**<br>
[ 0.93219016, 0.93583103, 0.93649179] -- Accuracy: 0.9348 (+/- 0.004)

**tree = 3, n = 200**<br>
[ 0.93261589, 0.93628042, 0.93384266] -- Accuracy: 0.9342 (+/- 0.003)

**tree = 3, n = 200, learning_rate = 1**<br>
[ 0.93297067, 0.93590198, 0.9345759 ] -- Accuracy: 0.9345 (+/- 0.002)

In [29]:
# Work on the test data and generate the output file here.

predictions = forest.predict(test_X)


# Ids are 1-based row numbers. The count is taken from the predictions
# themselves, so the frame can still be built if the number of test rows
# changes.
indx = list(range(1, len(predictions) + 1))
submission = pd.DataFrame({'Id': indx,
                           'Prediction': predictions})
submission.to_csv("submission-rf-1.csv", index=False, sep=",")