In [6]:
"""
Quora pairs Kaggle competition
Third Quora Submission
@author: Luis Duque
"""

import os
import time
import pandas as pd
import numpy as np
import csv
import re, math
from string import punctuation
from difflib import SequenceMatcher
from collections import Counter
from matplotlib import pyplot as plt
from operator import xor
from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn import tree
from sklearn import metrics
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

In [7]:
###############################################################################################################
###   Tuning a Random Forest Classifier: select parameters and features 
def TuningParametersRandomForest(X_train, Y_train, sample_leaf_options, n_estimators_options, max_features_options):
    """Exhaustively try random-forest settings and return the best fitted model.

    One ``RandomForestClassifier`` is fitted on ``X_train`` / ``Y_train`` for
    every combination drawn from the three option collections. Candidates are
    compared on their out-of-bag error, ``1 - oob_score_``; the lowest error
    wins and the earliest candidate is kept on ties.

    Parameters
    ----------
    X_train, Y_train
        Training features and target, handed to ``fit`` unchanged.
    sample_leaf_options
        Values to try for ``min_samples_leaf``.
    n_estimators_options
        Values to try for ``n_estimators``.
    max_features_options
        Values to try for ``max_features``.

    Returns
    -------
    The fitted classifier with the lowest out-of-bag error.

    Raises
    ------
    ValueError
        If the option collections produce no combination at all, so there is
        no model to return.
    """
    best_model = None
    best_score = None
    for leaf_size in sample_leaf_options:
        for num_estimators in n_estimators_options:
            for maxfeatures in max_features_options:
                model = RandomForestClassifier(warm_start=True, oob_score = True, n_estimators = num_estimators, min_samples_leaf = leaf_size, max_features= maxfeatures, n_jobs = -1,random_state =50)
                model.fit(X_train, Y_train)
                oob_error = 1 - model.oob_score_

                # The first candidate always becomes the incumbent, so a model
                # is returned even when no candidate has an error below 1.
                if best_model is None or oob_error < best_score:
                    best_score = oob_error
                    best_model = model

                # Single pre-formatted argument: same text as before, and the
                # call parses under both Python 2 and Python 3.
                print("samples_leaf= %s  estimators= %s  error= %s  max_features= %s  best_score = %s"
                      % (leaf_size, num_estimators, oob_error, maxfeatures, best_score))

    if best_model is None:
        raise ValueError("TuningParametersRandomForest needs at least one value in each options list")

    print("So far our error is  %s" % (1 - best_model.oob_score_,))
    return best_model
    
    
###############################################################################################################
#### Feature selection for a Random Forest Classifier 
def BackwardSubsetSelection(X, y):
    ## Only for a piece of the training set (temporal)
    X_train = X
    y_train = y
    best_model = RandomForestClassifier(warm_start=True, oob_score = True, n_jobs = -1,random_state =50)
    best_model.fit(X_train, y_train)
    currenterror = 1 - best_model.oob_score_ 
    print "Error we want to beat = ", currenterror
    currentfeatures = list(X_train)## get the features from the data frame
    removed_features = 0
    while True:
        newfeatures, newerror = RemoveOneFeature( X_train, y_train, currentfeatures, currenterror)
        if(len(newfeatures)< len(currentfeatures)):
            currenterror = newerror
            currentfeatures = newfeatures
            removed_features = removed_features +1
        else:
            break
    print "We removed the features: ", np.setdiff1d(list(X), currentfeatures) 
    print "Our error is now", newerror
    return currentfeatures

 
def RemoveOneFeature(dftrain, target, currentfeatures, error):
    """Drop the first feature whose removal lowers the out-of-bag error.

    Features are tried in the order given. For each one, a default random
    forest is fitted on ``dftrain`` restricted to the remaining features and
    its out-of-bag error (``1 - oob_score_``) is compared with ``error``.
    The search is greedy: it returns at the first improvement rather than
    looking for the best one.

    Parameters
    ----------
    dftrain
        Training data, indexable by a list of feature names.
    target
        Training target passed to ``fit``.
    currentfeatures
        Feature names currently in use.
    error
        Out-of-bag error that a reduced feature set has to beat.

    Returns
    -------
    (features, error)
        The reduced feature list and its error when a removal helped;
        otherwise ``currentfeatures`` and ``error`` unchanged.
    """
    # Removing the only remaining feature would leave zero columns to fit on,
    # which fails, so the last feature is never a removal candidate.
    if len(currentfeatures) <= 1:
        return currentfeatures, error

    for feature in currentfeatures:
        newfeatures = list(currentfeatures)
        newfeatures.remove(feature)

        ## We now make a Classification with one feature less
        newmodel = RandomForestClassifier(warm_start=True, oob_score = True, n_jobs = -1,random_state =50)
        X = dftrain[newfeatures]
        Y = target
        newmodel.fit(X, Y)
        newerror = 1 - newmodel.oob_score_ ## Find the out-of-bag error
        # Single pre-formatted argument: same text as before, and the call
        # parses under both Python 2 and Python 3.
        print("Current features  %s , removing feature  %s would give error= %s"
              % (len(currentfeatures), feature, newerror))
        if newerror < error:
            print("Removed: %s" % (feature,))
            return newfeatures, newerror

    return currentfeatures, error
    
    
    
def Submission2(X, Y, X_test, filename):
    """Grid-search a random forest, write test predictions to CSV and report validation error.

    Steps:
      1. Hold out 20% of ``X`` / ``Y`` as a validation set (``random_state=1``).
      2. Run a 5-fold ``GridSearchCV`` over ``n_estimators`` and
         ``min_samples_leaf`` on the remaining 80%.
      3. Predict ``X_test`` and write the labels to ``filename`` as a single
         ``is_duplicate`` column plus the frame's default index.
      4. Print the chosen hyper-parameters, the validation confusion matrix
         and the validation misclassification rate.

    Parameters
    ----------
    X, Y
        Labelled features and target.
    X_test
        Unlabelled features to predict.
    filename
        Destination path of the submission CSV.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to ``filename``.
    """
    X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.2, random_state=1)

    clf = RandomForestClassifier(n_jobs = -1,random_state =50) #Initialize with whatever parameters you want to

    # Parameters that will be used for tuning the random forest
    param_grid = {
                 'n_estimators': [150, 200, 300, 400,500],
                 'min_samples_leaf' : [3 ,5, 10, 15]
    }

    Gmodel = GridSearchCV(clf, param_grid, cv=5)
    Gmodel.fit(X_train, Y_train)

    Y_test = Gmodel.predict(X_test)
    submissionDf = pd.DataFrame({'is_duplicate': Y_test})
    # NOTE(review): the index column is written without a header label;
    # confirm whether the consumer of this file expects a named id column.
    submissionDf.to_csv(filename, header = True)

    best_model = Gmodel.best_estimator_

    # Each print takes one pre-formatted argument: same text as before, and
    # the calls parse under both Python 2 and Python 3.
    print("\n Best model found has n_estimators =  %s  min_samples_leaf =  %s max_features= %s"
          % (best_model.n_estimators, best_model.min_samples_leaf, best_model.max_features))
    print("\n Confussion Matrix for the validation set= ")
    Y_predict = Gmodel.predict(X_validation)
    CM = confusion_matrix(list(Y_validation), list(Y_predict))
    print(CM)

    # Misclassified = everything off the diagonal. Using sum - trace gives the
    # same value as CM[0,1] + CM[1,0] for a 2x2 matrix and does not fail when
    # the matrix has another size (e.g. one class only in the validation set).
    total = CM.sum()
    misclassified = total - np.trace(CM)
    print("\n Classification error rate in the validation data set %s"
          % (float(misclassified) / total,))

    return submissionDf

In [8]:
#################################################################################################################
###############################             MAIN                 ################################################
#################################################################################################################

In [9]:
## Loading My features (Must run DictinaryCreation.ipynb and FeatureEngineering.ipynb before)
# DataFrame.from_csv is deprecated; read_csv with index_col=0 and
# parse_dates=True is its documented equivalent (same defaults).
Mtrain = pd.read_csv("Mtrain.csv", index_col=0, parse_dates=True)
Mtest = pd.read_csv("Mtest.csv", index_col=0, parse_dates=True)

## Load Abhisheks features
Atrain = pd.read_csv("Abhitrain.csv", index_col=0, parse_dates=True)
Atest = pd.read_csv("Abhitest.csv", index_col=0, parse_dates=True)

## Loading the target
Ytrain = pd.read_csv("./data/train.csv", index_col=0, parse_dates=True)["is_duplicate"]

In [10]:
## SECOND EXPERIMENT: My features and Abhisheks variables together
# reset_index() turns each frame's index into a regular column and gives all
# four frames a fresh positional index, so the column-wise concat below lines
# rows up by position.
# NOTE(review): both halves therefore contribute their former index as a
# feature column, and re-running this cell resets the index again -- confirm
# this is intended.
Atrain, Mtrain = Atrain.reset_index(), Mtrain.reset_index()
Atest, Mtest = Atest.reset_index(), Mtest.reset_index()

## Join both feature sets side by side, then map +/-inf to NaN and NaN to 0
Xtrain = (pd.concat([Atrain, Mtrain], axis=1)
            .replace([np.inf, -np.inf], np.nan)
            .replace([np.nan], 0))
Xtest = (pd.concat([Atest, Mtest], axis=1)
           .replace([np.inf, -np.inf], np.nan)
           .replace([np.nan], 0))

## Optional steps, currently disabled
## Feature selection
#features = BackwardSubsetSelection(Xtrain, Ytrain)

## Tuning parameters (the three option lists are not defined in this notebook)
#best_model = TuningParametersRandomForest(Xtrain, Ytrain, sample_leaf_options, n_estimators_options, max_features_options)

## Generate submission
Submissiondf = Submission2(Xtrain, Ytrain, Xtest, "SubmissionAM.csv")


 Best model found has n_estimators =  500  min_samples_leaf =  3 max_features= auto

 Confussion Matrix for the validation set= 
[[43797  7282]
 [ 6179 23600]]

 Classification error rate in the validation data set 0.166477033812


In [11]:
## FIRST EXPERIMENT: Only Abhisheks features
# Uses Atrain / Atest in whatever state the kernel currently holds them.
# Clean-up is the same two-step mapping: +/-inf -> NaN, then NaN -> 0.
Xtrain = Atrain.replace([np.inf, -np.inf], np.nan).replace([np.nan], 0)
Xtest = Atest.replace([np.inf, -np.inf], np.nan).replace([np.nan], 0)

## Optional steps, currently disabled
## Feature selection
#features = BackwardSubsetSelection(Xtrain, Ytrain)

## Tuning parameters (the three option lists are not defined in this notebook)
#best_model = TuningParametersRandomForest(Xtrain, Ytrain, sample_leaf_options, n_estimators_options, max_features_options)

## Generate submission
Submissiondf = Submission2(Xtrain, Ytrain, Xtest, "SubmissionA.csv")



 Best model found has n_estimators =  500  min_samples_leaf =  3 max_features= auto

 Confussion Matrix for the validation set= 
[[40049 11030]
 [ 7845 21934]]

 Classification error rate in the validation data set 0.233433921195
