In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler, Imputer, OneHotEncoder, LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
import numpy as np
import seaborn as sns
import operator
import math
import tensorflow as tf

%matplotlib inline



In [52]:
# Import
test = pd.read_csv('data/test.csv', sep=';', na_values='(MISSING)')
test['VARIABLE_CIBLE'] = 'UNKNOWN'
test['index_origin'] = test.index.tolist()

train = pd.read_csv('data/train.csv', sep=';', na_values='(MISSING)')
train['index_origin'] = -1

piv_train = train.shape[0]

# Creating a DataFrame with train+test data
df_all = pd.concat((train, test), axis=0, ignore_index=True)

In [53]:
df_all0 = df_all[df_all.SOURCE_CITED_AGE == 'IMPUT']
df_all1 = df_all[df_all.SOURCE_CITED_AGE == 'CALC']

In [61]:
class Preprocessings():
    
    
    def __init__(self, date_columns=None, cols_toDrop=None):
        self.dateCols = date_columns
        self.colsToDrop = cols_toDrop
        
        self.numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    
        
        
    def datetime_processings(self, df, format=None):
        # converting string dates in dateTime format :
        df[self.dateCols] = df[self.dateCols].apply(lambda col : pd.to_datetime(col, format=format))

        # PRIORITY_MONTH = BEGIN_MONTH :
        df = df.drop(['PRIORITY_MONTH'], axis=1)

        # creating features of duration between dates :
        df['filing-begin'] = (df.FILING_MONTH - df.BEGIN_MONTH).dt.days
        df['pub-filing'] = (df.PUBLICATION_MONTH - df.FILING_MONTH).dt.days
        df['pub_year'] = df.PUBLICATION_MONTH.dt.year
        df = df.drop(['FILING_MONTH', 'PUBLICATION_MONTH','BEGIN_MONTH'], axis=1)
        
        df = df.drop('cited_nmiss', axis=1)
        
        return df
    
    
    def cat_to_codes(self, df):
        """
        converting categorical data into numerical codes :
        NaN's will be replaced by -1
        And replacing -1 by the meadian value of each categorical column
        """
        
        self.non_num_cols = df.select_dtypes(exclude=self.numerics).columns.difference(self.dateCols)

        df[self.non_num_cols] = df[self.non_num_cols].apply(lambda col : col.astype('category').cat.codes)
        
        df[self.non_num_cols] = df[self.non_num_cols].replace(-1, df[self.non_num_cols].median())
                
        return df
    
    
def re_split(df, target, split_value=None):
    """
    When eval data set is imported, you need to create a column 'target' and filling it with
    a value that is not in the train data set. By doing so, after cat_to_codes the eval set is
    identified by the largest value in categorical 'target'
    """
    if split_value == None:
        unknown_target = df[target].unique().max()
    else :
        unknown_target = split_value
        
    df_train = df[df[target] != unknown_target]
    df_eval = df[df[target] == unknown_target]
    df_eval = df_eval.drop(target, axis=1)
        
    return df_train, df_eval
    


In [55]:
def value_counts(df):
    for col in df.columns:
        print(col)
        vals = df[col].value_counts(normalize=True, dropna=False).reset_index()
        print(vals)
        print()
        print(vals[vals['index'].isnull()])
        print('-------------------------------------')

In [35]:
class Used_features():
    
    def __init__(self, target):
        self.target = target
        
    def fit_transform(self, df):
        self.columns_Y = [col for col in df.columns if len(df[col].unique())>=2]
        self.columns_X = self.columns_Y.copy()
        self.columns_X.append(self.target) 
        return df[self.columns_Y]
    
    def transform(self, df):
        return df[self.columns_X]

In [56]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
date_columns = ['PRIORITY_MONTH', 'FILING_MONTH', 'PUBLICATION_MONTH', 'BEGIN_MONTH']

In [57]:
preproc = Preprocessings(date_columns=date_columns)

df0 = preproc.datetime_processings(df_all0, format='%m/%Y')
df0 = preproc.cat_to_codes(df0)
df0 = df0.fillna(df0.median())

preproc = Preprocessings(date_columns=date_columns)

df1 = preproc.datetime_processings(df_all1, format='%m/%Y')
df1 = preproc.cat_to_codes(df1)
df1 = df1.fillna(df1.median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [72]:
df0_train, df0_eval = re_split(df0, 'VARIABLE_CIBLE')
df1_train, df1_eval = re_split(df1, 'VARIABLE_CIBLE')

In [73]:
keep_feat = Used_features('VARIABLE_CIBLE')

print(df0_eval.shape)
df0_eval = keep_feat.fit_transform(df0_eval)
df0_train = keep_feat.transform(df0_train)
print(df0_eval.shape)


print(df1_eval.shape)
df1_eval = keep_feat.fit_transform(df1_eval)
df1_train = keep_feat.transform(df1_train)
print(df1_eval.shape)



(84092, 48)
(84092, 36)
(45623, 48)
(45623, 45)


# Benchmark model

In [71]:
from sklearn.ensemble import RandomForestClassifier

In [74]:
features0 = df0_eval.columns.tolist()
features0.remove('index_origin')
features1 = df1_eval.columns.tolist()
features1.remove('index_origin')

In [None]:
params = {'n_estimators':np.arange(5,1500,100),'max_depth':np.arange(1,110,10)}

est = RandomForestClassifier(n_jobs=6)

clf = GridSearchCV(estimator=est, param_grid=params, verbose=3)

clf.fit(df0_train[features0], df0_train['VARIABLE_CIBLE'])

Fitting 3 folds for each of 165 candidates, totalling 495 fits
[CV] n_estimators=5, max_depth=1 .....................................
[CV] ............ n_estimators=5, max_depth=1, score=0.568947 -   0.3s
[CV] n_estimators=5, max_depth=1 .....................................
[CV] ............ n_estimators=5, max_depth=1, score=0.568947 -   0.3s
[CV] n_estimators=5, max_depth=1 .....................................
[CV] ............ n_estimators=5, max_depth=1, score=0.568949 -   0.3s
[CV] n_estimators=105, max_depth=1 ...................................
[CV] .......... n_estimators=105, max_depth=1, score=0.568947 -   1.2s
[CV] n_estimators=105, max_depth=1 ...................................
[CV] .......... n_estimators=105, max_depth=1, score=0.568947 -   1.2s
[CV] n_estimators=105, max_depth=1 ...................................
[CV] .......... n_estimators=105, max_depth=1, score=0.568949 -   1.1s
[CV] n_estimators=205, max_depth=1 ...................................
[CV] .........

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done 127 tasks       | elapsed: 50.0min



[CV] n_estimators=1205, max_depth=21 .................................
[CV] ........ n_estimators=1205, max_depth=21, score=0.662439 - 1.3min
[CV] n_estimators=1205, max_depth=21 .................................
[CV] ........ n_estimators=1205, max_depth=21, score=0.658527 - 1.3min
[CV] n_estimators=1305, max_depth=21 .................................
[CV] ........ n_estimators=1305, max_depth=21, score=0.660918 - 1.4min
[CV] n_estimators=1305, max_depth=21 .................................
[CV] ........ n_estimators=1305, max_depth=21, score=0.662492 - 1.3min
[CV] n_estimators=1305, max_depth=21 .................................
[CV] ........ n_estimators=1305, max_depth=21, score=0.659367 - 1.4min
[CV] n_estimators=1405, max_depth=21 .................................
[CV] ........ n_estimators=1405, max_depth=21, score=0.660525 - 1.5min
[CV] n_estimators=1405, max_depth=21 .................................
[CV] ........ n_estimators=1405, max_depth=21, score=0.662582 - 1.5min
[CV] 