In [138]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

%matplotlib inline

In [139]:
# date-parser
date_parser_func = lambda x: pd.datetime.strptime(x, '%d-%b-%y')

In [140]:
data = pd.read_csv('./data/Train_seers_accuracy.csv', parse_dates=['Transaction_Date', 'DOB'], date_parser=date_parser_func)
sub = pd.read_csv('./data/Sample_K7zT2mf.csv')

In [141]:
data.loc[:, 'transaction_year'] = data.Transaction_Date.dt.year
data.loc[:, 'Purchased_in_Sale'] = (data.Purchased_in_Sale=='Y').astype(int)
data.loc[:, 'Referred_Friend'] = (data.Referred_Friend=='YES').astype(int)

In [142]:
categorical_features = ['Gender', 'Lead_Source_Category', 'Sales_Executive_Category', 'Sales_Executive_ID', \
                        'Store_ID', 'Payment_Mode', 'Product_Category']

for feat in categorical_features:
    lbl = LabelEncoder()
    lbl.fit(data[feat])
    
    data[feat] = lbl.fit_transform(data[feat])
    

In [143]:
assert data.isnull().any().sum() == 0

## Split into training and test sets

In [144]:
mask = data.Transaction_Date < pd.to_datetime('2006-01-01')
X = data[mask]
X_test = data[~mask]

transaction_freq_map_train = data[mask].Client_ID.value_counts().map(lambda x: int(x > 1)).to_dict()
transaction_freq_map_test = data[~mask].Client_ID.value_counts().map(lambda x: int(x > 1)).to_dict()

y = X.Client_ID.map(lambda x: transaction_freq_map_train[x] if x in transaction_freq_map_train else 0)
y_test = X_test.Client_ID.map(lambda x: transaction_freq_map_test[x] if x in transaction_freq_map_test else 0)

In [145]:
assert X.shape[0] == y.shape[0]
assert X_test.shape[0] == y_test.shape[0]

## Features

In [163]:
features = ['Client_ID','Store_ID', 'Number_of_EMI', 'Var1', 'Var2', 'Var3',\
            'Gender', 'Referred_Friend', 'Sales_Executive_ID', 'Sales_Executive_Category', 'Lead_Source_Category',\
            'Payment_Mode', 'Product_Category', 'Transaction_Amount', 'transaction_year']

In [169]:
est = RandomForestClassifier(n_estimators=100, max_depth=8, max_features='sqrt', class_weight='auto', n_jobs=-1)
est.fit(X[features], y)

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=8, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [170]:
predsTrain = est.predict_proba(X[features])[:, 1]
predsTest = est.predict_proba(X_test[features])[:, 1]

In [171]:
assert predsTrain.shape[0] == y.shape[0]
assert predsTest.shape[0] == y_test.shape[0]

In [172]:
print 'ROC AUC score on training set %f ' %(roc_auc_score(y, predsTrain))
print 'ROC AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

ROC AUC score on training set 0.810671 
ROC AUC score on test set 0.765770 


## Submission

In [179]:
est.fit(X_test[features], y_test)

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=8, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [180]:
data.loc[:, 'Cross_Sell'] = est.predict_proba(data[features])[:, 1]

In [182]:
data.groupby(['Client_ID'])['Cross_Sell'].mean().reset_index().to_csv('./submissions/supervised_setting_rf.csv', index=False)