In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, log_loss

In [2]:
df_train = pd.read_csv('data/processed_data/processed_train.csv')
df_test = pd.read_csv('data/processed_data/processed_test.csv')

In [3]:
print(df_train.shape)
print(df_test.shape)

(95674, 1025)
(95674, 1024)


In [4]:
cols = df_train.columns
cols

Index(['VisitNumber', 'TripType', 'Weekday', 'Upc', 'ScanCount',
       'FinelineNumber', 'FLCount', 'VisitFLCount', 'NumPurchases',
       'NumReturns',
       ...
       'UPC_7874298393.0', 'UPC_7874298761.0', 'UPC_7976503128.0',
       'UPC_8265750406.0', 'UPC_83032400641.0', 'UPC_87458603436.0',
       'UPC_88828940068.0', 'UPC_9506.0', 'UPC_9518801128.0',
       'UPC_980012301.0'],
      dtype='object', length=1025)

In [5]:
features = []
features.extend(list(cols[2:]))
print(len(features))

1023


In [6]:
X = df_train.filter(features, axis=1)
y = df_train.filter(['TripType'], axis=1)
test_x = df_test.filter(features, axis=1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y.values.ravel(), test_size=0.33, random_state=42)

In [8]:
clf = RandomForestClassifier(max_depth=21, max_features=500)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
predictions = [round(value) for value in preds]
precision_recall_fscore_support(y_test, predictions, average='micro')

(0.70154879168910145, 0.70154879168910145, 0.70154879168910145, None)

In [9]:
pred = clf.predict_proba(X_test)
eval = log_loss(y_test,pred) 
eval

1.3607557620540784

In [14]:
#print(X_test.shape)
clf.fit(X, y.values.ravel())
#print(clf.n_features_)
preds = clf.predict(test_x)
predictions = [round(value) for value in preds]
p = clf.predict_proba(test_x)

In [13]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [18, 20, 25, 100],
    'max_features': ['log2', 500, 700, 'auto'],
    'n_estimators': [10, 100, 200]
}

In [14]:
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, scoring='neg_log_loss',
                          cv = 3, n_jobs = -1, verbose = 2)

In [15]:
grid_search.fit(X, y.values.ravel())

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] bootstrap=True, max_depth=18, max_features=log2, n_estimators=10 
[CV] bootstrap=True, max_depth=18, max_features=log2, n_estimators=10 
[CV] bootstrap=True, max_depth=18, max_features=log2, n_estimators=10 
[CV] bootstrap=True, max_depth=18, max_features=log2, n_estimators=100 
[CV]  bootstrap=True, max_depth=18, max_features=log2, n_estimators=10, total=   2.9s
[CV] bootstrap=True, max_depth=18, max_features=log2, n_estimators=100 
[CV]  bootstrap=True, max_depth=18, max_features=log2, n_estimators=10, total=   3.3s
[CV] bootstrap=True, max_depth=18, max_features=log2, n_estimators=100 
[CV]  bootstrap=True, max_depth=18, max_features=log2, n_estimators=10, total=   3.5s
[CV] bootstrap=True, max_depth=18, max_features=log2, n_estimators=200 
[CV]  bootstrap=True, max_depth=18, max_features=log2, n_estimators=100, total=  22.2s
[CV] bootstrap=True, max_depth=18, max_features=log2, n_estimators=200 
[CV]  bootstrap=True

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 280.4min


[CV]  bootstrap=True, max_depth=20, max_features=log2, n_estimators=10, total=   4.6s
[CV] bootstrap=True, max_depth=20, max_features=log2, n_estimators=10 
[CV]  bootstrap=True, max_depth=20, max_features=log2, n_estimators=10, total=   4.8s
[CV] bootstrap=True, max_depth=20, max_features=log2, n_estimators=10 
[CV]  bootstrap=True, max_depth=20, max_features=log2, n_estimators=10, total=   5.0s
[CV] bootstrap=True, max_depth=20, max_features=log2, n_estimators=100 
[CV]  bootstrap=True, max_depth=20, max_features=log2, n_estimators=100, total=  24.9s
[CV] bootstrap=True, max_depth=20, max_features=log2, n_estimators=100 
[CV]  bootstrap=True, max_depth=20, max_features=log2, n_estimators=100, total=  24.4s
[CV] bootstrap=True, max_depth=20, max_features=log2, n_estimators=100 
[CV]  bootstrap=True, max_depth=20, max_features=log2, n_estimators=100, total=  25.9s
[CV] bootstrap=True, max_depth=20, max_features=log2, n_estimators=200 
[CV]  bootstrap=True, max_depth=20, max_features=lo

[CV]  bootstrap=True, max_depth=25, max_features=500, n_estimators=100, total=11.0min
[CV] bootstrap=True, max_depth=25, max_features=500, n_estimators=200 
[CV]  bootstrap=True, max_depth=25, max_features=500, n_estimators=100, total=10.9min
[CV] bootstrap=True, max_depth=25, max_features=700, n_estimators=10 .
[CV]  bootstrap=True, max_depth=25, max_features=700, n_estimators=10, total= 1.5min
[CV] bootstrap=True, max_depth=25, max_features=700, n_estimators=10 .
[CV]  bootstrap=True, max_depth=25, max_features=700, n_estimators=10, total= 1.5min
[CV] bootstrap=True, max_depth=25, max_features=700, n_estimators=10 .
[CV]  bootstrap=True, max_depth=25, max_features=700, n_estimators=10, total= 1.7min
[CV] bootstrap=True, max_depth=25, max_features=700, n_estimators=100 
[CV]  bootstrap=True, max_depth=25, max_features=500, n_estimators=200, total=23.4min
[CV] bootstrap=True, max_depth=25, max_features=700, n_estimators=100 
[CV]  bootstrap=True, max_depth=25, max_features=700, n_estim

[CV]  bootstrap=True, max_depth=100, max_features=auto, n_estimators=100, total= 2.0min
[CV] bootstrap=True, max_depth=100, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, max_depth=100, max_features=auto, n_estimators=200, total= 4.0min
[CV] bootstrap=True, max_depth=100, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, max_depth=100, max_features=auto, n_estimators=200, total= 4.0min
[CV] bootstrap=True, max_depth=100, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, max_depth=100, max_features=auto, n_estimators=200, total= 4.0min
[CV]  bootstrap=True, max_depth=100, max_features=700, n_estimators=200, total=41.6min
[CV]  bootstrap=True, max_depth=100, max_features=700, n_estimators=200, total=41.4min
[CV]  bootstrap=True, max_depth=100, max_features=700, n_estimators=200, total=34.9min


[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 560.5min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [18, 20, 25, 100], 'max_features': ['log2', 500, 700, 'auto'], 'n_estimators': [10, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=2)

In [16]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_rf = grid_search.best_estimator_
print(best_params)
print(best_score)

{'bootstrap': True, 'max_depth': 25, 'max_features': 500, 'n_estimators': 200}
-1.02819846561


In [10]:
best_rf = RandomForestClassifier(bootstrap= True, max_depth= 25, max_features= 500, n_estimators= 200)
best_rf.fit(X, y.values.ravel())
#print(clf.n_features_)
preds = best_rf.predict(test_x)
predictions = [round(value) for value in preds]
p = best_rf.predict_proba(test_x)

In [11]:
targets = list(np.unique(y))
visits = df_test.VisitNumber.values #data2 is test dataframe
targets = ["TripType_"+str(int(i)) for i in targets]
out = pd.DataFrame(p,columns = targets,index=None) #p is predicted probs
out.insert(0,'VisitNumber',visits)
out.to_csv("output/output_best_rf2.csv",index=False)

<h3>XG Boost</h3>

In [19]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [22]:
#predictors = [x for x in train.columns if x not in [target, IDcol]]
#predictors = y
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
xgb1.fit(X, y.values.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=4, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8)

In [25]:
p = clf.predict_proba(test_x)
targets = list(np.unique(y))
visits = df_test.VisitNumber.values #data2 is test dataframe
targets = ["TripType_"+str(int(i)) for i in targets]
out = pd.DataFrame(p,columns = targets,index=None) #p is predicted probs
out.insert(0,'VisitNumber',visits)
out.to_csv("output_xgb.csv",index=False)

In [32]:
dtrain = xgb.DMatrix(np.asarray(X_train), label = np.asarray(y_train))
dtest = xgb.DMatrix(np.asarray(X_test), label = np.asarray(y_test))

In [30]:
print(dtrain)

<xgboost.core.DMatrix object at 0x11d1edfd0>


In [33]:
num_round = 200
param = {'objective': 'multi:softprob', 'num_class':38, 
     'eval_metric': 'mlogloss', "max_delta_step": 5}
watchlist = [(dtrain,'train'), (dtest, 'eval')]

In [34]:
bst = xgb.train(param, dtrain, num_round, watchlist, 
            early_stopping_rounds=3)


XGBoostError: b'[10:31:24] src/objective/multiclass_obj.cc:77: Check failed: label_error >= 0 && label_error < nclass SoftmaxMultiClassObj: label must be in [0, num_class), num_class=38 but found 39 in label.\n\nStack trace returned 2 entries:\n[bt] (0) 0   libxgboost.dylib                    0x000000011e5541c9 _ZN4dmlc15LogMessageFatalD1Ev + 41\n[bt] (1) 1   libstdc++.6.dylib                   0x0000000142cc1300 _ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE + 16\n'