In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.model_selection import cross_val_score

from scripts.tools import df_check_stats, game, sam_pickle_save, sam_pickle_load

np.set_printoptions(precision=5)
np.random.seed(69572)
plt.style.use('ggplot')
sns.set(color_codes=True)

%matplotlib inline

In [2]:
crazy_list = dir()

In [3]:
for each in dir():
    if each not in crazy_list:
        del each

## MultiClass

In [4]:
X, y, TEST_X = sam_pickle_load(prefix="tmp/Iteration2_final_")
df_check_stats(X, y, TEST_X)
# print('--')
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42, stratify=y)
# clf = game(X_train, X_test, y_train, y_test, algo='rf',)

LOAD PREFIX USED:  tmp/Iteration2_final_
Data Frame Shape: (59400, 43) TotColumns: 43 ObjectCols: 0
Numpy Array Size: 59400
Data Frame Shape: (14850, 43) TotColumns: 43 ObjectCols: 0


In [5]:
clf = OneVsOneClassifier(RandomForestClassifier(random_state=192))

scores = cross_val_score(clf, X, y, cv=5)

In [6]:
sum(scores) / 5

0.80223892375807926

In [7]:
clf = OneVsRestClassifier(RandomForestClassifier(random_state=192))

scores = cross_val_score(clf, X, y, cv=5)

In [8]:
sum(scores) / 5

0.79994905249969661

## Fine Tuning

In [9]:
X, y, TEST_X = sam_pickle_load(prefix="tmp/Iteration2_final_")
df_check_stats(X, y, TEST_X)
print('--')
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42, stratify=y)
# clf = game(X_train, X_test, y_train, y_test, algo='rf',)

LOAD PREFIX USED:  tmp/Iteration2_final_
Data Frame Shape: (59400, 43) TotColumns: 43 ObjectCols: 0
Numpy Array Size: 59400
Data Frame Shape: (14850, 43) TotColumns: 43 ObjectCols: 0
--


In [5]:
parameters = {
    'n_estimators': [10, 50, 100, 150, 200],
    'class_weight': ['balanced_subsample', 'balanced'],
    'criterion': ['gini', 'entropy'],
    'max_features': ['log2', 'auto', 25],
    'random_state': [192]
}

# clf_rf = RandomForestClassifier(n_estimators=150, criterion='entropy', class_weight="balanced_subsample", n_jobs=-1, random_state=192)
# 0.81346801346801345

GS_CV = RandomizedSearchCV(RandomForestClassifier(), parameters)

GS_CV.fit(X, y)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_features': ['log2', 'auto', 25], 'random_state': [192], 'n_estimators': [10, 50, 100, 150, 200], 'class_weight': ['balanced_subsample', 'balanced'], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [7]:
print(GS_CV.best_params_, GS_CV.best_score_)

cv_results = pd.DataFrame(GS_CV.cv_results_, columns=[u'mean_fit_time', u'mean_score_time', u'mean_test_score',
       u'mean_train_score', u'param_class_weight', u'param_criterion',
       u'param_max_features', u'param_n_estimators', u'params'])

{'class_weight': 'balanced', 'random_state': 192, 'n_estimators': 200, 'criterion': 'gini', 'max_features': 'auto'} 0.807053872054


In [8]:
cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_class_weight,param_criterion,param_max_features,param_n_estimators,params
0,19.282804,1.360089,0.807054,0.999949,balanced,gini,auto,200,"{'class_weight': 'balanced', 'random_state': 1..."
1,66.549496,0.916581,0.805943,0.999949,balanced_subsample,entropy,25,150,"{'class_weight': 'balanced_subsample', 'random..."
2,5.008238,0.325644,0.805152,0.999377,balanced_subsample,gini,auto,50,"{'class_weight': 'balanced_subsample', 'random..."
3,1.137362,0.064202,0.794343,0.984655,balanced_subsample,entropy,log2,10,"{'class_weight': 'balanced_subsample', 'random..."
4,0.909334,0.067301,0.794024,0.984453,balanced_subsample,gini,log2,10,"{'class_weight': 'balanced_subsample', 'random..."
5,8.789265,0.646884,0.806178,0.999933,balanced_subsample,gini,log2,100,"{'class_weight': 'balanced_subsample', 'random..."
6,86.707606,1.251047,0.806145,0.999949,balanced_subsample,entropy,25,200,"{'class_weight': 'balanced_subsample', 'random..."
7,41.695712,0.599706,0.805556,0.999924,balanced,entropy,25,100,"{'class_weight': 'balanced', 'random_state': 1..."
8,66.118829,1.253683,0.80532,0.999949,balanced,gini,25,200,"{'class_weight': 'balanced', 'random_state': 1..."
9,32.72524,0.642278,0.80463,0.999916,balanced,gini,25,100,"{'class_weight': 'balanced', 'random_state': 1..."


# Results

In [20]:
GS_CV.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_features': 'auto',
 'n_estimators': 200,
 'random_state': 192}

In [10]:
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, criterion='gini', max_features='auto', random_state=192))

scores = cross_val_score(clf, X, y, cv=5)

print(scores)

[ 0.81794  0.81357  0.81288  0.81044  0.80872]


In [11]:
sum(scores)/ 5

0.8127102008672592

# Submit

In [12]:
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, criterion='gini', max_features='auto', random_state=192))
clf = clf.fit(X, y)

In [16]:
import pickle

le = pickle.load(open('tmp/le.pkl', 'rb'))

In [17]:

# saving the index
test_ids = TEST_X.index

# predicint the values
predictions = clf.predict(TEST_X)
print(predictions.shape)

# Converting int to its respective Labels
predictions_labels = le.inverse_transform(predictions)

# setting up column name & save file
sub = pd.DataFrame(predictions_labels, columns=['status_group'])
sub.head()
sub.insert(loc=0, column='id', value=test_ids)
sub.reset_index()
sub.to_csv('submit.csv', index=False)
sub.head()

(14850,)


Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
