In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV


#Load the data 
train=pd.read_csv('train.csv')

#All features, that do not require further preprocessing: 
#['Type', 'Age', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee']
feature_cols=['Type', 'Age', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee']

#Define y and X
y=train.AdoptionSpeed
X_unscaled=train[feature_cols]
#Scale all features
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X_unscaled)
X=pd.DataFrame(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

  from numpy.core.umath_tests import inner1d


In [2]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1000, stop = 1500, num = 500)]
# Number of features to consider at every split
# max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 11, num = 2)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [20,25,30,35,40,45,50,55,60,65,70]
# Minimum number of samples required at each leaf node
min_samples_leaf = [5,6]
# Method of selecting samples for training each tree
#bootstrap = [True]


In [3]:
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
               }

# Use the random grid to search for best hyperparameters
# First create the base model to tune
clf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
clf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
clf_random.fit(X_train, y_train)
clf_random.best_params_
print(clf_random.best_params_)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 26.8min finished


{'n_estimators': 1051, 'min_samples_split': 70, 'max_depth': 10, 'min_samples_leaf': 5}


In [None]:
def evaluate(model, X_test, y_test):
    y_pred= model.predict( X_test)
    score=cohen_kappa_score(y_pred, y_test, weights='quadratic')
    print('Model Performance')
    print('Score: ', score)
    return score

base_model = RandomForestClassifier(n_estimators = 100, random_state = 42)
base_model.fit(X_train, y_train)
base_score = evaluate(base_model, X_test, y_test)

best_random = clf_random.best_estimator_
random_score = evaluate(best_random, X_test, y_test)

print('Improvement of ', base_score-random_score)

#Use RandomForest
#X, y = make_classification(n_samples=1000, n_features=4, n_informative=2, n_redundant=0, random_state=0, shuffle=False)
#X, y = make_classification(random_state=0)
#clf = RandomForestClassifier(n_estimators=100, max_depth=29, random_state=1)
#clf.fit(X_train, y_train)
#y_pred=clf.predict(X_test)


#Print out the quadratic weighted kappa score
#print(cohen_kappa_score(y_pred, y_test, weights='quadratic'))