In [58]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import math

In [59]:
%%time
# Sweep over k-means cluster counts. For each n: load the matching train/test
# feature tables, grid-search a random forest over max_features with 5-fold CV,
# score the refit best estimator on the test split, fit a second forest with
# the winning max_features, and record both test accuracies in results_df.
result_columns = ['n_clusters', 'cv_accuracy', 'final_accuracy', 'best_params']
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so rows are
# collected as plain dicts and the frame is built from that list instead.
result_rows = []
results_df = pd.DataFrame(columns = result_columns)
for n in [50, 100, 200, 300, 500]:
    # load data
    print('loading data')
    train_df = pd.read_csv('cluster_dfs/train_kmeans%s.csv'%n)
    test_df = pd.read_csv('cluster_dfs/test_kmeans%s.csv'%n)
    # make X and y: columns cluster_0 .. cluster_{n-1} are the features,
    # the 'material' column is the label
    feat_cols = ['cluster_%s'%i for i in range(0, n)]
    Xtrain = train_df[feat_cols].values
    Xtest = test_df[feat_cols].values
    ytrain = list(train_df['material'])
    ytest = list(test_df['material'])
    # grid search: ten evenly spaced max_features candidates from 2 to n
    # (floored to ints). n_estimators and n_jobs each have a single value, so
    # they are not really searched but do end up in best_params_.
    print('cv grid search:')
    forest_cv = model_selection.GridSearchCV(RandomForestClassifier(),
                                          {'n_estimators':[1000],
                                            'max_features':[math.floor(i) for i in np.linspace(2, n, 10)],
                                            'n_jobs':[4]},
                                            cv = 5,
                                            scoring = 'accuracy')
    forest_cv.fit(Xtrain, ytrain)
    forest_best_params = forest_cv.best_params_
    print(forest_best_params)
    # per-candidate mean CV accuracy and twice the std across folds
    for mean, std, params in zip(forest_cv.cv_results_['mean_test_score'],
                                forest_cv.cv_results_['std_test_score'],
                                forest_cv.cv_results_['params']):
        print('  random forest cv: \taccuracy = %0.3f (+/=%0.3f) for %s'%(mean, std*2, params))
    # NOTE: despite the name, this is accuracy on the held-out test split of
    # the grid search's best estimator (refit on the full training set by
    # GridSearchCV's default refit=True), not a cross-validated score.
    forest_pred_cv = forest_cv.predict(Xtest)
    forest_cv_accuracy = metrics.accuracy_score(ytest, forest_pred_cv)
    print('  cv accuracy on test data: %s'%forest_cv_accuracy)

    # final model: a fresh forest with the same n_estimators and the selected
    # max_features. No random_state is set, so its score can differ from the
    # one above purely through forest randomness.
    forest = RandomForestClassifier(n_estimators = 1000,
                                   max_features = forest_best_params['max_features'])
    forest.fit(Xtrain, ytrain)
    forest_pred = forest.predict(Xtest)
    forest_accuracy = metrics.accuracy_score(ytest, forest_pred)
    print('final accuracy: %s'%forest_accuracy)
    # add to results df
    results = {
        'n_clusters':n,
        'cv_accuracy':forest_cv_accuracy,
        'final_accuracy':forest_accuracy,
        'best_params':forest_best_params
    }
    result_rows.append(results)
    # Rebuild after every n (only a handful of rows) so results_df already
    # holds the completed runs if this long cell is interrupted part-way.
    results_df = pd.DataFrame(result_rows, columns = result_columns)

loading data
cv grid search:
{'max_features': 23, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.405 (+/=0.103) for {'max_features': 2, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.432 (+/=0.117) for {'max_features': 7, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.430 (+/=0.105) for {'max_features': 12, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.426 (+/=0.118) for {'max_features': 18, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.438 (+/=0.126) for {'max_features': 23, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.421 (+/=0.106) for {'max_features': 28, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.432 (+/=0.094) for {'max_features': 34, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.415 (+/=0.104) for {'max_features': 39, 'n_estimators': 1000, 'n_jobs': 4}
  random forest cv: 	accuracy = 0.415 (+/=0.1

In [23]:
# Read the kmeans50 train and test CSVs (train first, then test).
train_df, test_df = map(
    pd.read_csv,
    ('cluster_dfs/train_kmeans50.csv', 'cluster_dfs/test_kmeans50.csv'),
)

In [24]:
# Number of cluster feature columns to use; the columns are named
# cluster_0 .. cluster_{n-1}.
n = 50
feat_cols = ['cluster_%s' % idx for idx in range(n)]

# Feature matrices as NumPy arrays, one row per table row.
Xtrain, Xtest = (frame[feat_cols].values for frame in (train_df, test_df))

# Labels come from the 'material' column, as plain Python lists.
ytrain, ytest = (list(frame['material']) for frame in (train_df, test_df))

In [47]:
# 5-fold, accuracy-scored grid search over a random forest. max_features takes
# ten evenly spaced values from 2 to n, floored to ints; n_estimators and
# n_jobs are each pinned to a single value.
forest_cv = model_selection.GridSearchCV(
    estimator = RandomForestClassifier(),
    param_grid = dict(
        n_estimators = [1000],
        max_features = list(map(math.floor, np.linspace(2, n, 10))),
        n_jobs = [4],
    ),
    cv = 5,
    scoring = 'accuracy',
)

In [48]:
%%time
# Run the grid search on the training data; the fitted search object is the
# cell's displayed value.
forest_cv.fit(X = Xtrain, y = ytrain)

Wall time: 1min 18s


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [49]:
# Parameter combination with the best mean cross-validated accuracy; kept in a
# variable because later cells read forest_best_params['max_features'].
forest_best_params = forest_cv.best_params_
print(forest_best_params)

{'max_features': 28, 'n_estimators': 1000, 'n_jobs': 4}


In [50]:
# Report, for every parameter combination the grid search tried, the mean CV
# accuracy and twice the standard deviation across folds.
cv_results = forest_cv.cv_results_
score_columns = ('mean_test_score', 'std_test_score', 'params')
for mean, std, params in zip(*(cv_results[col] for col in score_columns)):
    print('random forest: \taccuracy = %0.3f (+/=%0.3f) for %s'%(mean, std*2, params))

random forest: 	accuracy = 0.419 (+/=0.079) for {'max_features': 2, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.436 (+/=0.078) for {'max_features': 7, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.434 (+/=0.115) for {'max_features': 12, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.428 (+/=0.090) for {'max_features': 18, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.440 (+/=0.079) for {'max_features': 23, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.442 (+/=0.070) for {'max_features': 28, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.415 (+/=0.061) for {'max_features': 34, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.413 (+/=0.066) for {'max_features': 39, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.415 (+/=0.105) for {'max_features': 44, 'n_estimators': 1000, 'n_jobs': 4}
random forest: 	accuracy = 0.411 (+/=0.079) for {'max_features': 5

In [51]:
# Score the grid search's best estimator on the held-out test split.
# NOTE: the variable says "cv", but this is test-set accuracy, not a
# cross-validated score.
forest_pred_cv = forest_cv.predict(X = Xtest)
forest_cv_accuracy = metrics.accuracy_score(y_true = ytest, y_pred = forest_pred_cv)
print('cv accuracy on test data: %s' % (forest_cv_accuracy,))

cv accuracy on test data: 0.4059059059059059


In [54]:
%%time
# Fit a fresh 1000-tree forest using the max_features value chosen by the grid
# search, then measure its accuracy on the test split.
forest = RandomForestClassifier(
    max_features = forest_best_params['max_features'],
    n_estimators = 1000,
).fit(Xtrain, ytrain)  # fit() returns the estimator itself
forest_pred = forest.predict(Xtest)
forest_accuracy = metrics.accuracy_score(y_true = ytest, y_pred = forest_pred)
print('final accuracy: %s' % (forest_accuracy,))

final accuracy: 0.4024024024024024
Wall time: 4.31 s


In [55]:
# Summary row for the current n: both test-set accuracies plus the parameters
# selected by the grid search.
results = {
    'n_clusters':n,
    'cv_accuracy':forest_cv_accuracy,
    'final_accuracy':forest_accuracy,
    'best_params':forest_best_params
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame is the supported equivalent and works on older pandas too.
# NOTE(review): results_df must already exist in the kernel when this cell
# runs, and re-running the cell adds a duplicate row.
results_df = pd.concat([results_df, pd.DataFrame([results])], ignore_index=True)

In [60]:
# Display the accumulated results table (one row per n_clusters value).
results_df

Unnamed: 0,n_clusters,cv_accuracy,final_accuracy,best_params
0,50,0.41041,0.417417,"{'max_features': 23, 'n_estimators': 1000, 'n_..."
1,100,0.387387,0.375876,"{'max_features': 34, 'n_estimators': 1000, 'n_..."
2,200,0.348849,0.351351,"{'max_features': 2, 'n_estimators': 1000, 'n_j..."
3,300,0.331832,0.335335,"{'max_features': 2, 'n_estimators': 1000, 'n_j..."
4,500,0.348348,0.342843,"{'max_features': 57, 'n_estimators': 1000, 'n_..."
