In [1]:
# add for loops

In [2]:
import pandas as pd # dataframes
import numpy as np # maths
import matplotlib.pyplot as plt # plots
import seaborn as sns # nicer plots

from sklearn.model_selection import train_test_split # splitting dataframes and subsplits
import time # model compuational expense
from numpy import array # use for getting one hot codings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # get binary outcomes
from sklearn import ensemble # models random forest and boosted tree

from sklearn.svm import SVC # support vector machines
from sklearn.preprocessing import StandardScaler  
from sklearn.linear_model import LogisticRegression 

# evaluation metrics
from sklearn.metrics import classification_report # f1 scores
from sklearn.metrics import confusion_matrix # prediction vs actual
from sklearn.metrics import accuracy_score # overall score
from sklearn.model_selection import cross_val_score # check for overfitting and model stability

sns.set(style="darkgrid") # plotting if any
# plot formatting
%matplotlib inline

In [3]:
# get dataset and take a look
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [4]:
# one hot encoding for outcome variables
data = df.Cover_Type
values = array(data)
print(values)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)
type(onehot_encoded)

[5 5 2 ... 3 3 3]
[4 4 1 ... 2 2 2]
[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


numpy.ndarray

In [5]:
# make dataframe of binary outcomes for 7 cover types
data = onehot_encoded
dataset = pd.DataFrame({'bin_type1':data[:,0],'bin_type2':data[:,1],'bin_type3':data[:,2],'bin_type4':data[:,3],
                       'bin_type5':data[:,4],'bin_type6':data[:,5],'bin_type7':data[:,6]})
dataset.head()

Unnamed: 0,bin_type1,bin_type2,bin_type3,bin_type4,bin_type5,bin_type6,bin_type7
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# combine dataframes to get binary outcome attached to cover types
# make sure they match
df = pd.concat([df, dataset], axis=1)
df.tail()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type39,Soil_Type40,Cover_Type,bin_type1,bin_type2,bin_type3,bin_type4,bin_type5,bin_type6,bin_type7
15115,15116,2607,243,23,258,7,660,170,251,214,...,0,0,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15116,15117,2603,121,19,633,195,618,249,221,91,...,0,0,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15117,15118,2492,134,25,365,117,335,250,220,83,...,0,0,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15118,15119,2487,167,28,218,101,242,229,237,119,...,0,0,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15119,15120,2475,197,34,319,78,270,189,244,164,...,0,0,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
# check value counts are correct
bin_type_list = df.iloc[:, 56:].columns.tolist()
for bin_type in bin_type_list:
    print(df[bin_type].value_counts())

0.0    12960
1.0     2160
Name: bin_type1, dtype: int64
0.0    12960
1.0     2160
Name: bin_type2, dtype: int64
0.0    12960
1.0     2160
Name: bin_type3, dtype: int64
0.0    12960
1.0     2160
Name: bin_type4, dtype: int64
0.0    12960
1.0     2160
Name: bin_type5, dtype: int64
0.0    12960
1.0     2160
Name: bin_type6, dtype: int64
0.0    12960
1.0     2160
Name: bin_type7, dtype: int64


In [8]:
df.describe()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type39,Soil_Type40,Cover_Type,bin_type1,bin_type2,bin_type3,bin_type4,bin_type5,bin_type6,bin_type7
count,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,...,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0
mean,7560.5,2749.322553,156.676653,16.501587,227.195701,51.076521,1714.023214,212.704299,218.965608,135.091997,...,0.043452,0.030357,4.0,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857
std,4364.91237,417.678187,110.085801,8.453927,210.075296,61.239406,1325.066358,30.561287,22.801966,45.895189,...,0.20388,0.171574,2.000066,0.349939,0.349939,0.349939,0.349939,0.349939,0.349939,0.349939
min,1.0,1863.0,0.0,0.0,0.0,-146.0,0.0,0.0,99.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3780.75,2376.0,65.0,10.0,67.0,5.0,764.0,196.0,207.0,106.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7560.5,2752.0,126.0,15.0,180.0,32.0,1316.0,220.0,223.0,138.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,11340.25,3104.0,261.0,22.0,330.0,79.0,2270.0,235.0,235.0,167.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,15120.0,3849.0,360.0,52.0,1343.0,554.0,6890.0,254.0,254.0,248.0,...,1.0,1.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# now split df into training and testing
# training will have validation set too
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df.Cover_Type)

In [10]:
print(df_train.Cover_Type.value_counts())
print(df_test.Cover_Type.value_counts())
# even splits

7    1728
6    1728
5    1728
4    1728
3    1728
2    1728
1    1728
Name: Cover_Type, dtype: int64
7    432
5    432
3    432
1    432
6    432
4    432
2    432
Name: Cover_Type, dtype: int64


In [11]:
df_test.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type39,Soil_Type40,Cover_Type,bin_type1,bin_type2,bin_type3,bin_type4,bin_type5,bin_type6,bin_type7
3551,3552,2311,102,25,525,17,1392,252,199,59,...,0,0,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5928,5929,2315,315,18,134,28,1500,170,223,192,...,0,0,6,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13057,13058,2276,91,16,0,0,1072,243,212,93,...,0,0,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2815,2816,2797,174,23,272,91,600,226,245,138,...,0,0,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12192,12193,2038,15,3,0,0,700,217,233,154,...,0,0,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# function to run models and get evalution metrics
def run_models_scores(model, X_train, y_train, X_test, y_test):
    print(model) # know the model params
    start_time = time.time() 
    model.fit(X_train, y_train) # fit on training
    print("--- %s seconds ---" % (time.time() - start_time))
    y_pred = model.predict(X_test) # get prediction from testset
    # get eval metrics for test set
    print(classification_report(y_test, y_pred)) 
    print(confusion_matrix(y_test, y_pred))
    return (accuracy_score(y_test, y_pred))
# gets only eval metrics
def classreport_confmatrix_score(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    return (accuracy_score(y_test, y_pred))

In [13]:
# step 1 - prep to train boosted tree on binary outcomes
# X has no info about cover_type or binary categories
X = df_train.drop(['Cover_Type', 'Id', 'bin_type1', 'bin_type2', 'bin_type3',
                  'bin_type4', 'bin_type5', 'bin_type6', 'bin_type7'], 1)

y1 = df_train['bin_type1']
y2 = df_train['bin_type2']
y3 = df_train['bin_type3']
y4 = df_train['bin_type4']
y5 = df_train['bin_type5']
y6 = df_train['bin_type6']
y7 = df_train['bin_type7']

In [14]:
X.head() # just cartographic info and soil types and wilderness areas

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
14128,2942,151,27,376,191,1928,242,227,96,837,...,0,0,0,0,0,0,0,0,0,0
4425,2869,299,9,90,-11,1670,196,237,181,1460,...,0,0,0,0,0,0,0,0,0,0
909,2928,294,6,216,21,5075,204,239,174,1744,...,0,0,0,0,0,0,0,0,0,0
5123,2827,144,18,120,24,2255,242,234,114,1181,...,0,0,0,0,0,0,0,0,0,0
10901,2559,336,14,30,12,2006,187,220,171,524,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X.shape # does not include cover type, bin types, or id

(12096, 54)

In [16]:
# ys are binary outcomes
y1

14128    0.0
4425     0.0
909      0.0
5123     0.0
10901    0.0
514      1.0
6952     0.0
7615     0.0
14101    0.0
5125     0.0
1696     0.0
9821     1.0
3116     0.0
11820    0.0
2709     0.0
2888     0.0
8385     1.0
13325    0.0
13483    0.0
12230    0.0
2384     0.0
14565    0.0
2001     0.0
14521    1.0
12131    0.0
2395     0.0
7382     1.0
12274    0.0
8485     0.0
14329    1.0
        ... 
2096     0.0
3310     0.0
8282     1.0
12604    0.0
1634     0.0
320      0.0
10375    0.0
10923    0.0
4711     0.0
14240    0.0
4107     0.0
7041     1.0
11064    0.0
14248    0.0
7306     0.0
8044     0.0
5667     0.0
7494     0.0
7709     0.0
435      0.0
997      0.0
11369    1.0
14108    1.0
6179     0.0
13936    0.0
5297     0.0
12296    0.0
8010     0.0
7557     0.0
11291    0.0
Name: bin_type1, Length: 12096, dtype: float64

In [17]:
# step 2
# build first stack of stacked model

params3_5_6 = {'n_estimators': 50,
          'max_depth': 8,
          'loss': 'exponential',
          'max_features': 3,
          }

params4 = {'n_estimators': 100,
          'max_depth': 2,
          'loss': 'exponential',
          'max_features': 2,
          }
params7 = {'n_estimators': 100,
          'max_depth': 3,
          'loss': 'exponential',
          'max_features': 2,
          }

clf1 = ensemble.RandomForestClassifier(criterion='gini', n_estimators=100, n_jobs=-1, max_depth=12)
clf2 = ensemble.RandomForestClassifier(criterion='gini', n_estimators=50, n_jobs=-1, max_depth=16)
clf3 = ensemble.GradientBoostingClassifier(**params3_5_6)
clf4 = ensemble.GradientBoostingClassifier(**params4)
clf5 = ensemble.GradientBoostingClassifier(**params3_5_6)
clf6 = ensemble.GradientBoostingClassifier(**params3_5_6)
clf7 = ensemble.GradientBoostingClassifier(**params7)

model1 = clf1.fit(X, y1)
model2 = clf2.fit(X, y2)
model3 = clf3.fit(X, y3)
model4 = clf4.fit(X, y4)
model5 = clf5.fit(X, y5)
model6 = clf6.fit(X, y6)
model7 = clf7.fit(X, y7)

In [18]:
# use function from above to get scores
binary_model_list = [model1, model2, model3, model4, model5, model6, model7]
ys = [y1, y2, y3, y4, y5, y6, y7]
for model_i, y_i in zip(binary_model_list, ys):
    print(classreport_confmatrix_score(model_i, X, y_i))
# goal was to achieve at least 0.80 for f1-score of class 1 for each of 7 binary models

              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98     10368
         1.0       0.99      0.76      0.86      1728

   micro avg       0.96      0.96      0.96     12096
   macro avg       0.97      0.88      0.92     12096
weighted avg       0.97      0.96      0.96     12096

[[10350    18]
 [  407  1321]]
0.964864417989418
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98     10368
         1.0       1.00      0.77      0.87      1728

   micro avg       0.97      0.97      0.97     12096
   macro avg       0.98      0.89      0.93     12096
weighted avg       0.97      0.97      0.97     12096

[[10367     1]
 [  397  1331]]
0.9670965608465608
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97     10368
         1.0       0.95      0.70      0.80      1728

   micro avg       0.95      0.95      0.95     12096
   macro avg       0.95      

In [19]:
# step 3 get predictions and add to df_train from binary classifiers 
for i, model in zip(range(1, 8), binary_model_list):
        df_train[f'predict{i}'] = model.predict(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is tryin

In [20]:
df_train.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,bin_type5,bin_type6,bin_type7,predict1,predict2,predict3,predict4,predict5,predict6,predict7
14128,14129,2942,151,27,376,191,1928,242,227,96,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4425,4426,2869,299,9,90,-11,1670,196,237,181,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
909,910,2928,294,6,216,21,5075,204,239,174,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5123,5124,2827,144,18,120,24,2255,242,234,114,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10901,10902,2559,336,14,30,12,2006,187,220,171,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
# final step in training
# look at random forest
X = df_train.drop(['Cover_Type', 'Id', 'bin_type1', 'bin_type2', 'bin_type3',
                  'bin_type4', 'bin_type5', 'bin_type6', 'bin_type7'], 1)
y = df_train['Cover_Type']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

rfc = ensemble.RandomForestClassifier(criterion='gini', 
                                      n_estimators=250,
                                      n_jobs=-1,
                                      max_features='auto',
                                      max_depth=None,
                                     )

# using function from beginning to run model and get scores
print(run_models_scores(rfc, X_train, y_train, X_val, y_val))
print()
print('training')
scores = cross_val_score(rfc, X_train, y_train, cv=10)
print(scores, scores.mean(), scores.std())
print('validating')
scores = cross_val_score(rfc, X_val, y_val, cv=10)
print(scores, scores.mean(), scores.std())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
--- 1.442889928817749 seconds ---
              precision    recall  f1-score   support

           1       0.95      0.89      0.92       345
           2       0.93      0.90      0.92       345
           3       0.88      0.79      0.83       346
           4       0.89      0.98      0.93       346
           5       0.95      0.95      0.95       346
           6       0.85      0.88      0.86       346
           7       0.94      0.99      0.97       346

   micro avg       0.91      0.91      0.91      2420
   macro avg       0.91      0.91      0.91      2420
w

In [22]:
# see if boosted tree can to better than random forest as second stack
params_second = {'n_estimators': 100,
          'max_depth': None,
          'max_features': 'auto',
          }

boosted_tree = ensemble.GradientBoostingClassifier(**params_second)
print(run_models_scores(boosted_tree, X_train, y_train, X_val, y_val))


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=None,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
--- 70.2033748626709 seconds ---
              precision    recall  f1-score   support

           1       0.93      0.92      0.93       345
           2       0.90      0.90      0.90       345
           3       0.85      0.80      0.82       346
           4       0.93      0.96      0.94       346
           5       0.94      0.93      0.93       346
           6       0.83      0.86      0.84       346
           7       0.96      0.97     

In [23]:
# can we obtain good results on test set?

# step 1 - prep to get predictions

X = df_test.drop(['Cover_Type', 'Id', 'bin_type1', 'bin_type2', 'bin_type3',
                  'bin_type4', 'bin_type5', 'bin_type6', 'bin_type7'], 1)

# skip to step 3 - get predictions into df_test

for i, model in zip(range(1, 8), binary_model_list):
        df_test[f'predict{i}'] = model.predict(X)

# last last step if it works
# look at random forest

Xrf = df_test.drop(['Cover_Type', 'Id', 'bin_type1', 'bin_type2', 'bin_type3',
                  'bin_type4', 'bin_type5', 'bin_type6', 'bin_type7'], 1)
yrf = df_test['Cover_Type']

# same rf model used on training set
print(rfc)
print('testing!')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
testing!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [24]:
classreport_confmatrix_score(rfc, Xrf, yrf)

              precision    recall  f1-score   support

           1       0.72      0.71      0.71       432
           2       0.71      0.58      0.64       432
           3       0.81      0.73      0.77       432
           4       0.90      0.99      0.94       432
           5       0.86      0.89      0.87       432
           6       0.79      0.83      0.81       432
           7       0.89      0.99      0.93       432

   micro avg       0.82      0.82      0.82      3024
   macro avg       0.81      0.82      0.81      3024
weighted avg       0.81      0.82      0.81      3024

[[307  67   1   0  10   0  47]
 [107 249  14   0  37  18   7]
 [  0   6 317  34   9  66   0]
 [  0   0   3 427   0   2   0]
 [  7  23  11   0 384   7   0]
 [  1   4  45  16   6 360   0]
 [  6   0   0   0   0   0 426]]


0.8167989417989417

In [25]:
# use boosted tree
classreport_confmatrix_score(boosted_tree, Xrf, yrf)

              precision    recall  f1-score   support

           1       0.70      0.71      0.70       432
           2       0.70      0.58      0.63       432
           3       0.78      0.73      0.75       432
           4       0.91      0.96      0.94       432
           5       0.85      0.85      0.85       432
           6       0.78      0.83      0.80       432
           7       0.88      0.96      0.92       432

   micro avg       0.80      0.80      0.80      3024
   macro avg       0.80      0.80      0.80      3024
weighted avg       0.80      0.80      0.80      3024

[[305  67   1   0   9   0  50]
 [106 250  12   0  39  18   7]
 [  0   8 316  31   8  69   0]
 [  0   0  12 415   1   4   0]
 [ 10  29  13   0 368  12   0]
 [  0   5  52   9   8 358   0]
 [ 17   0   0   0   1   0 414]]


0.8022486772486772