In [72]:
from pandas import read_csv, get_dummies
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report, confusion_matrix

df = read_csv("./data/sample.csv")
df.tail()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,origin,Labels
380,18.0,250.0,88,3021,16.5,US,15
381,27.0,151.0,90,2950,17.3,US,10
382,29.5,98.0,68,2135,16.6,Asia,10
383,17.5,250.0,110,3520,16.4,US,15
384,25.1,140.0,88,2720,15.4,US,10


In [73]:
X = df.drop("Labels", axis=1)
y = df.Labels

In [74]:
#replace origin with label encoder
label_encoder = LabelEncoder()
x_origin_encoded = label_encoder.fit_transform(y)
x_origin_encoded

array([1, 2, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0, 1, 1, 2,
       2, 0, 0, 0, 0, 1, 0, 1, 2, 0, 1, 0, 0, 2, 2, 0, 0, 0, 0, 2, 2, 0,
       0, 0, 2, 0, 1, 2, 2, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 2,
       2, 2, 0, 2, 0, 2, 2, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 2, 2, 0,
       2, 0, 2, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 2,
       0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 1, 0, 1, 2, 0, 0, 2, 2,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 1, 0, 2, 2, 0, 2, 0, 0, 2,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 0, 2, 1, 2, 2, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 2, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2, 1, 2, 0, 2,
       0, 0, 2, 1, 0, 1, 1, 1, 1, 0, 1, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 1,
       2, 0, 0, 2, 1, 0, 2, 2, 0, 2, 0, 0, 1, 2, 2, 0, 2, 1, 0, 2, 2, 2,
       1, 2, 2, 2, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 2,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 2, 1, 0, 0, 0, 2,

In [75]:
#replace origin with one hot encoder
onehot_encoder = OneHotEncoder(sparse=False)
x_origin_encoded = x_origin_encoded.reshape(len(x_origin_encoded), 1)
x_origin_onehot_encoded = onehot_encoder.fit_transform(x_origin_encoded)
x_origin_onehot_encoded

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [76]:
'''
    or you can use using pandas
    drop first : True => drop one column since one hot encoder can be dropped 1 col since the value is always predictable
'''

X = get_dummies(X, columns=["origin"], drop_first=True)
X.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,0,1
1,9.0,304.0,193,4732,18.5,0,1
2,36.1,91.0,60,1800,16.4,0,0
3,18.5,250.0,98,3525,19.0,0,1
4,34.3,97.0,78,2188,15.8,1,0


In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((308, 7), (308,), (77, 7), (77,))

In [78]:
pipeline = Pipeline([
    ("RF", RandomForestClassifier())
])
#class weight based on the proportions of number of samples
params = {
    "RF__n_estimators" : [10, 20, 40, 60],
    "RF__max_features" : [0.1, 0.5, 0.8],
    "RF__min_samples_leaf" : [5, 10],
    "RF__class_weight" : ["balanced"]
}

#scoring f1 weighted is for multiclass

RF = GridSearchCV(pipeline,param_grid=params,cv=3, scoring="f1_weighted")
RF.fit(X_train,y_train)

RF.best_params_

{'RF__class_weight': 'balanced',
 'RF__max_features': 0.1,
 'RF__min_samples_leaf': 5,
 'RF__n_estimators': 40}

In [79]:
RF.grid_scores_



[mean: 0.93570, std: 0.01692, params: {'RF__class_weight': 'balanced', 'RF__max_features': 0.1, 'RF__min_samples_leaf': 5, 'RF__n_estimators': 10},
 mean: 0.96112, std: 0.02899, params: {'RF__class_weight': 'balanced', 'RF__max_features': 0.1, 'RF__min_samples_leaf': 5, 'RF__n_estimators': 20},
 mean: 0.97719, std: 0.01239, params: {'RF__class_weight': 'balanced', 'RF__max_features': 0.1, 'RF__min_samples_leaf': 5, 'RF__n_estimators': 40},
 mean: 0.96771, std: 0.02553, params: {'RF__class_weight': 'balanced', 'RF__max_features': 0.1, 'RF__min_samples_leaf': 5, 'RF__n_estimators': 60},
 mean: 0.92991, std: 0.00904, params: {'RF__class_weight': 'balanced', 'RF__max_features': 0.1, 'RF__min_samples_leaf': 10, 'RF__n_estimators': 10},
 mean: 0.95527, std: 0.01560, params: {'RF__class_weight': 'balanced', 'RF__max_features': 0.1, 'RF__min_samples_leaf': 10, 'RF__n_estimators': 20},
 mean: 0.96451, std: 0.01829, params: {'RF__class_weight': 'balanced', 'RF__max_features': 0.1, 'RF__min_sampl

In [80]:
RF.score(X_train, y_train), RF.score(X_test, y_test)

(0.9775521364072529, 0.9869846465591146)

In [87]:
classification_report_train = classification_report(y_train, RF.predict(X_train))
classification_report_train

'             precision    recall  f1-score   support\n\n         10       0.99      0.99      0.99       165\n         15       0.91      0.98      0.94        60\n         20       1.00      0.95      0.98        83\n\navg / total       0.98      0.98      0.98       308\n'

In [88]:
classification_report_test = classification_report(y_test, RF.predict(X_test))
classification_report_test

'             precision    recall  f1-score   support\n\n         10       1.00      1.00      1.00        34\n         15       0.96      1.00      0.98        23\n         20       1.00      0.95      0.97        20\n\navg / total       0.99      0.99      0.99        77\n'

In [85]:
#for multiclass classification the average should be None
f1_train = f1_score(y_train, RF.predict(X_train), average=None)
f1_train

array([0.99088146, 0.944     , 0.97530864])

In [84]:
#for multiclass classification the average should be None
f1_test = f1_score(y_test, RF.predict(X_test), average=None)
f1_test

array([1.        , 0.9787234 , 0.97435897])

In [92]:
confusion_matrix_train = confusion_matrix(y_train, RF.predict(X_train), labels=[10,15,20])
confusion_matrix_train

array([[163,   2,   0],
       [  1,  59,   0],
       [  0,   4,  79]], dtype=int64)

In [91]:
confusion_matrix_test = confusion_matrix(y_test, RF.predict(X_test), labels=[10,15,20])
confusion_matrix_test

array([[34,  0,  0],
       [ 0, 23,  0],
       [ 0,  1, 19]], dtype=int64)