# Rand Forest Updated

In [10]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# from sklearn.ensemble import ExtraTreesClassifier # for future work
# from sklearn.tree import DecisionTreeClassifier # for future work
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
from sklearn.metrics import (confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, 
                            classification_report, multilabel_confusion_matrix, mean_squared_error, mean_absolute_error)
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
import numpy as np
import glob, json

In [2]:
# Load file names and labels for the processed data
data_folder_prefix = "../Seg_Featured_Data_Updated/Seg_Featured_"

with open("../data_labels.json", 'r') as json_file:
    label_dict = json.load(json_file)

data_files = glob.glob("../Seg_Featured_Data_Updated/Seg_Featured_*")
avalible_files=[]
for fl in data_files:
    avalible_files.append(fl[len("../Seg_Featured_Data_Updated\Seg_Featured_"):-len(".npy")])

print(len(avalible_files), avalible_files[:5])
# print(avalible_files[0], list(label_dict.keys())[0])


file_names = set.intersection(set(avalible_files), set(list(label_dict.keys())))
print(len(file_names))

labels = []
for fl in file_names:
    labels.append(label_dict[fl])
print(len(labels))

files = [data_folder_prefix+x+".npy" for x in file_names]


995 ['Data_20120330_01_004_0', 'Data_20120330_01_004_1', 'Data_20120330_01_004_10', 'Data_20120330_01_004_11', 'Data_20120330_01_004_12']
634
634


In [11]:
ct, values = np.unique(labels, return_counts=True)
print(ct, values, values/values.sum())

pred = [4]* len(files)
accuracy_score(labels, pred)

[0 1 2 3 4] [ 20   6  22 120 466] [0.03154574 0.00946372 0.03470032 0.18927445 0.73501577]


0.7350157728706624

In [3]:
max_len = len([0]*576+[0]*576+[0]*576+[0]*250*2+[0]*1) 
flattened_test_load = np.zeros((max_len, ))
print(flattened_test_load.shape)

for fl in files:
    data = np.load(fl, None, allow_pickle=True)
    flattened_test_load = np.vstack((flattened_test_load, data))

print(flattened_test_load.shape)

flattened_data = flattened_test_load[1:,:]
flattened_data.shape, len(labels)


(2229,)
(635, 2229)


((634, 2229), 634)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(flattened_data, labels, test_size=0.2, random_state=42)
print(len(X_train), len(X_test), len(y_train), len(y_test))

507 127 507 127


In [13]:
ct, values = np.unique(y_test, return_counts=True)
print(ct, values, values/values.sum())

pred = [4]* len(y_test)
accuracy_score(y_test, pred)

test_weights = values/values.sum()

[0 1 2 3 4] [ 4  2 10 27 84] [0.03149606 0.01574803 0.07874016 0.21259843 0.66141732]


In [14]:
# create Random Forest
clf0 = RandomForestClassifier(
    n_estimators=500, 
    max_depth=None,
    min_samples_split=10, 
    class_weight='balanced'
    )

clf0.fit(X_train, y_train)
pred_labels = clf0.predict(X_test)
print('Acc',' MR',' MP',' MF1',' MF1',' MAE',' MSE\n', 
    clf0.score(X_test, y_test),
    recall_score(y_test, pred_labels, average='macro'),
    precision_score(y_test, pred_labels, average='macro'),
    f1_score(y_test, pred_labels,average='macro'),
    mean_absolute_error(y_test, pred_labels),
    mean_squared_error(y_test, pred_labels)
)
print(classification_report(y_test, pred_labels))
print(confusion_matrix(y_test, pred_labels, sample_weight=compute_sample_weight('balanced', y_test)))



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Acc  MR  MP  MF1  MF1  MAE  MSE
 0.6299212598425197 0.19047619047619047 0.13559322033898305 0.15841584158415842 0.5826771653543307 1.2125984251968505
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00        27
           4       0.68      0.95      0.79        84

    accuracy                           0.63       127
   macro avg       0.14      0.19      0.16       127
weighted avg       0.45      0.63      0.52       127



ValueError: Found input variables with inconsistent numbers of samples: [127, 127, 5]

In [6]:
# # Get accuracy
# acc = clf0.score(X_test, y_test)

# pred_labels = clf0.predict(X_test)

# print('Acc:', acc,'\nmR',recall_score(y_test, pred_labels, average='micro'),'\nmP', precision_score(y_test, pred_labels, average='micro'),
# '\nMR',recall_score(y_test, pred_labels, average='macro'),'\nMP', precision_score(y_test, pred_labels, average='macro'),
# '\nmF1',f1_score(y_test, pred_labels,average='micro'),'\nMF1', f1_score(y_test, pred_labels,average='macro'))

# # print(classification_report(y_test, pred_labels))

In [15]:
# create Random Forest
clf_1 = RandomForestClassifier(
    n_estimators=1000, max_depth=None,
    min_samples_split=10,
    class_weight='balanced'
    )

clf_1.fit(X_train, y_train)

pred_labels_1 = clf_1.predict(X_test)

# print('Acc',clf_1.score(X_test, y_test),
# '\nMR',recall_score(y_test, pred_labels_1, average='macro'),
# '\nMP', precision_score(y_test, pred_labels_1, average='macro'),
# '\nMF1', f1_score(y_test, pred_labels_1,average='macro'),
# '\nMAE', mean_absolute_error(y_test, pred_labels_1),
# '\nMSE', mean_squared_error(y_test, pred_labels_1)
# )

print('Acc',' MR',' MP',' MF1',' MF1',' MAE',' MSE\n', 
    clf_1.score(X_test, y_test),
    recall_score(y_test, pred_labels_1, average='macro'),
    precision_score(y_test, pred_labels_1, average='macro'),
    f1_score(y_test, pred_labels_1,average='macro'),
    mean_absolute_error(y_test, pred_labels_1),
    mean_squared_error(y_test, pred_labels_1)
)
print(classification_report(y_test, pred_labels_1))
print(confusion_matrix(y_test, pred_labels_1, sample_weight=compute_sample_weight('balanced', y_test)))


Acc  MR  MP  MF1  MF1  MAE  MSE
 0.6377952755905512 0.2455026455026455 0.26004273504273506 0.23530916844349684 0.5433070866141733 1.078740157480315
              precision    recall  f1-score   support

           0       0.50      0.25      0.33         4
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        10
           3       0.12      0.04      0.06        27
           4       0.68      0.94      0.79        84

    accuracy                           0.64       127
   macro avg       0.26      0.25      0.24       127
weighted avg       0.49      0.64      0.54       127



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: Found input variables with inconsistent numbers of samples: [127, 127, 5]

In [None]:
# create Random Forest
clf_2 = RandomForestClassifier(
    n_estimators=1000, max_depth=None,
    min_samples_split=20,
    class_weight='balanced'
    )

clf_2.fit(X_train, y_train)
pred_labels_2 = clf_2.predict(X_test)

# print('Acc',clf_2.score(X_test, y_test),
# '\nMR',recall_score(y_test, pred_labels_2, average='macro'),
# '\nMP', precision_score(y_test, pred_labels_2, average='macro'),
# '\nMF1', f1_score(y_test, pred_labels_2,average='macro'),
# '\nMAE', mean_absolute_error(y_test, pred_labels_2),
# '\nMSE', mean_squared_error(y_test, pred_labels_2)
# )

print('Acc',' MR',' MP',' MF1',' MF1',' MAE',' MSE\n', 
    clf_2.score(X_test, y_test),
    recall_score(y_test, pred_labels_2, average='macro'),
    precision_score(y_test, pred_labels_2, average='macro'),
    f1_score(y_test, pred_labels_2,average='macro'),
    mean_absolute_error(y_test, pred_labels_2),
    mean_squared_error(y_test, pred_labels_2)
)

print(classification_report(y_test, pred_labels_2))

print(confusion_matrix(y_test, pred_labels_2, sample_weight=compute_sample_weight('balanced', y_test)))


Acc  MR  MP  MF1  MF1  MAE  MSE
 0.6299212598425197 0.23809523809523808 0.20405797101449274 0.21593682699210343 0.5511811023622047 1.0551181102362204
              precision    recall  f1-score   support

           0       0.33      0.25      0.29         4
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00        27
           4       0.69      0.94      0.79        84

    accuracy                           0.63       127
   macro avg       0.20      0.24      0.22       127
weighted avg       0.46      0.63      0.53       127

[[ 1  0  0  1  2]
 [ 0  0  0  2  0]
 [ 0  0  0  1  9]
 [ 2  0  0  0 25]
 [ 0  0  0  5 79]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# create Random Forest
clf_3 = RandomForestClassifier(
    n_estimators=10000, max_depth=5000,
    min_samples_split=10,
    class_weight='balanced'
    )

clf_3.fit(X_train, y_train)
pred_labels_3 = clf_3.predict(X_test)

# print('Acc',clf_3.score(X_test, y_test),
# '\nMR',recall_score(y_test, pred_labels_3, average='macro'),
# '\nMP', precision_score(y_test, pred_labels_3, average='macro'),
# '\nMF1', f1_score(y_test, pred_labels_3,average='macro'),
# '\nMAE', mean_absolute_error(y_test, pred_labels_3),
# '\nMSE', mean_squared_error(y_test, pred_labels_3)
# )
print('Acc',' MR',' MP',' MF1',' MF1',' MAE',' MSE\n', 
    clf_3.score(X_test, y_test),
    recall_score(y_test, pred_labels_3, average='macro'),
    precision_score(y_test, pred_labels_3, average='macro'),
    f1_score(y_test, pred_labels_3,average='macro'),
    mean_absolute_error(y_test, pred_labels_3),
    mean_squared_error(y_test, pred_labels_3)
)
print(classification_report(y_test, pred_labels_3))
print(confusion_matrix(y_test, pred_labels_3, sample_weight=compute_sample_weight('balanced', y_test)))


Acc  MR  MP  MF1  MF1  MAE  MSE
 0.6377952755905512 0.24047619047619045 0.20112044817927172 0.2147783251231527 0.5669291338582677 1.1811023622047243
              precision    recall  f1-score   support

           0       0.33      0.25      0.29         4
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00        27
           4       0.67      0.95      0.79        84

    accuracy                           0.64       127
   macro avg       0.20      0.24      0.21       127
weighted avg       0.46      0.64      0.53       127

[[ 1  0  0  0  3]
 [ 0  0  0  0  2]
 [ 0  0  0  1  9]
 [ 2  0  0  0 25]
 [ 0  0  0  4 80]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# create Random Forest
clf_4 = RandomForestClassifier(
    n_estimators=100, max_depth=None,
    min_samples_split=5,
    class_weight='balanced'
    )

clf_4.fit(X_train, y_train)
pred_labels_4 = clf_4.predict(X_test)

# print('Acc',clf_4.score(X_test, y_test),
# '\nMR',recall_score(y_test, pred_labels_4, average='macro'),
# '\nMP', precision_score(y_test, pred_labels_4, average='macro'),
# '\nMF1', f1_score(y_test, pred_labels_4,average='macro'),
# '\nMAE', mean_absolute_error(y_test, pred_labels_4),
# '\nMSE', mean_squared_error(y_test, pred_labels_4)
# )
print('Acc',' MR',' MP',' MF1',' MF1',' MAE',' MSE\n', 
    clf_4.score(X_test, y_test),
    recall_score(y_test, pred_labels_4, average='macro'),
    precision_score(y_test, pred_labels_4, average='macro'),
    f1_score(y_test, pred_labels_4,average='macro'),
    mean_absolute_error(y_test, pred_labels_4),
    mean_squared_error(y_test, pred_labels_4)
)

print(classification_report(y_test, pred_labels_4))
# print(confusion_matrix(y_test, pred_labels_4)* test_weights)
print(confusion_matrix(y_test, pred_labels_4, sample_weight=compute_sample_weight('balanced', y_test)))



Acc  MR  MP  MF1  MF1  MAE  MSE
 0.6377952755905512 0.19788359788359786 0.16175213675213676 0.17063255152807394 0.5590551181102362 1.110236220472441
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        10
           3       0.12      0.04      0.06        27
           4       0.68      0.95      0.80        84

    accuracy                           0.64       127
   macro avg       0.16      0.20      0.17       127
weighted avg       0.48      0.64      0.54       127

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 4.25196850e-01
  1.32283465e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 2.12598425e-01
  6.61417323e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 2.12598425e-01
  5.95275591e+00]
 [3.14960630e-02 0.00000000e+00 0.00000000e+00 2.12598425e-01
  1.65354331e+01]
 [0.00000000e+00 0.00000000e+00 7.87401575

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
