# This notebook goes through process of training Random Forest model and displays model metrics

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import joblib
import sys

# Unit test to determine if file has all data necessary to run Random Forest model

In [2]:
def csv_test(df):
    '''Test to ensure all environmental data is in csv file and none is missing'''
    if df.isnull().sum().all() == 0:
        print('✅ Data is correctly read in and ready for training!')
    else:
        sys.exit('❌ Error. One or more values are missing. Please check csv file to remove any dates with incomplete environmental values.')

# Read in environmental and MD data

In [3]:
# Read in dataframe
df = pd.read_csv("central_ok_mds_env_FINAL.csv")

# Complete unit test
csv_test(df)

✅ Data is correctly read in and ready for training!


# Split data into training, validation, and testing

In [4]:
# Split data
train_data, val_test_data = train_test_split(df, test_size = 0.3, random_state = 988)
val_data, test_data = train_test_split(val_test_data, test_size = 0.6667, random_state = 988)

# Train base Random Forest model

In [5]:
# Set feature list of MD and RAP analysis data
feature_list = ['month', 'time', 'pwat', 'mslp', 'cape', 'cin', 't2m', 'rh2m', 'q2m', 'u10', 'v10', 'uv10', 'lcl', 'shr0_6', 'srh0_1', 'srh0_3']

In [6]:
# Train base Random Forest model
rf = RandomForestClassifier(random_state = 56)
rf.fit(train_data[feature_list].values, train_data.label.values)

# Test (using validation data) base Random Forest model
predicted = rf.predict(val_data[feature_list].values)
expected = val_data.label.values

# Print classification report
print('Classification report for classifier %s:\n%s\n'
      % (rf, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(val_data[feature_list].values.shape[1]):
    print('%d. (%s) feature %d (%f)' % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf, 'random_forest_base.joblib')

Classification report for classifier RandomForestClassifier(random_state=56):
              precision    recall  f1-score   support

           0       0.72      0.79      0.75        48
           1       0.77      0.69      0.73        49

    accuracy                           0.74        97
   macro avg       0.74      0.74      0.74        97
weighted avg       0.75      0.74      0.74        97


Brier score: 0.25773195876288657

1. (time) feature 1 (0.155708)
2. (pwat) feature 2 (0.073518)
3. (srh0_3) feature 15 (0.068933)
4. (shr0_6) feature 13 (0.066249)
5. (cin) feature 5 (0.062023)
6. (mslp) feature 3 (0.059004)
7. (srh0_1) feature 14 (0.058645)
8. (lcl) feature 12 (0.057585)
9. (rh2m) feature 7 (0.055991)
10. (t2m) feature 6 (0.055883)
11. (q2m) feature 8 (0.054606)
12. (u10) feature 9 (0.054566)
13. (uv10) feature 11 (0.051042)
14. (cape) feature 4 (0.049007)
15. (v10) feature 10 (0.048725)
16. (month) feature 0 (0.028514)


# Optimized Random Forest model

In [7]:
# Create parameterizations to test on training data for optimization
param_grid = [{
    'n_estimators': [20, 40, 60, 80, 100, 150, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20] }]

# Fit optizimation to training data
grid_search = GridSearchCV(rf, param_grid, cv = 2, scoring = 'accuracy', n_jobs = -1)

grid_search.fit(train_data[feature_list].values, train_data.label.values)
grid_search.best_params_

4704 fits failed out of a total of 47040.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4704 fits failed with the following error:
Traceback (most recent call last):
  File "/anaconda3/envs/pyEAE/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/anaconda3/envs/pyEAE/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 476, in fit
    trees = Parallel(
  File "/anaconda3/envs/pyEAE/lib/python3.9/site-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/anaconda3/envs/pyEAE/lib/python3.9/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(t

{'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 150}

In [8]:
# Train optimized Random Forest model
rf_opt = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 700)
rf_opt.fit(train_data[feature_list].values, train_data.label.values)

# Test (using validation data) optimized Random Forest model
predicted = rf_opt.predict(val_data[feature_list].values)
expected = val_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_opt, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_opt.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(val_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file
# joblib.dump(rf_opt, "random_forest_opt.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=700):
              precision    recall  f1-score   support

           0       0.76      0.79      0.78        48
           1       0.79      0.76      0.77        49

    accuracy                           0.77        97
   macro avg       0.77      0.77      0.77        97
weighted avg       0.77      0.77      0.77        97


Brier score: 0.2268041237113402

1. (time) feature 1 (0.214611)
2. (shr0_6) feature 13 (0.075268)
3. (pwat) feature 2 (0.072107)
4. (srh0_3) feature 15 (0.070033)
5. (cin) feature 5 (0.067436)
6. (mslp) feature 3 (0.054553)
7. (rh2m) feature 7 (0.051457)
8. (u10) feature 9 (0.048996)
9. (srh0_1) feature 14 (0.048635)
10. (q2m) feature 8 (0.046989)
11. (lcl) feature 12 (0.046830)
12. (t2m) feature 6 (0.045960)
13. (v10) feature 10 (0.045799)
14. (cape) feature 4 (0.045410)

# Testing and perturbing opitmized Random Forest model

In [9]:
# Train perturbed final Random Forest model
rf_final0 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 136)
rf_final0.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final0.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final0, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

importances = rf_final0.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))

# Save Random Forest model as joblib file
# joblib.dump(rf_final0, "random_forest_final0.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=136):
              precision    recall  f1-score   support

           0       0.77      0.69      0.72       105
           1       0.68      0.76      0.72        91

    accuracy                           0.72       196
   macro avg       0.72      0.72      0.72       196
weighted avg       0.72      0.72      0.72       196


Brier score: 0.28061224489795916

1. (time) feature 1 (0.195804)
2. (srh0_3) feature 15 (0.082093)
3. (pwat) feature 2 (0.077716)
4. (shr0_6) feature 13 (0.074642)
5. (cin) feature 5 (0.063688)
6. (mslp) feature 3 (0.056502)
7. (rh2m) feature 7 (0.055640)
8. (lcl) feature 12 (0.051827)
9. (v10) feature 10 (0.051563)
10. (cape) feature 4 (0.050679)
11. (u10) feature 9 (0.047859)
12. (t2m) feature 6 (0.045712)
13. (srh0_1) feature 14 (0.043964)
14. (q2m) feature 8 (0.043452

In [10]:
# Train perturbed final Random Forest model
rf_final1 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 2)
rf_final1.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final1.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final1, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final1.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))

# Save Random Forest model as joblib file
# joblib.dump(rf_final1, "random_forest_final1.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=2):
              precision    recall  f1-score   support

           0       0.71      0.65      0.68       105
           1       0.63      0.69      0.66        91

    accuracy                           0.67       196
   macro avg       0.67      0.67      0.67       196
weighted avg       0.67      0.67      0.67       196


Brier score: 0.33163265306122447

1. (time) feature 1 (0.205736)
2. (pwat) feature 2 (0.081016)
3. (srh0_3) feature 15 (0.078090)
4. (shr0_6) feature 13 (0.067934)
5. (srh0_1) feature 14 (0.066911)
6. (q2m) feature 8 (0.055840)
7. (rh2m) feature 7 (0.054702)
8. (mslp) feature 3 (0.054327)
9. (lcl) feature 12 (0.049814)
10. (u10) feature 9 (0.048007)
11. (cape) feature 4 (0.047483)
12. (t2m) feature 6 (0.044772)
13. (cin) feature 5 (0.044702)
14. (v10) feature 10 (0.042641)


In [11]:
# Train perturbed final Random Forest model
rf_final2 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 439)
rf_final2.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final2.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final2, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final2.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf_final2, "random_forest_final2.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=439):
              precision    recall  f1-score   support

           0       0.75      0.67      0.71       105
           1       0.66      0.75      0.70        91

    accuracy                           0.70       196
   macro avg       0.71      0.71      0.70       196
weighted avg       0.71      0.70      0.70       196


Brier score: 0.29591836734693877

1. (time) feature 1 (0.185100)
2. (shr0_6) feature 13 (0.078559)
3. (srh0_3) feature 15 (0.078368)
4. (pwat) feature 2 (0.069783)
5. (cin) feature 5 (0.066507)
6. (rh2m) feature 7 (0.061962)
7. (srh0_1) feature 14 (0.058714)
8. (mslp) feature 3 (0.054137)
9. (t2m) feature 6 (0.052497)
10. (q2m) feature 8 (0.051303)
11. (v10) feature 10 (0.050193)
12. (u10) feature 9 (0.046541)
13. (uv10) feature 11 (0.045853)
14. (lcl) feature 12 (0.04199

In [12]:
# Train perturbed final Random Forest model
rf_final3 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 712)
rf_final3.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final3.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final3, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final3.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf_final3, "random_forest_final3.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=712):
              precision    recall  f1-score   support

           0       0.71      0.66      0.68       105
           1       0.64      0.69      0.66        91

    accuracy                           0.67       196
   macro avg       0.67      0.67      0.67       196
weighted avg       0.68      0.67      0.67       196


Brier score: 0.32653061224489793

1. (time) feature 1 (0.177671)
2. (shr0_6) feature 13 (0.076150)
3. (pwat) feature 2 (0.073795)
4. (srh0_3) feature 15 (0.070539)
5. (cin) feature 5 (0.064363)
6. (srh0_1) feature 14 (0.062376)
7. (mslp) feature 3 (0.058916)
8. (rh2m) feature 7 (0.057859)
9. (lcl) feature 12 (0.055567)
10. (q2m) feature 8 (0.053701)
11. (cape) feature 4 (0.049548)
12. (v10) feature 10 (0.046250)
13. (uv10) feature 11 (0.045149)
14. (u10) feature 9 (0.0449

In [13]:
# Train perturbed final Random Forest model
rf_final4 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 297)
rf_final4.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final4.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final4, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final4.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf_final4, "random_forest_final4.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=297):
              precision    recall  f1-score   support

           0       0.76      0.67      0.71       105
           1       0.66      0.76      0.71        91

    accuracy                           0.71       196
   macro avg       0.71      0.71      0.71       196
weighted avg       0.72      0.71      0.71       196


Brier score: 0.29081632653061223

1. (time) feature 1 (0.190131)
2. (srh0_3) feature 15 (0.096217)
3. (pwat) feature 2 (0.076019)
4. (shr0_6) feature 13 (0.068551)
5. (q2m) feature 8 (0.059210)
6. (rh2m) feature 7 (0.057895)
7. (cin) feature 5 (0.052489)
8. (srh0_1) feature 14 (0.052375)
9. (u10) feature 9 (0.052164)
10. (cape) feature 4 (0.051808)
11. (t2m) feature 6 (0.047685)
12. (lcl) feature 12 (0.045393)
13. (v10) feature 10 (0.044606)
14. (mslp) feature 3 (0.041311

In [14]:
# Train perturbed final Random Forest model
rf_final5 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 331)
rf_final5.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final5.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final5, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final5.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf_final5, "random_forest_final5.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=331):
              precision    recall  f1-score   support

           0       0.72      0.65      0.68       105
           1       0.64      0.71      0.67        91

    accuracy                           0.68       196
   macro avg       0.68      0.68      0.68       196
weighted avg       0.68      0.68      0.68       196


Brier score: 0.32142857142857145

1. (time) feature 1 (0.183461)
2. (srh0_3) feature 15 (0.082607)
3. (pwat) feature 2 (0.074283)
4. (rh2m) feature 7 (0.063791)
5. (cin) feature 5 (0.060996)
6. (cape) feature 4 (0.057963)
7. (srh0_1) feature 14 (0.056448)
8. (shr0_6) feature 13 (0.054206)
9. (mslp) feature 3 (0.052167)
10. (u10) feature 9 (0.051692)
11. (q2m) feature 8 (0.050714)
12. (lcl) feature 12 (0.049677)
13. (uv10) feature 11 (0.048357)
14. (v10) feature 10 (0.0474

In [15]:
# Train perturbed final Random Forest model
rf_final6 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 984)
rf_final6.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final6.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final6, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final6.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf_final6, "random_forest_final6.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=984):
              precision    recall  f1-score   support

           0       0.71      0.70      0.71       105
           1       0.66      0.67      0.67        91

    accuracy                           0.69       196
   macro avg       0.69      0.69      0.69       196
weighted avg       0.69      0.69      0.69       196


Brier score: 0.3112244897959184

1. (time) feature 1 (0.187416)
2. (srh0_3) feature 15 (0.076926)
3. (pwat) feature 2 (0.076831)
4. (shr0_6) feature 13 (0.062339)
5. (t2m) feature 6 (0.061379)
6. (cin) feature 5 (0.057303)
7. (mslp) feature 3 (0.054607)
8. (lcl) feature 12 (0.054082)
9. (q2m) feature 8 (0.053586)
10. (cape) feature 4 (0.052116)
11. (rh2m) feature 7 (0.050641)
12. (u10) feature 9 (0.049978)
13. (srh0_1) feature 14 (0.048374)
14. (v10) feature 10 (0.046912)

In [16]:
# Train perturbed final Random Forest model
rf_final7 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 813)
rf_final7.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final7.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final7, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final7.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf_final7, "random_forest_final7.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=813):
              precision    recall  f1-score   support

           0       0.73      0.66      0.69       105
           1       0.64      0.71      0.68        91

    accuracy                           0.68       196
   macro avg       0.68      0.69      0.68       196
weighted avg       0.69      0.68      0.68       196


Brier score: 0.3163265306122449

1. (time) feature 1 (0.179408)
2. (srh0_3) feature 15 (0.083320)
3. (pwat) feature 2 (0.075837)
4. (shr0_6) feature 13 (0.073692)
5. (srh0_1) feature 14 (0.070514)
6. (cin) feature 5 (0.060522)
7. (q2m) feature 8 (0.053300)
8. (u10) feature 9 (0.052594)
9. (lcl) feature 12 (0.051187)
10. (t2m) feature 6 (0.051182)
11. (v10) feature 10 (0.047428)
12. (uv10) feature 11 (0.047110)
13. (mslp) feature 3 (0.046742)
14. (rh2m) feature 7 (0.043757

In [17]:
# Train perturbed final Random Forest model
rf_final8 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 47)
rf_final8.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final8.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final8, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final8.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf_final8, "random_forest_final8.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=47):
              precision    recall  f1-score   support

           0       0.74      0.61      0.67       105
           1       0.62      0.75      0.68        91

    accuracy                           0.67       196
   macro avg       0.68      0.68      0.67       196
weighted avg       0.68      0.67      0.67       196


Brier score: 0.32653061224489793

1. (time) feature 1 (0.201208)
2. (shr0_6) feature 13 (0.079029)
3. (srh0_3) feature 15 (0.078900)
4. (pwat) feature 2 (0.069183)
5. (cin) feature 5 (0.063931)
6. (lcl) feature 12 (0.062670)
7. (srh0_1) feature 14 (0.055663)
8. (rh2m) feature 7 (0.051547)
9. (q2m) feature 8 (0.050994)
10. (v10) feature 10 (0.045971)
11. (t2m) feature 6 (0.045806)
12. (mslp) feature 3 (0.044868)
13. (cape) feature 4 (0.043263)
14. (uv10) feature 11 (0.04162

In [18]:
# Train perturbed final Random Forest model
rf_final9 = RandomForestClassifier(criterion = 'log_loss', max_depth = 16, min_samples_leaf = 6, min_samples_split = 4, n_estimators = 40, random_state = 454)
rf_final9.fit(train_data[feature_list].values, train_data.label.values)

# Test perturbed final Random Forest model
predicted = rf_final9.predict(test_data[feature_list].values)
expected = test_data.label.values

# Print classification report
print("Classification report for classifier %s:\n%s\n"
      % (rf_final9, metrics.classification_report(expected, predicted)))

# Print brier score
print(f'Brier score: {metrics.brier_score_loss(expected, predicted)}', end = '\n\n')

# Print feature importance precentages
importances = rf_final9.feature_importances_

indices = np.argsort(importances)[::-1]

for i in range(test_data[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (i+1, feature_list[indices[i]], indices[i], importances[indices[i]]))
    
# Save Random Forest model as joblib file    
# joblib.dump(rf_final9, "random_forest_final9.joblib")

Classification report for classifier RandomForestClassifier(criterion='log_loss', max_depth=16, min_samples_leaf=6,
                       min_samples_split=4, n_estimators=40, random_state=454):
              precision    recall  f1-score   support

           0       0.74      0.70      0.72       105
           1       0.67      0.73      0.70        91

    accuracy                           0.71       196
   macro avg       0.71      0.71      0.71       196
weighted avg       0.71      0.71      0.71       196


Brier score: 0.29081632653061223

1. (time) feature 1 (0.194181)
2. (srh0_3) feature 15 (0.070899)
3. (pwat) feature 2 (0.070563)
4. (cin) feature 5 (0.065974)
5. (shr0_6) feature 13 (0.064757)
6. (rh2m) feature 7 (0.061923)
7. (lcl) feature 12 (0.059431)
8. (v10) feature 10 (0.058531)
9. (mslp) feature 3 (0.055844)
10. (srh0_1) feature 14 (0.049973)
11. (cape) feature 4 (0.048866)
12. (t2m) feature 6 (0.045192)
13. (u10) feature 9 (0.045189)
14. (uv10) feature 11 (0.0440