In [1]:
#Imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn import svm
from scipy.stats import norm
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from typing import Union
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from IPython.display import display
from time import time
import matplotlib.widgets
from matplotlib.widgets import RadioButtons, CheckButtons
%matplotlib nbagg 
import matplotlib.animation 
from sklearn.inspection import permutation_importance
import random
import pickle
import ipywidgets as widgets

In [2]:
# Load the model, data splits and predictions exported by the previous notebook.
# NOTE(review): unpickling executes code stored in the file, so only load a
# model file that you produced yourself.
with open('forest_model.sav', 'rb') as model_file:  # handle is closed once the model is loaded
    forest = pickle.load(model_file)

# Feature tables stay 2-D arrays; label / prediction tables are flattened to 1-D.
X_test = pd.read_csv('X_test.csv', index_col=0).to_numpy()
y_test = pd.read_csv('y_test.csv', index_col=0).to_numpy().flatten()
X_train = pd.read_csv('X_train.csv', index_col=0).to_numpy()
y_train = pd.read_csv('y_train.csv', index_col=0).to_numpy().flatten()
forest_y_pred = pd.read_csv('forest_y_pred.csv', index_col=0).to_numpy().flatten()
knn_y_pred = pd.read_csv('knn_y_pred.csv', index_col=0).to_numpy().flatten()
X = pd.read_csv('X.csv', index_col=0).to_numpy()
y = pd.read_csv('y.csv', index_col=0).to_numpy().flatten()

# Analysis of predictivity of the classifier 

## 1. Classification report 
With the scores obtained, the Random Forest and KNN classifiers were chosen to be further investigated in the next stage with the classification report. The classification accuracy, precision, recall and f1-scores from the reports of both were compared and the Random Forest classifier had the highest scoring metrics. Therefore, the Random Forest classifier was chosen as our best classifier to proceed with. 

In [3]:
# Classification report: per-class precision, recall and F1 for both classifiers
target_names = ['cubic', 'tetragonal', 'orthorhombic','rhombohedral']
forest_report = metrics.classification_report(
    y_test,
    forest_y_pred,
    target_names=target_names,
    zero_division=0,
)
knn_report = metrics.classification_report(
    y_test,
    knn_y_pred,
    target_names=target_names,
    zero_division=0,
)
# Each report is printed directly under its title, Random Forest first.
print("Random Forest classification report", forest_report, sep="\n")
print("KNN classification report", knn_report, sep="\n")

Random Forest classification report
              precision    recall  f1-score   support

       cubic       0.84      0.91      0.87       710
  tetragonal       0.70      0.71      0.70       333
orthorhombic       0.57      0.23      0.32        71
rhombohedral       0.70      0.22      0.33        32

    accuracy                           0.79      1146
   macro avg       0.70      0.52      0.56      1146
weighted avg       0.78      0.79      0.77      1146

KNN classification report
              precision    recall  f1-score   support

       cubic       0.80      0.91      0.85       710
  tetragonal       0.68      0.64      0.66       333
orthorhombic       0.38      0.13      0.19        71
rhombohedral       0.33      0.03      0.06        32

    accuracy                           0.76      1146
   macro avg       0.55      0.43      0.44      1146
weighted avg       0.73      0.76      0.73      1146



## 2. Cross validation 

Next, cross validation was performed on the Random Forest classifier and different hyperparameters were optimised, namely: the number of samples, number of features, number of trees, and tree depth. Regarding the number of samples, a smaller sample size makes the trees more different from one another, and vice versa. The sample size is set as a float between 0 and 1, i.e. the fraction of the training dataset that is drawn to train each decision tree. The number of features that are randomly sampled at each split point is also a very important hyperparameter and was therefore optimised as well. The number of trees is another key hyperparameter, as increasing the number of trees improves the model performance until it eventually stabilises. Lastly, different tree depths were analysed, as the maximum depth of the decision trees could also affect the model. 

The ranges tested were limited because the code would otherwise take a very long time to run, and because further increases in these hyperparameters make progressively less difference to the performance. 

In [29]:
#Cross validation for random forest classifier 

def get_sample_numbers(fractions=None):
    """Build one Random Forest per bootstrap sample fraction (``max_samples``).

    Parameters
    ----------
    fractions : iterable of float, optional
        Fractions of the training set drawn to fit each tree. When omitted,
        the grid ``np.arange(0.1, 1.1, 0.2)`` (0.1, 0.3, 0.5, 0.7, 0.9) is used.

    Returns
    -------
    dict
        Maps each fraction formatted to one decimal place (e.g. ``'0.7'``) to
        an unfitted ``RandomForestClassifier``. A fraction of 1.0 is passed on
        as ``max_samples=None``.
    """
    if fractions is None:
        fractions = np.arange(0.1, 1.1, 0.2)
    models = dict()
    for fraction in fractions:
        fraction = float(fraction)
        key = '%.1f' % fraction
        # Compare with a tolerance: computed fractions carry floating-point
        # error, so an exact ``== 1.0`` test can silently miss the full-sample case.
        max_samples = None if np.isclose(fraction, 1.0) else fraction
        models[key] = RandomForestClassifier(max_samples=max_samples)
    return models

def get_number_of_features():
    """Return Random Forests keyed by their ``max_features`` value, '1' to '5'."""
    return {
        str(n_features): RandomForestClassifier(max_features=n_features)
        for n_features in range(1, 6)
    }

def get_number_of_trees():
    """Return Random Forests keyed by their ``n_estimators`` value (10, 50, 100, 200)."""
    tree_counts = (10, 50, 100, 200)
    return {
        str(count): RandomForestClassifier(n_estimators=count)
        for count in tree_counts
    }

def get_tree_depth():
    """Return Random Forests keyed by ``max_depth``: '1' to '5', plus 'None'."""
    candidate_depths = [*range(1, 6), None]
    return {
        str(depth): RandomForestClassifier(max_depth=depth)
        for depth in candidate_depths
    }

# evaluate a given model using cross-validation
def evaluate_model(model, X_test, y_test):
    """Score ``model`` with repeated stratified 10-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Classifier to evaluate.
    X_test, y_test : array-like
        Features and labels to cross-validate on. The parameter names are kept
        for backward compatibility; any dataset may be passed here.

    Returns
    -------
    Accuracy scores as returned by ``cross_val_score``, one per fold
    (10 splits x 3 repeats).
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # Score the data handed in by the caller. Previously these arguments were
    # ignored and the notebook-level X and y were always used instead.
    return cross_val_score(model, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)

# evaluate the models and store results
def get_scores(models, column_name, data=None, target=None):
    """Cross-validate every model and summarise the accuracy in a table.

    Parameters
    ----------
    models : dict
        Maps a display name to the model to evaluate.
    column_name : str
        Header of the column that holds the model names.
    data, target : array-like, optional
        Features and labels to cross-validate on. Default to the notebook-level
        ``X`` and ``y``, which is the data the scores were effectively computed
        on before (the old call passed the test split, but it was never used).

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``column_name``, 'Mean' and
        'Standard deviation'.
    """
    # NOTE(review): X / y presumably still contain the held-out test rows, so
    # tuning on them leaks test data into model selection - confirm, and
    # consider passing X_train / y_train instead.
    if data is None:
        data = X
    if target is None:
        target = y
    rows = []
    for name, model in models.items():
        scores = evaluate_model(model, data, target)
        rows.append({
            column_name: name,
            'Mean': np.mean(scores),
            'Standard deviation': np.std(scores),
        })
    # Explicit columns keep the header even when `models` is empty.
    return pd.DataFrame(rows, columns=[column_name, 'Mean', 'Standard deviation'])
        
# Build the candidate models for every hyperparameter up front ...
sample_sizes = get_sample_numbers()
number_of_features = get_number_of_features()
number_of_trees = get_number_of_trees()
tree_depth = get_tree_depth()

# ... then cross-validate each group and show its table as soon as it is ready.
sample_size_data = get_scores(sample_sizes, 'Sample sizes')
display(sample_size_data)

num_of_features_data = get_scores(number_of_features, 'Number of features')
display(num_of_features_data)

num_of_trees_data = get_scores(number_of_trees, 'Number of trees')
display(num_of_trees_data)

tree_depth_data = get_scores(tree_depth, 'Tree depth')
display(tree_depth_data)

Unnamed: 0,Sample sizes,Mean,Standard deviation
0,0.1,0.755058,0.010909
1,0.3,0.780009,0.012042
2,0.5,0.784957,0.011118
3,0.7,0.786628,0.012248
4,0.9,0.785757,0.013267


Unnamed: 0,Number of features,Mean,Standard deviation
0,1,0.777898,0.009288
1,2,0.781174,0.013441
2,3,0.78532,0.012181
3,4,0.785829,0.014672
4,5,0.785244,0.014745


Unnamed: 0,Number of trees,Mean,Standard deviation
0,10,0.769606,0.01352
1,50,0.78292,0.012508
2,100,0.783137,0.014152
3,200,0.785027,0.014436


Unnamed: 0,Tree depth,Mean,Standard deviation
0,1.0,0.624402,0.008835
1,2.0,0.671835,0.014671
2,3.0,0.703986,0.014884
3,4.0,0.718536,0.012519
4,5.0,0.729232,0.012098
5,,0.782411,0.012639


## Summary of cross validation 

From the results of the cross validation, it was determined that the best hyperparameters to use (using the mean scores) for the Random Forest classifier were:

Number of samples = 0.7

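Number of features = 5 (in the run shown above, 4 scored marginally higher, but the difference is far smaller than the standard deviation)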

Number of trees = 200

Tree depth = None meaning nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

The model was then retrained below to give the model with optimised hyperparameters. 

In [4]:
# Retrain with the tuned hyperparameters, then export the optimised model and
# its test-set predictions, to be used in next notebook.
# max_depth is not passed, so the classifier's default is used.
forest = RandomForestClassifier(max_samples=0.7,  max_features=5, n_estimators=200).fit(X_train, y_train)
forest_y_pred = forest.predict(X_test)
forest_filename = 'optimised_forest_model.sav'
# The context manager flushes and closes the file even if pickling fails.
with open(forest_filename, 'wb') as model_file:
    pickle.dump(forest, model_file)
pd.DataFrame(forest_y_pred).to_csv("optimised_forest_y_pred.csv")