# Notebook 1 - Preprocessing, training and testing #


 This initial section of the code imports all the relevant libraries and packages that would be needed for pre-processing, training and testing data. 

In [1]:
#Uncomment the line above in case the seaborn package is not installed on your system
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, cross_val_predict, GridSearchCV
import matplotlib.widgets
from sklearn.ensemble import RandomForestClassifier


data = pd.read_csv("Crystal_structure.csv") # Loads data into a variable to be manipulated.
print("This is the unprocessed data")
print(data)

This is the unprocessed data
     Compound   A   B  In literature v(A) v(B)  r(AXII)(Å)  r(AVI)(Å)  \
0       Ac2O3  Ac  Ac          False    0    0        1.12       1.12   
1      AcAgO3  Ac  Ag          False    0    0        1.12       1.12   
2      AcAlO3  Ac  Al          False    0    0        1.12       1.12   
3      AcAsO3  Ac  As          False    0    0        1.12       1.12   
4      AcAuO3  Ac  Au          False    0    0        1.12       1.12   
...       ...  ..  ..            ...  ...  ...         ...        ...   
5324    ZrWO3  Zr   W          False    1    5        0.89       0.72   
5325    ZrYO3  Zr   Y          False    -    -        0.89       0.72   
5326   ZrYbO3  Zr  Yb          False    -    -        0.89       0.72   
5327   ZrZnO3  Zr  Zn          False    -    -        0.89       0.72   
5328    Zr2O3  Zr  Zr          False    -    -        0.89       0.72   

      r(BVI)(Å)  EN(A)  EN(B)  l(A-O)(Å)  l(B-O)(Å)      ΔENR        tG  \
0          1.12   1

# Preprocessing the data from the CSV file #
The data from the CSV file needed to be cleaned up before it could be subjected to ML algorithms.
The following operations were performed on the data:
- Rows with no values for lowest distortion or τ were dropped
- The following rows were removed because they were not important for determining the crystal structure
    1. "In literature"
    2. "Compound"
    3. "A"
    4. "B"
    5. "V(B)"
    6. "r(BVI)(Å)"
- Values of "t<sub>G</sub> " less than 0.82 were dropped since these values are non-consistent with a perovskite structure
- Values of "τ" greater than 4.18 were dropped since these values are non-consistent with a perovskite structure
- One hot encoder was used 
- Values of "V(A)" = 4 and "V(A)" = 5 were dropped because the A cation cannot take these values to form a perovskite structure


In [2]:
def PreProcessing(dataTemp: pd) -> pd: # Applies all Pre Processing steps to the dataframe.
    
    dataCleaned = dataTemp.drop(dataTemp.loc[dataTemp["Lowest distortion"] == "-"].index) # Removes rows with no classifer
    dataCleaned.drop(dataCleaned.loc[dataCleaned["τ"] == "-"].index, inplace = True) # Removes rows with no τ
    dataCleaned.drop(["In literature", "Compound", "A", "B", "v(B)", "r(BVI)(Å)"], axis = 1, inplace = True) # Removes rows which are not important
    dataCleaned["τ"] = dataCleaned["τ"].astype(float)
    dataCleaned.drop(dataCleaned[dataCleaned['tG'] < 0.82].index, inplace = True) # Removes rows which have a "tG" value less than 0.82
    dataCleaned.drop(dataCleaned[dataCleaned['τ'] > 4.18].index, inplace = True) # Removes rows which have a "τ" value greater than 4.18
    dataCleaned.reset_index(drop = True, inplace = True)

    mms = MinMaxScaler()
    dataCleaned[['v(A)',"r(AXII)(Å)","r(AVI)(Å)","EN(A)","EN(B)","l(A-O)(Å)","l(B-O)(Å)","ΔENR", "tG","τ","μ"]] = mms.fit_transform(dataCleaned[['v(A)',"r(AXII)(Å)","r(AVI)(Å)","EN(A)","EN(B)","l(A-O)(Å)","l(B-O)(Å)","ΔENR", "tG","τ","μ"]])
    # Data is normalised with a Min-Max Scaler, such that all values are in the range 0-1
    
    return dataCleaned

In [3]:
dataPro = PreProcessing(data) 
print("This is the pre-processed data")
print(dataPro)
# This is our PreProcessed data

This is the pre-processed data
     v(A)  r(AXII)(Å)  r(AVI)(Å)     EN(A)     EN(B)  l(A-O)(Å)  l(B-O)(Å)  \
0    0.00    0.538462   0.345455  0.740260  0.730159   0.464755   0.684265   
1    0.00    0.538462   0.345455  0.740260  0.841270   0.464755   0.190622   
2    0.00    0.538462   0.345455  0.740260  0.396825   0.464755   0.382537   
3    0.00    0.538462   0.345455  0.740260  0.753968   0.464755   0.522394   
4    0.00    0.538462   0.345455  0.740260  0.317460   0.464755   0.384566   
..    ...         ...        ...       ...       ...        ...        ...   
604  0.50    0.353846   0.345455  0.201299  0.634921   0.224127   0.183745   
605  0.50    0.238462   0.136364  0.350649  0.404762   0.089163   0.310565   
606  0.75    0.238462   0.136364  0.350649  0.746032   0.089163   0.000000   
607  0.75    0.238462   0.136364  0.350649  0.373016   0.089163   0.206888   
608  0.50    0.238462   0.136364  0.350649  0.634921   0.089163   0.441028   

         ΔENR        tG         

In [4]:
class TrainTestSplit(): # Performs the Test-Training Split
    def __init__(self, d: pd):
        x_col = d.drop('Lowest distortion', axis=1) # Identifies the "Features/Input" columns
        y_col = d['Lowest distortion'] # Identifies the "Classifer/Output" column

        self.x_col = x_col.to_numpy() # Converts the input and classifer columns into numpy arrays such that they can be used for the cross-validation
        self.y_col = y_col.to_numpy()

        x_train, x_test, y_train, y_test = train_test_split(x_col, y_col, test_size=0.4, train_size = 0.6) # Splits data into training and testing.
        
        self.x_train = x_train.to_numpy() # Converts the training and testing data into numpy arrays 
        self.x_test = x_test.to_numpy()
        self.y_train = y_train.to_numpy()
        self.y_test = y_test.to_numpy()

split = TrainTestSplit(d = dataPro) # Creates an instance of the Test-Training split

In [5]:
#pair plots to see correlation between varaibles 
#sns.pairplot(dataPro, hue = 'Lowest distortion')
#plt.show()
# A 13x13 grid of scatter plots
# Allows us to see all the different permutations for the features against eachother and their relations

# For all our 13 features, each feature is plotted against eachother for all permutations, and displayed in a 13x13
# grid of scatter plots

# Creating classes to implement KNN and SVC classification algorithms #


In [6]:

class Svc():
    def __init__(self, split: pd):
        self.stratifiedkf = StratifiedKFold(n_splits=10) # Provides indices which can be used to split the test/train data
        self.SVCclass = SVC(C = 1, degree = 1, gamma = 2, kernel = 'rbf') # Initiates the SVC classifier
        self.SVCclass.fit(split.x_train, split.y_train) # Feeds the training data into the SVC classifier
        self.y_pred = self.SVCclass.predict(split.x_test) # Uses the SVC classifier to predict outcomes from the test data
    
    def hyper(self) -> dict :
        """Optimizes the hyperparameters of the classifier"""
        #parameters to cycle through while optimizing 
        param_gri = {'C':list(range(1,5,1)),'gamma':list(range(1,10,1)), 
                     'kernel':['rbf', 'poly'], 'degree': list(range(1,4,1))}
        grid = GridSearchCV(SVC(),param_gri,refit = True, verbose=1, cv=5)
        #model must be fitted first 
        grid.fit(split.x_train, split.y_train)
        return grid.best_params_ #returns the optimized paramters out of those from the paramgrid
        
    
    def getSVCconfusion(self) -> np.ndarray: 
        """Creates a confusion matrix based on the comparison of the predicted results with the test results"""
        cfmatrix = confusion_matrix(split.y_test,self.y_pred)
        return cfmatrix

    def getSVCreport(self) -> str: 
        """Creates a report that entails the main classification metrics such as precision, recall and f-1 score and support"""
        return classification_report(split.y_test,self.y_pred)

    def getSVCcrossval(self) -> np.ndarray: 
        """ # A list of f1 scores for each fold of the training data set"""
        crossVal = cross_val_score(self.SVCclass,split.x_col,split.y_col,cv=self.stratifiedkf)
        return crossVal

    def getSVCprfs(self) -> list: 
        """Computes precision, recall, F-measure and support for each class"""
        return precision_recall_fscore_support(split.y_test, self.y_pred, average='weighted') 

class Knn(): 
    def __init__(self, split:pd , k:int = 3):
        self.stratifiedkf = StratifiedKFold(n_splits=10) # Provides indices which can be used to split the test/train data
        self.KNNclass = KNeighborsClassifier(n_neighbors= k) # Initiates the SVC classifier
        self.KNNclass.fit(split.x_train, split.y_train) # Feeds the training data into the SVC classifier
        self.y_pred = self.KNNclass.predict(split.x_test) # Uses the SVC classifier to predict outcomes from the test data
    
    def getKNNconfusion(self) -> np.ndarray: 
        """Creates a confusion matrix based on the comparison of the predicted results with the test results"""
        cfmatrix = confusion_matrix(split.y_test,self.y_pred)
        return cfmatrix

    def getKNNreport(self) -> str:
        """Creates a report that entails the main classification metrics such as precision, recall and f-1 score and support"""
        return classification_report(split.y_test,self.y_pred)

    def getKNNcrossval(self) -> np.ndarray:
        """A list of f1 scores for each fold of the training data set"""
        crossVal = cross_val_score(self.KNNclass,split.x_col,split.y_col,cv=self.stratifiedkf)
        return crossVal

    def getKNNprfs(self) -> list: 
        """Computes precision, recall, F-measure and support for each class"""
        return precision_recall_fscore_support(split.y_test, self.y_pred, average='weighted')

class Rfc():
    def __init__(self,split):
        self.RFCclassifier = RandomForestClassifier( criterion='gini', max_features= 'auto', min_samples_leaf= 3, n_estimators=11) 
        self.RFCclassifier.fit(split.x_train, split.y_train)
        self.y_pred=self.RFCclassifier.predict(split.x_test)
        self.stratifiedkf = StratifiedKFold(n_splits=10)
        
        
    def hyper(self) -> dict:
        """Optimizes the hyperparameters of the classifier"""
        param_gri = {
 'criterion': ['gini', 'entropy'],
 'max_features': ['sqrt', 'auto'],
 'min_samples_leaf': list(range(1,5,1)),
 'n_estimators': list(range(1,20,2))}
        CV_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_gri, cv= 5, verbose = 1)
        CV_rfc.fit(split.x_train, split.y_train)
        CV_rfc.best_params_
        return CV_rfc.best_params_
    
    def getRFCscore(self)-> str:
        """Computes F-1 score"""
        return accuracy_score(split.y_test,self.y_pred)
   
    def getRFCreport(self)-> str:
        """Creates a report that entails the main classification metrics such as precision, recall and f-1 score and support"""
        return classification_report(split.y_test,self.y_pred)
   
    def getRFCcrossval(self) -> np.ndarray :
        """A list of f1 scores for each fold of the training data set"""
        crossVal = cross_val_score(self.RFCclassifier,split.x_col,split.y_col,cv=self.stratifiedkf)
        return crossVal
    
    def getRFCconfusion(self) -> np.ndarray: 
        """Creates a confusion matrix based on the comparison of the predicted results with the test results"""
        cfmatrix = confusion_matrix(split.y_test,self.y_pred)
       
        return cfmatrix

from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest


# Create 2 empty dictionaries. Each key:value pair stores the variable name:associated confusion matrix.
confusion_matrix_knn = {}
confusion_matrix_svc = {}
confusion_matrix_rfc = {}


# Initialise the objects of svc and knn class
ex1 = Svc(split = split)
ex2 = Knn(split = split)
ex3 = Rfc(split = split)


# Adds the overall confusion matrix obtained while using all the variables to the list of dictionaries
confusion_matrix_knn["Overall"] = ex2.getKNNconfusion().tolist()
confusion_matrix_svc["Overall"] = ex1.getSVCconfusion().tolist()
confusion_matrix_rfc["Overall"] = ex3.getRFCconfusion().tolist()



print("SVC:")
print(f"Average of cross-validation:{np.average(ex1.getSVCcrossval())}")

print("\nKNN:")
print(f"Average of cross-validation: {np.average(ex2.getKNNcrossval())}")

print("\nRFC:")
print(f"Average of cross-validation: {np.average(ex3.getRFCcrossval())}")

SVC:
Average of cross-validation:0.7062568306010929

KNN:
Average of cross-validation: 0.6751092896174863

RFC:
Average of cross-validation: 0.7077595628415301


# Finding best hyper-parameters #
Loops through hyperparameters for KNN and SCV to find optimal hyperparameters. These hyperparameters were used in the cell above.

In [7]:
# Looping through k values to see which k gives best accuracy for kNN
# Note that weighted averages were compared
#This code takes around 1 minute to run due to the large number of permutations
k_values = list(range(1, 6))
for i in k_values:
    model1 = Knn(split=split, k = i)
    print(f"f1_score for k: {i}")
    print(model1.getKNNreport())
    print(type(model1.getKNNreport()))
    
# Using GridSearchCV() to find best hyperparameters for SCV and RFC
print(f"hyperparameter for SCV:{ex1.hyper()}")
print(f"hyperparameter for RFC: {ex3.hyper()}")

f1_score for k: 1
              precision    recall  f1-score   support

       cubic       0.66      0.65      0.65        80
orthorhombic       0.77      0.77      0.77       140
rhombohedral       0.18      0.30      0.22        10
  tetragonal       0.38      0.21      0.27        14

    accuracy                           0.68       244
   macro avg       0.50      0.48      0.48       244
weighted avg       0.69      0.68      0.68       244

<class 'str'>
f1_score for k: 2
              precision    recall  f1-score   support

       cubic       0.59      0.82      0.69        80
orthorhombic       0.80      0.74      0.77       140
rhombohedral       0.00      0.00      0.00        10
  tetragonal       0.00      0.00      0.00        14

    accuracy                           0.69       244
   macro avg       0.35      0.39      0.36       244
weighted avg       0.65      0.69      0.66       244

<class 'str'>
f1_score for k: 3
              precision    recall  f1-score   su

  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1080 out of 1080 | elapsed:   42.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


hyperparameter for SCV:{'C': 2, 'degree': 1, 'gamma': 8, 'kernel': 'rbf'}
Fitting 5 folds for each of 160 candidates, totalling 800 fits
hyperparameter for RFC: {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 13}


[Parallel(n_jobs=1)]: Done 800 out of 800 | elapsed:    8.3s finished


In [8]:
# This function calculates the confusion matrix obtained after each individual variable is removed.

def FeatureRemover(feature: str, classifer: str) -> list: 
    """Removes the chosen column produces model which excludes the missing feature"""
    dataProRemoved = dataPro.drop([feature], axis = 1) 
    split = TrainTestSplit(d = dataProRemoved)
    k = Knn(split = split)
    s = Svc(split = split)
    r = Rfc(split = split)
    totalList = []
    
#creates the confusion matrix and prfs scores for the chosen classification technique 
    if classifer == "knn":
        
        confusion_matrix_knn[feature] = k.getKNNconfusion().tolist()
        totalList.append(k.getKNNprfs())

    elif classifer == "svc":
        confusion_matrix_svc[feature] = s.getSVCconfusion().tolist()
        totalList.append(s.getSVCprfs())
    
    elif classifer == "rfc":
        confusion_matrix_rfc[feature] = r.getRFCconfusion().tolist()
        totalList.append(r.getRFCscore())

    return totalList

comparison_list = pd.DataFrame([])
comparison_list.rename(columns={0: "Precision", 1: "Recall", 2: "F1 Score", 3: "Support"}, inplace = True)


# Iterate over each of the columns in dataPro and create the resultant confusion matrix after removing that column.
# Utilises FeatureRemover funciton to creates corresponding confusion matrices for all variables and their exclusions
for i in list(dataPro.drop(["Lowest distortion"], axis = 1)):
    v = FeatureRemover(i, "svc")
for i in list(dataPro.drop(["Lowest distortion"], axis = 1)):
    v = FeatureRemover(i, "knn")
for i in list(dataPro.drop(["Lowest distortion"], axis = 1)):
    v = FeatureRemover(i, "rfc")

heading = ["knn","svc","rfc"]
#A dataframe is created, where each row shows the new PRFS when that feature is removed from the model. This is repeated 10x, divided over by 10 to get an average.


def PRFS(model,comparison_list):
    for j in range(10):
            finalList = []
    
            #It iterates through each column in dataPro and drops it. It calculates the resultant precision, recall, F1 and support for each iteration and appends it to a list.
            for i in list(dataPro.drop(["Lowest distortion"], axis = 1)):
                finalList.append(FeatureRemover(i, model))
            
            comparison_list_new = pd.DataFrame([])
            
            # Each item in the list is added as a seperate row in a dataframe
            for i in range(len(finalList)):
                comparison_list_new = comparison_list_new.append(finalList[i], ignore_index = True)
            
            # Each item in the list is added as a seperate row in a dataframe
            comparison_list_new.rename(columns={0: "Precision", 1: "Recall", 2: "F1 Score", 3: "Support"}, inplace = True)
            p = list(dataPro.drop(["Lowest distortion"], axis = 1))
            
            # Renaming the rows
            for i in range(len(p)):
                comparison_list_new.rename(index = {i: p[i]}, inplace = True)
            comparison_list = comparison_list.add(comparison_list_new, fill_value = 0)
    
    # print table that has been produced in the previous lines for the given model
    print(f"The PRFS output for {model}:")
    table = comparison_list.div(10) #takes average of the 10 precision obtained through cross validation
    print(table)
    
# creates table for both svc and rfc with the missing variables
for x in range (1,3,1):
    PRFS(heading[x],comparison_list)
    
#prints precision scores for the overall model that has all variables included     
prfssvc = ex1.getSVCprfs()    
print(f"Overall SVC precision: {prfssvc[0]}, Recall: {prfssvc[1]}, F1 Score  {prfssvc[2]}") 

#adds the overall confusion matrix created using all the variables
#prints these matrices here for the readers inspection 
num = []
num.append(np.array(confusion_matrix_knn['Overall']))
num.append(np.array(confusion_matrix_svc['Overall']))
num.append(np.array(confusion_matrix_rfc['Overall']))
for x in range(len(heading)):
    print(f"Confusion matrix for {heading[x]}")
    print(num[x])

# Write the confusion matrix dictionaries into a JSON file so that it could be exported to Notebook 3.
with open("KNN_matrix.json", "w") as outfile1:
    json.dump(confusion_matrix_knn, outfile1)

with open("SVC_matrix.json", "w") as outfile2:
    json.dump(confusion_matrix_svc, outfile2)

with open("RFC_matrix.json", "w") as outfile3:
    json.dump(confusion_matrix_rfc, outfile3)
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The PRFS output for svc:
            Precision    Recall  F1 Score Support
v(A)         0.442222  0.519672  0.465264     NaN
r(AXII)(Å)   0.453060  0.525820  0.476808     NaN
r(AVI)(Å)    0.437268  0.512295  0.462791     NaN
EN(A)        0.435243  0.513934  0.460993     NaN
EN(B)        0.433855  0.523770  0.454327     NaN
l(A-O)(Å)    0.431185  0.515574  0.456050     NaN
l(B-O)(Å)    0.448082  0.525410  0.468648     NaN
ΔENR         0.430731  0.518443  0.456867     NaN
tG           0.433489  0.515164  0.456394     NaN
τ            0.440675  0.524180  0.464467     NaN
μ            0.440823  0.518033  0.462299     NaN
The PRFS output for rfc:
            Precision
v(A)         0.494672
r(AXII)(Å)   0.485656
r(AVI)(Å)    0.476230
EN(A)        0.474590
EN(B)        0.479508
l(A-O)(Å)    0.489754
l(B-O)(Å)    0.510246
ΔENR         0.484426
tG           0.485246
τ            0.486885
μ            0.497131
Overall SVC precision: 0.6709138169835165, Recall: 0.7049180327868853, F1 Score  0.655

  _warn_prf(average, modifier, msg_start, len(result))
