Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pathlib import Path

In [2]:
# Resolve the project base directory.
# When ``__file__`` is defined (e.g. the code is run as a script), use the
# directory two levels above this file; otherwise fall back to the current
# working directory.
if "__file__" in globals():
    base_dir = Path(__file__).resolve().parent.parent
else:
    base_dir = Path().resolve()

Importing data

In [14]:
# Load the CSV of BERT vectors with eco-score grades from <base_dir>/data/final.
output_dir = base_dir.joinpath("data", "final")
bert_csv_path = output_dir / "3_bert_vectors_with_eco_grades.csv"
df = pd.read_csv(bert_csv_path)

# Feature matrix: every column except the target label, as a NumPy array.
feature_frame = df.drop(columns='ecoscore_grade')
X = feature_frame.values

1a. PCA Explained Variance - finding the number of components that explain 95% of the variance

In [None]:
# Fit an unrestricted PCA on X (the BERT embedding matrix) to see how the
# explained variance accumulates as components are added.
pca = PCA()
pca.fit(X)
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Curve of cumulative explained variance against the number of components.
component_counts = range(1, cumulative_variance.size + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_counts, cumulative_variance, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Explained Variance')
ax.grid(True)
plt.show()

# argmax on a boolean mask returns the index of the first True, i.e. the first
# position where the cumulative variance reaches 95%; +1 turns the zero-based
# index into a component count.
reaches_95 = cumulative_variance >= 0.95
n_components_95 = np.argmax(reaches_95) + 1
print(f"Number of components to explain 95% variance: {n_components_95}")

1b. PCA Dimensionality Reduction - perform PCA (Principal Component Analysis) on the BERT embedding data to reduce its dimensionality. Fit a PCA model with 34 components, transform the data, and create a new DataFrame containing these principal components along with the original ecoscore grade labels. Prepare the resulting dataset for use in classification tasks by defining the feature matrix `X` and the target variable `y`.

In [16]:
# Reduce the BERT embeddings to a fixed number of principal components and
# assemble the modelling dataset (feature matrix X, target y).
N_PCA_COMPONENTS = 34

# Rebuild the raw embedding matrix from df rather than reading X: this cell
# rebinds X to the PCA features at the end, so reading X here would feed
# already-reduced data back into PCA whenever the cell is re-run.
X_bert = df.drop(columns=['ecoscore_grade']).values

# Fit PCA and transform. random_state keeps the projection reproducible in case
# scikit-learn's automatic solver choice picks the randomized SVD.
# NOTE(review): PCA is fitted on all rows before the train/test split that
# follows, so test rows influence the projection. Consider fitting PCA on the
# training split only (e.g. inside a Pipeline) if strict hold-out is required.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
X_pca_34 = pca.fit_transform(X_bert)

# Create a DataFrame with one column per principal component (pca_1 ... pca_34)
pca_columns = [f'pca_{i+1}' for i in range(N_PCA_COMPONENTS)]
df_pca = pd.DataFrame(X_pca_34, columns=pca_columns)

# Add ecoscore_grade column (positional assignment via .values, so the rows
# line up with df regardless of df's index)
df_pca['ecoscore_grade'] = df['ecoscore_grade'].values

# Defining X and y for modeling:
X = df_pca.drop('ecoscore_grade', axis=1)
y = df_pca['ecoscore_grade']

2a. KNN Hyperparameter Tuning & Model Development & Evaluation - split the PCA-transformed dataset into training and test sets, then perform hyperparameter tuning for the K-Nearest Neighbors (KNN) classifier using grid search with cross-validation. The best KNN model is selected based on accuracy, and its performance is evaluated on both the training and test sets using accuracy scores and a classification report.

In [17]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Search space for KNN
param_grid = dict(
    n_neighbors=range(5, 51, 2),   # odd k values from 5 to 49
    weights=['uniform'],           # all neighbours vote equally
    metric=['minkowski'],          # Minkowski distance, shaped by p below
    p=[1, 2],                      # p=1 -> Manhattan, p=2 -> Euclidean
)

# Exhaustive grid search with 5-fold cross-validation, scored on accuracy
knn = KNeighborsClassifier()
grid_search = GridSearchCV(
    knn,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Predict on both splits with the best estimator found by the search
best_model = grid_search.best_estimator_
y_train_pred, y_test_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

# Grade labels in the order they should appear in the report
label_order = ['a-plus', 'a', 'b', 'c', 'd', 'e', 'f']

# Report the chosen parameters and the train / cross-validation / test accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Best parameters:", grid_search.best_params_)
print("Training accuracy:", train_accuracy)
print("Cross-validation accuracy:", grid_search.best_score_)
print("Test accuracy:", test_accuracy)
print(classification_report(y_test, y_test_pred, labels=label_order))

2b. KNN - Confusion Matrix for Evaluation - compute and visualise the confusion matrix for the K-Nearest Neighbors (KNN) classifier's predictions on the test set. The confusion matrix provides a detailed breakdown of correct and incorrect classifications for each ecoscore grade, helping to assess the model's performance across all classes.

In [None]:
# Grade labels in display order, with 'a-plus' first
label_order = ['a-plus', 'a', 'b', 'c', 'd', 'e', 'f']

# y_test: true labels (rows), y_test_pred: KNN predictions on the test set (columns)
cm = confusion_matrix(y_test, y_test_pred, labels=label_order)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_order)
disp.plot(cmap='Blues')
plt.title('KNN Confusion Matrix')
# Render the figure explicitly so the cell does not end on the Text object
# returned by plt.title (which would be echoed as cell output).
plt.show()

3a. Random Forest Classifier Hyperparameter Tuning & Model Development & Evaluation - perform hyperparameter tuning for the Random Forest classifier using randomised search with cross-validation. The best model is selected based on accuracy, and its performance is evaluated on both the training and test sets using accuracy scores and a classification report.

In [None]:
# Define a reduced, randomised hyperparameter space
param_dist = {
    'n_estimators': randint(100, 300),               # 100 to 299 trees (upper bound is exclusive)
    'max_depth': [1, 10, 20, 30],                    # Depths to try
    'min_samples_split': [2, 5, 10],                 # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],                   # Minimum samples required at a leaf
    'max_features': ['sqrt', 'log2', None],          # Feature selection strategies
    'bootstrap': [True],                             # Sampling method
    'criterion': ['gini', 'entropy']                 # Splitting criteria
}

# Define model (seeded so the forest itself is reproducible)
rf = RandomForestClassifier(random_state=42)

# Randomised Search CV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,                    # Try 50 random combinations
    cv=5,                         # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,                    # Use all CPU cores
    verbose=1,                    # Print progress
    random_state=42               # Seed the sampling of parameter combinations
)

# Fit the model
random_search.fit(X_train, y_train)

# Grade labels in display order, with 'a-plus' first. Passing them to the report
# keeps its row order the same as the KNN report and the confusion matrices.
label_order = ['a-plus', 'a', 'b', 'c', 'd', 'e', 'f']

# Evaluate: predict on both splits with the fitted search object
y_pred = random_search.predict(X_test)
y_train_pred = random_search.predict(X_train)
print("Best Parameters Found:\n", random_search.best_params_)
print("Training accuracy:", accuracy_score(y_train, y_train_pred))
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Best Cross-Validated Accuracy:", random_search.best_score_)
print(
    "\nClassification Report on Test Set:\n",
    classification_report(y_test, y_pred, labels=label_order),
)

3b. Random Forest Classifier Confusion Matrix - compute and visualise the confusion matrix for the Random Forest classifier's predictions on the test set. The matrix provides a detailed breakdown of the classifier's performance across all ecoscore grade classes, highlighting correct and incorrect predictions for each class.

In [None]:
# Grade labels in display order, with 'a-plus' first
label_order = ['a-plus', 'a', 'b', 'c', 'd', 'e', 'f']

# y_test: true labels (rows), y_pred: Random Forest predictions on the test set (columns)
cm = confusion_matrix(y_test, y_pred, labels=label_order)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_order)
disp.plot(cmap='Blues')
plt.title('Random Forest Confusion Matrix')
# Render the figure explicitly so the cell does not end on the Text object
# returned by plt.title (which would be echoed as cell output).
plt.show()

<b>References</b>

Random Forest Classifier
https://www.simplilearn.com/tutorials/machine-learning-tutorial/random-forest-algorithm#:~:text=A%20Random%20Forest%20Algorithm%20is%20a%20supervised,more%20trees%20more%20it%20will%20be%20robust. 

Random Forest Hyperparameter Tuning Using RandomizedSearchCV
https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74/

KNN Hyperparameter Tuning Using GridSearchCV
https://sklearn.org/stable/auto_examples/model_selection/plot_grid_search_stats.html