In [None]:
# %%
# Install the third-party packages imported by the cells below. The %pip magic
# is used so the install targets the environment of the running kernel.
# NOTE(review): no versions are pinned here, so a fresh install may pull
# different releases than the ones this analysis was developed against.
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [None]:
# %%
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

In [None]:
# %%
# Read the sample results CSV into the working frame and preview its first rows
dataset_path = "Data/sample_results_dataset1.csv"
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# %%
def grade_to_points(grades, points_by_grade, level):
    """Translate letter grades into point scores.

    Parameters
    ----------
    grades : pandas.Series
        Letter grades to convert.
    points_by_grade : dict
        Lookup from grade label to its point score.
    level : str
        Qualification name, used only in the error message.

    Returns
    -------
    pandas.Series
        Point score for each entry of ``grades``; missing grades stay missing.

    Raises
    ------
    ValueError
        If ``grades`` holds a non-missing label that ``points_by_grade`` lacks.
    """
    points = grades.map(points_by_grade)
    # Series.map yields NaN for any label absent from the lookup. Report such
    # labels here instead of letting NaN flow into the scaling and clustering
    # cells further down.
    unrecognised = grades[grades.notna() & points.isna()].unique()
    if len(unrecognised) > 0:
        raise ValueError(
            f"Unrecognised {level} grade(s): {sorted(map(str, unrecognised))}"
        )
    return points


# Mapping for A-level grades
a_level_points = {
    "A*": 60,
    "A": 50,
    "B": 40,
    "C": 30,
    "D": 20,
    "E": 10,
    "U": 0
}

df["GCE_A_Value"] = grade_to_points(df["GCE A"], a_level_points, "GCE A")

# %%
# Mapping for AS-level grades (this table has no "A*" entry)
as_level_points = {
    "A": 25,
    "B": 20,
    "C": 15,
    "D": 10,
    "E": 5,
    "U": 0
}

df["GCE_AS_Value"] = grade_to_points(df["GCE AS"], as_level_points, "GCE AS")

In [None]:
# %%
# Z-score the four numeric score columns so clustering treats them on one scale
raw_columns = ["GCE_A_Value", "GCE_AS_Value", "GCSE", "SAT"]
scaled_columns = ["GCE_A_ValueStan", "GCE_AS_ValueStan", "GCSEStan", "SATStan"]

standardiser = StandardScaler()
df[scaled_columns] = standardiser.fit_transform(df[raw_columns])
df.head()

In [None]:
# %%
# Scatter of standardised A-level points against standardised GCSE scores
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(df["GCE_A_ValueStan"], df["GCSEStan"])
ax.set_title("GCE A and GCSE")
ax.set_xlabel("GCE A")
ax.set_ylabel("GCSE")
plt.show()

In [None]:
# %%
# Cluster the students on all four standardised scores using the chosen K
optimal_k = 3
cluster_inputs = df[["GCE_A_ValueStan", "GCE_AS_ValueStan", "GCSEStan", "SATStan"]]

kmeans = KMeans(n_clusters=optimal_k, init="k-means++", n_init="auto", random_state=0)
# fit() returns the estimator itself, so labels_ can be read off the same call.
df["Cluster"] = kmeans.fit(cluster_inputs).labels_

In [None]:
# Plot clusters
# The two standardised columns shown in this 2-D view of the clustering.
x_col, y_col = "GCE_A_ValueStan", "GCSEStan"

plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x=x_col,
    y=y_col,
    hue="Cluster",
    palette="tab10",
    s=60
)

# cluster_centers_ has one column per feature the model was fitted on, in fit
# order. Select the centroid coordinates by feature name so they always match
# the plotted axes; positional index 1 is the AS-level column of the
# four-feature model, not GCSE.
fitted_features = list(kmeans.feature_names_in_)
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, fitted_features.index(x_col)],
    centroids[:, fitted_features.index(y_col)],
    s=200,
    c="black",
    marker="X",
    label="Centroids"
)

plt.title("K-Means Clustering of Students\n(GCE A-Level vs GCSE)")
plt.xlabel("GCE A-Level (Standardized)")
plt.ylabel("GCSE (Standardized)")
plt.legend()
plt.show()


In [None]:

# %%
# Find the optimal K Value
import warnings

TotVar = []      # KMeans inertia for each candidate K
Silhouette = []  # silhouette score for each candidate K

StartK = 2
EndK = 15  # exclusive: range(StartK, EndK) stops at EndK - 1

# Build the two-column input once instead of twice per iteration.
search_inputs = df[["GCE_A_ValueStan", "GCSEStan"]]

# Silence warnings only while the sweep runs. A bare filterwarnings('ignore')
# would stay active for the rest of the kernel session and hide warnings
# raised by every later cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for K in range(StartK, EndK):
        # Loop-local estimator name, so the fitted model held in `kmeans`
        # is not overwritten by the sweep.
        candidate = KMeans(
            n_clusters=K,
            init="k-means++",
            n_init="auto",
            random_state=0
        )
        labels = candidate.fit_predict(search_inputs)

        Silhouette.append(silhouette_score(search_inputs, labels))
        TotVar.append(candidate.inertia_)


In [None]:

# %%
# Elbow plot: total within-cluster variation for each candidate K
k_values = np.arange(StartK, EndK, 1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(k_values, TotVar, color="red", marker="8")
ax.set_xticks(k_values)
ax.set_xlabel("K Value")
ax.set_ylabel("Total variation")
ax.set_title("K Means Elbow plot\nCluster by total variation")
plt.show()


In [None]:

# %%
# Silhouette plot: silhouette score for each candidate K
k_values = np.arange(StartK, EndK, 1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(k_values, Silhouette, color="blue", marker="8")
ax.set_xticks(k_values)
ax.set_xlabel("K Value")
ax.set_ylabel("Silhouette Score")
ax.set_title("K Means Silhouette Score\nCluster by Silhouette Score")
plt.show()


In [None]:

# %%
# Table of results: one row per candidate K with both quality measures
LoopResults = pd.DataFrame(
    {
        "K Value": np.arange(StartK, EndK, 1),
        "Total Variation": TotVar,
        "Silhouette Score": Silhouette,
    }
)
print(LoopResults)


In [None]:

# %%
# Final K-Means using K=8 on the standardised A-level and GCSE columns
kmeans = KMeans(
    n_clusters=8,
    init="k-means++",
    n_init="auto",
    random_state=0,
)
kmeans.fit(df[["GCE_A_ValueStan", "GCSEStan"]])
df["Cluster"] = kmeans.labels_  # replaces any earlier cluster assignment
df.head()


In [None]:
# Marker area must be non-negative, but the standardised SAT column is centred
# on zero, so using it directly gives negative sizes for below-average scores.
# Rescale it linearly onto a positive range so every student is drawn and
# larger markers still mean higher SAT.
MIN_MARKER_AREA, MAX_MARKER_AREA = 20, 200
sat = df["SATStan"]
sat_range = sat.max() - sat.min()
if sat_range > 0:
    marker_sizes = MIN_MARKER_AREA + (MAX_MARKER_AREA - MIN_MARKER_AREA) * (
        (sat - sat.min()) / sat_range
    )
else:
    # Constant (or all-missing) SAT: fall back to one uniform marker size.
    marker_sizes = np.full(len(df), 60.0)

plt.figure(figsize=(8,4))
plt.scatter(
    df["GCE_A_ValueStan"],
    df["GCE_AS_ValueStan"],
    s=marker_sizes,            # marker area encodes SAT
    c=df["Cluster"],           # color by cluster
    cmap="tab10",
    alpha=0.7
)
plt.title("4D Cluster Plot: A-Level, AS-Level, SAT, GCSE")
plt.xlabel("GCE_A_ValueStan")
plt.ylabel("GCE_AS_ValueStan")
plt.show()


In [None]:
# Pairwise scatter matrix of the four standardised scores, coloured by cluster
pair_columns = ["GCE_A_ValueStan", "GCE_AS_ValueStan", "SATStan", "GCSEStan"]
sns.pairplot(data=df, vars=pair_columns, hue="Cluster", palette="tab10")
plt.show()

In [None]:

# %%
# Visualise the 8 Clusters on the standardised A-level / GCSE plane
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(df["GCE_A_ValueStan"], df["GCSEStan"], c=df["Cluster"])
ax.set_title("GCE A/AS and GCSE SAT by Cluster")
ax.set_xlabel("GCE_A_ValueStan")
ax.set_ylabel("GCSEStan")
plt.show()

# -------------------------------------------------------------------
# 🚀 NEW SECTION: LOGISTIC REGRESSION TO PREDICT A GRADES
# -------------------------------------------------------------------

# %%
# Binary target: 1 when the A-level grade is A or A*, otherwise 0
top_grades = ["A", "A*"]
is_top_grade = df["GCE A"].isin(top_grades)
df["A_Grade"] = is_top_grade.astype(int)


In [None]:
# %%
# Select predictors for logistic regression.
# "Cluster" is deliberately left out of the predictors:
#   * the cluster labels come from KMeans fitted on the standardised A-level
#     points, which are derived from the same "GCE A" column that defines
#     A_Grade, so the label leaks the target into the inputs;
#   * the clustering was fitted on every row before the train/test split, so
#     test rows influenced a training feature;
#   * cluster IDs are arbitrary category codes, not quantities, so feeding
#     them to a linear model as one numeric column is not meaningful.
features = ["GCSE", "GCE_AS_Value"]
X = df[features]
y = df["A_Grade"]


In [None]:
# %%
# Hold out a quarter of the rows for testing; fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, random_state=42, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# %%
# Learn scaling statistics from the training rows only, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# %%
# Train the logistic regression classifier on the scaled training rows
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X=X_train_scaled, y=y_train)


In [None]:
# %%
# Score the held-out rows and summarise the predictions
y_pred = log_reg.predict(X_test_scaled)

report_text = classification_report(y_test, y_pred)
print("Classification Report:")
print(report_text)

matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(matrix)


In [None]:

# %%
# Attach each student's predicted probability of the positive (A / A*) class
all_rows_scaled = scaler.transform(df[features])
class_probabilities = log_reg.predict_proba(all_rows_scaled)
df["Prob_A"] = class_probabilities[:, 1]  # second column of predict_proba
df[["GCE A", "Prob_A"]].head()
