### **Imports**

In [1]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# PCA
from sklearn.decomposition import PCA

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
BLACK = '#252628'
DARK = BLACK
WHITE = '#d7d7d7'
FADE = '#8f8f8f'
LINK = '#3a6abb'
LINKHOVER = '#355da2'

### **Data**

In [3]:
portugese_df = pd.read_csv('../data/Portuguese.csv')

#### **Data Encoding**

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Sample DataFrame (replace with your actual DataFrame)
# Example: portugese_df = pd.DataFrame({'Color': ['Red', 'Green', 'Blue', 'Red', 'Blue']})

le = LabelEncoder()
encoding_mappings = {}  # Dictionary to store mappings for each column

# Loop through each column in the DataFrame
for column in portugese_df.columns:
    if portugese_df[column].dtype == 'object':
        portugese_df[column] = le.fit_transform(portugese_df[column])
        encoding_mappings[column] = {index: label for index, label in enumerate(le.classes_)}

In [5]:
encoding_mappings

{'school': {0: 'GP', 1: 'MS'},
 'sex': {0: 'F', 1: 'M'},
 'address': {0: 'R', 1: 'U'},
 'famsize': {0: 'GT3', 1: 'LE3'},
 'Pstatus': {0: 'A', 1: 'T'},
 'Mjob': {0: 'at_home', 1: 'health', 2: 'other', 3: 'services', 4: 'teacher'},
 'Fjob': {0: 'at_home', 1: 'health', 2: 'other', 3: 'services', 4: 'teacher'},
 'reason': {0: 'course', 1: 'home', 2: 'other', 3: 'reputation'},
 'guardian': {0: 'father', 1: 'mother', 2: 'other'},
 'schoolsup': {0: 'no', 1: 'yes'},
 'famsup': {0: 'no', 1: 'yes'},
 'paid': {0: 'no', 1: 'yes'},
 'activities': {0: 'no', 1: 'yes'},
 'nursery': {0: 'no', 1: 'yes'},
 'higher': {0: 'no', 1: 'yes'},
 'internet': {0: 'no', 1: 'yes'},
 'romantic': {0: 'no', 1: 'yes'}}

In [6]:
X = portugese_df.drop('G3', axis=1) # We are predicting the final grade
# Also drop G1 and G2 as they are highly correlated with G3
X = X.drop('G1', axis=1)
X = X.drop('G2', axis=1)

y = portugese_df['G3']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### **Standardisation**

In [8]:
# 1. Subtract the mean from each variable
# 2. Divide by the standard deviation
X_s = X.copy()

for column in X.columns:
    X_s[column] = (X[column] - X[column].mean()) / X[column].std()

#### **Implementation**

In [9]:
accepted_v1 = 0.90
accepted_v2 = 0.95

In [10]:
pca = PCA()
pca.fit(X_s)
print(pca.explained_variance_ratio_)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components_1 = np.argmax(cumulative_variance >= accepted_v1) + 1
n_components_2 = np.argmax(cumulative_variance >= accepted_v2) + 1

[0.10305697 0.08347132 0.05887174 0.04769355 0.04407329 0.04346465
 0.04224807 0.04149283 0.03569515 0.03439979 0.03389327 0.03274096
 0.03131218 0.03036914 0.02985879 0.02895007 0.02851811 0.02679844
 0.02406105 0.02336367 0.02234898 0.02206466 0.02178356 0.02068355
 0.01907987 0.01685936 0.01634479 0.01612912 0.01099576 0.0093773 ]


In [11]:
# Plot cumulative explained variance
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(1, len(cumulative_variance) + 1), y=cumulative_variance))
fig.add_trace(go.Scatter(x=[n_components_1, n_components_1], y=[0, 1], mode='lines', name='90% Explained Variance'))
fig.add_trace(go.Scatter(x=[n_components_2, n_components_2], y=[0, 1], mode='lines', name='95% Explained Variance'))
fig.update_layout(
    title='PCA Explained Variance',
    # bold
    title_font=dict(size=18, weight='bold'),
    xaxis_title='Number of Components',
    yaxis_title='Cumulative Explained Variance',
    showlegend=False,
#   white bg
    paper_bgcolor='rgba(255,255,255,1)',
    plot_bgcolor='rgba(255,255,255,1)',
    # all xticks
    xaxis=dict(tickvals=np.arange(0, len(cumulative_variance) + 1, 5), gridcolor='rgba(0,0,0,0.1)'),
    yaxis=dict(gridcolor='rgba(0,0,0,0.1)'),
    margin=dict(l=20, r=20, t=80, b=20)
)

# Add annotation for 90% explained variance
fig.add_annotation(
    x=n_components_1,
    y=accepted_v1,
    text=f'90% explained variance, {n_components_1} Components',
    showarrow=True,
    arrowhead=5,
    ax=-100,
    ay=-50
)

# Add annotation for 95% explained variance
fig.add_annotation(
    x=n_components_2,
    y=accepted_v2,
    text=f'95% explained variance, {n_components_2} Components',
    showarrow=True,
    arrowhead=5,
    ax=-100,
    ay=-50
)

fig.show()

#### **Finding Influential Features**

In [12]:
for i in range(5): # Top 3 components
    idx = np.argsort(pca.components_[i], axis=0)
    print(f'Top 5 features for component {i + 1}: {list(X.columns[idx[:5]])}')

Top 5 features for component 1: ['school', 'traveltime', 'failures', 'age', 'Dalc']
Top 5 features for component 2: ['studytime', 'higher', 'schoolsup', 'school', 'nursery']
Top 5 features for component 3: ['Pstatus', 'traveltime', 'school', 'sex', 'higher']
Top 5 features for component 4: ['freetime', 'activities', 'famrel', 'Pstatus', 'famsup']
Top 5 features for component 5: ['paid', 'failures', 'Fjob', 'Pstatus', 'health']


#### **Relationship between features and label**

In [13]:
g3_correlation = pd.read_csv('../data/g3_abs_correlation.csv')
g3_correlation = g3_correlation[~g3_correlation['feature'].isin(['G3', 'G2', 'G1'])]

In [33]:
# Create subplots with a specific number of rows and columns
sorted_features = g3_correlation['feature']
fig = make_subplots(
    rows=10,
    cols=3,
    subplot_titles=list(sorted_features),
    vertical_spacing=0.04,
)

# Loop through each feature to create a scatter plot in the corresponding subplot
for i, feature in enumerate(sorted_features):
    row = i // 3 + 1  # Determine the row number (1-indexed)
    col = i % 3 + 1   # Determine the column number (1-indexed)
    
    jitter_strength = 0.05
    fig.add_trace(
        go.Scatter(
            x=X[feature] + np.random.normal(0, jitter_strength, len(X)), 
            y=y + np.random.normal(0, jitter_strength, len(y)), 
            mode='markers', 
            opacity=1,
            marker=dict(size=2, color=BLACK),
        ), 
        row=row,  # Specify the row
        col=col   # Specify the column
    )

m = 40

# Update layout for the entire figure
fig.update_layout(
    title='Feature vs G3 Scatter Plots<br><i>Jittered, in order of absolute correlation with G3</i>',
    # subtitle
    
    title_font=dict(size=24, color=BLACK, weight='bold'),
    showlegend=False,
    paper_bgcolor='rgba(255,255,255,1)',
    plot_bgcolor='rgba(255,255,255,1)',
    margin=dict(l=m, r=m, t=m*4, b=m, pad=5),
    height=3000,
)

# fig.update_layout(font=dict(family='Roboto'))

# Save to html
fig.write_html('../../docs/scatter_plots.html')

#### **Training a Model on the Reduced Dataset**

In [15]:
from sklearn.linear_model import LinearRegression

pca_1 = PCA(n_components=n_components_1)
pca_2 = PCA(n_components=n_components_2)

X_train_pca_1 = pca_1.fit_transform(X_train)
X_test_pca_1 = pca_1.transform(X_test)
X_train_pca_2 = pca_2.fit_transform(X_train)
X_test_pca_2 = pca_2.transform(X_test)

pca_model_1 = LinearRegression()
pca_model_2 = LinearRegression()
normal_model = LinearRegression()

pca_model_1.fit(X_train_pca_1, y_train)
pca_model_2.fit(X_train_pca_2, y_train)
normal_model.fit(X_train, y_train)

In [16]:
pca_model_1.score(X_test_pca_1, y_test)

0.21779974454536022

In [17]:
pca_model_2.score(X_test_pca_2, y_test)

0.20122427601962534

In [18]:
normal_model.score(X_test, y_test)

0.20706667288940095

In [19]:
# Check model sizes, in KB
import pickle
import os

normal_model_size = sys.getsizeof(pickle.dumps(normal_model))
pca_model_1_size = sys.getsizeof(pickle.dumps(pca_model_1))
pca_model_2_size = sys.getsizeof(pickle.dumps(pca_model_2))

normal_model_size_kb = normal_model_size / 1024
pca_model_1_size_kb = pca_model_1_size / 1024
pca_model_2_size_kb = pca_model_2_size / 1024

print(f'Normal Model Size: {normal_model_size_kb} KB')
print(f'PCA Model 1 Size: {pca_model_1_size_kb} KB')
print(f'PCA Model 2 Size: {pca_model_2_size_kb} KB')

Normal Model Size: 1.2529296875 KB
PCA Model 1 Size: 0.8017578125 KB
PCA Model 2 Size: 0.8486328125 KB


In [20]:
results_df = pd.DataFrame({
    'model': ['Normal', 'PCA 90%', 'PCA 95%'],
    'accuracy': [normal_model.score(X_test, y_test), pca_model_1.score(X_test_pca_1, y_test), pca_model_2.score(X_test_pca_2, y_test)],
    'model_size (KB)': [normal_model_size_kb, pca_model_1_size_kb, pca_model_2_size_kb],
    'relative_accuracy': [1, pca_model_1.score(X_test_pca_1, y_test) / normal_model.score(X_test, y_test), pca_model_2.score(X_test_pca_2, y_test) / normal_model.score(X_test, y_test)]
})

results_df = results_df.sort_values('accuracy', ascending=False)

In [21]:
results_df

Unnamed: 0,model,accuracy,model_size (KB),relative_accuracy
1,PCA 90%,0.2178,0.801758,1.051834
0,Normal,0.207067,1.25293,1.0
2,PCA 95%,0.201224,0.848633,0.971785
