# Step 1: Applying PCA


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from itertools import product
from sklearn.preprocessing import StandardScaler


# Show every column when a DataFrame is displayed instead of truncating wide frames
pd.set_option('display.max_columns', None)

In [None]:
# NOTE(review): this is a signed Google Cloud Storage URL (X-Goog-Date=20241115T000813Z,
# X-Goog-Expires=259200 seconds), so it stops working once the signature expires.
# Re-download the dataset or point this at a stable copy before re-running.
url = 'https://storage.googleapis.com/kagglesdsdata/datasets/5630996/9857367/StudentPerformanceFactors.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241115%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241115T000813Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=9e75b6bc9bb372ba9bd7ecd37c900dcd36ca8af1d55a432518f175cd7f92d5d2e3bfb66aa802512c61fc182b4fe109f664572053b7a6844f6a31a3a6dc9f6c82be1caa9dd854b0434057642c6551e9404e890721196476377b3e3093f8709027804f584c4de6c0c282a308e71c8288a25dedf0137fa09b3953ab82c96c77a9a5db3bcf2bad4b7fde3e5a033c5a8056894d091a3137db44a53e60f8266f84a3419f331c29282f152e3e62341031acbad9472ca5bee076cdb43ba2e623c59aa8c093341696d6a3e008b4a6d6fc1d015bf496040f037a35d3a6c21b141086ed8c855fd559a4e669d61494fd0c309b7b0aeb8ec28243b70839d42c214a2ffa794d35'

# Load the full student-performance table from the CSV
student_performance = pd.read_csv(url)

# Keep only the numeric columns; these are the inputs to the standardization and SVD below
numeric_features = student_performance.select_dtypes(include=np.number)

# Preview the first rows of the numeric subset
numeric_features.head()



Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score
0,23,84,7,73,0,3,67
1,19,64,8,59,2,4,61
2,24,98,7,91,2,4,74
3,29,89,8,98,1,4,71
4,19,92,6,65,3,4,70


In [None]:
# Standardize the dataset: rescale each numeric column to mean 0 and standard deviation 1
std_scaler = StandardScaler()


# fit_transform learns the per-column statistics and applies the scaling in one step.
# to_numpy() drops the column labels, so the rebuilt DataFrame has integer column
# names 0, 1, ... in the same order as numeric_features.columns.
features_scaled = std_scaler.fit_transform(numeric_features.to_numpy())
features_scaled = pd.DataFrame(features_scaled)
features_scaled.head()


Unnamed: 0,0,1,2,3,4,5,6
0,0.504942,0.348375,-0.019796,-0.1438,-1.213934,0.031411,-0.060578
1,-0.162822,-1.383736,0.661399,-1.11611,0.411451,1.001199,-1.602931
2,0.671882,1.560853,-0.019796,1.106313,0.411451,1.001199,1.738833
3,1.506587,0.781403,0.661399,1.592469,-0.401242,1.001199,0.967657
4,-0.162822,1.04122,-0.70099,-0.699406,1.224144,1.001199,0.710598


In [None]:
# Singular value decomposition of the standardized features.
# np.linalg.svd returns the right singular vectors as rows, so pca_V holds one
# principal direction per row (later cells read pca_V[k] and project with pca_V.T).
# NOTE(review): the default full_matrices=True builds a square U with one row and
# column per observation; full_matrices=False would be far cheaper if the full U
# is not needed — confirm nothing depends on the square shape before changing.
pca_U, pca_d, pca_V = np.linalg.svd(features_scaled)

In [None]:
# Left singular vectors: square in the number of observations because full_matrices defaults to True
pca_U.shape

(6607, 6607)

In [None]:
# Right singular vectors (stored as rows): one row per principal direction, one column per feature
pca_V.shape

(7, 7)

In [None]:
# One singular value per principal component
pca_d.shape

(7,)

In [None]:
# Singular values, returned by np.linalg.svd in descending order
pca_d

array([107.98909409,  83.2836493 ,  82.16377657,  81.96092255,
        80.08257463,  79.25039299,  38.5858363 ])

In [None]:
# Share of the total variance carried by each component: its squared singular
# value divided by the sum of all squared singular values.
squared_d = np.square(pca_d)
prop_var = squared_d / sum(squared_d)

# One row per component (numbered from 1) with the rounded individual and
# running shares of explained variability.
variance_table = pd.DataFrame({
    "PC": np.arange(0, prop_var.shape[0]) + 1,
    "variability_explained": prop_var.round(2),
    "cumulative_variability_explained": prop_var.cumsum().round(2),
})
variance_table.head(7)

Unnamed: 0,PC,variability_explained,cumulative_variability_explained
0,1,0.25,0.25
1,2,0.15,0.4
2,3,0.15,0.55
3,4,0.15,0.69
4,5,0.14,0.83
5,6,0.14,0.97
6,7,0.03,1.0


In [None]:
# Pair each numeric feature with its weight in the first principal direction
loadings1 = pd.DataFrame({
    "factor": numeric_features.columns,
    "pc1_loading": pca_V[0],
})
# Rank the rows by loading magnitude (largest first) but display the signed
# values; head(10) caps the display at ten rows.
pc1_order = loadings1["pc1_loading"].abs().sort_values(ascending=False).index
loadings1.loc[pc1_order].head(10)

Unnamed: 0,factor,pc1_loading
6,Exam_Score,0.709101
1,Attendance,0.531986
0,Hours_Studied,0.408143
3,Previous_Scores,0.15959
4,Tutoring_Sessions,0.145445
2,Sleep_Hours,-0.027856
5,Physical_Activity,0.01369


In [None]:
# Pair each numeric feature with its weight in the second principal direction
loadings2 = pd.DataFrame({
    "factor": numeric_features.columns,
    "pc2_loading": pca_V[1],
})
# Rank the rows by loading magnitude (largest first) but display the signed
# values; head(10) caps the display at ten rows.
pc2_order = loadings2["pc2_loading"].abs().sort_values(ascending=False).index
loadings2.loc[pc2_order].head(10)

Unnamed: 0,factor,pc2_loading
0,Hours_Studied,0.541572
3,Previous_Scores,0.527548
4,Tutoring_Sessions,-0.45731
1,Attendance,-0.445008
2,Sleep_Hours,0.142395
5,Physical_Activity,-0.030674
6,Exam_Score,0.003396


In [None]:
# Pair each numeric feature with its weight in the third principal direction
loadings3 = pd.DataFrame({
    "factor": numeric_features.columns,
    "pc3_loading": pca_V[2],
})
# Rank the rows by loading magnitude (largest first) but display the signed
# values; head(10) caps the display at ten rows.
pc3_order = loadings3["pc3_loading"].abs().sort_values(ascending=False).index
loadings3.loc[pc3_order].head(10)

Unnamed: 0,factor,pc3_loading
5,Physical_Activity,0.727916
2,Sleep_Hours,0.390462
4,Tutoring_Sessions,0.320221
0,Hours_Studied,0.281651
3,Previous_Scores,-0.278149
1,Attendance,-0.241149
6,Exam_Score,0.017009


In [None]:
# create the PCA-transformed dataset

# Project the standardized data onto the principal directions. The SVD was
# computed on features_scaled, so the scores must come from that same matrix;
# multiplying the raw numeric_features instead yields uncentered,
# unit-dependent values that are not the principal-component scores.
# pca_V stores the directions as rows, hence the transpose.
pca_scaled_x = features_scaled @ pca_V.T
# the product has default integer column names 0, 1, ...;
# rename them to PC1, PC2, etc. so they are easier to work with
pca_scaled_x.columns = ["PC" + str(1 + col) for col in pca_scaled_x.columns]

# look at the object
pca_scaled_x.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
0,113.080026,14.718736,-28.026803,-8.752404,3.592296,-56.672408,-19.248217
1,94.595589,13.243662,-18.779584,-7.430273,5.332006,-45.818299,-9.14612
2,129.076884,17.604536,-34.640494,-14.794682,6.732088,-70.152228,-25.489919
3,125.146251,28.59982,-32.989729,-18.273629,10.484206,-69.352305,-25.817781
4,117.031805,3.23719,-27.508261,-6.039219,0.667925,-56.13641,-18.951467


In [None]:
# plot PC1 vs PC2
# low opacity keeps overlapping points distinguishable; the hover label is the
# row's index in student_performance
px.scatter(pca_scaled_x, x="PC1", y="PC2",
           opacity=0.2,
           hover_name=student_performance.index)


# Step 2: Cumulative Variability Explained

In [None]:
# Proportion of variance per component, from the squared singular values
pc_variance = np.square(pca_d)
prop_var = pc_variance / sum(pc_variance)

# Component numbers start at 1; shares are rounded to two decimals for display
pc_numbers = 1 + np.arange(0, prop_var.shape[0])
explained = prop_var.round(2)
cumulative_explained = prop_var.cumsum().round(2)
pd.DataFrame(
    {"PC": pc_numbers,
     "variability_explained": explained,
     "cumulative_variability_explained": cumulative_explained,
     }
).head(7)

Unnamed: 0,PC,variability_explained,cumulative_variability_explained
0,1,0.25,0.25
1,2,0.15,0.4
2,3,0.15,0.55
3,4,0.15,0.69
4,5,0.14,0.83
5,6,0.14,0.97
6,7,0.03,1.0


# Step 3: Scree Plot

In [None]:
# Proportion of total variance explained by each PC: squared singular value over
# the sum of squared singular values (the same quantity as prop_var)
pca_d_explained = np.square(pca_d) / sum(np.square(pca_d))

In [None]:
# Scree plot: proportion of variance explained by each principal component.
# Components are numbered from 1 so the x axis agrees with the PC1..PCn labels
# used in the variance table and in the transformed dataset's column names
# (a bare np.arange(n) would label the first component as PC 0).
px.line(x=np.arange(1, pca_d_explained.shape[0] + 1),
        y=pca_d_explained,
        labels={"x": "PC",
                "y": "Proportion explained"})

# Step 4: Explain how you are using PCA in your project for dimensionality reduction or to learn structure in the data

We begin by filtering our dataset to include only the numerical variables, then standardize them so that each variable has mean 0 and standard deviation 1. We then apply SVD to the entire standardized dataset. Analyzing the variability explained by each PC, we see that the first principal component explains the most variability, at 25%, and the following 5 PCs each explain around 15%. Looking at the loadings for PCs 1, 2, and 3, we see significant overlap in the variables with the larger loadings. For example, tutoring sessions and attendance have sizable loadings in all 3. This tells us that PCA is not exactly "grouping" variables as we would like. Rather, every variable contributes to the PCs with varying weights. Generating our PCA-transformed dataset and plotting PC1 against PC2 reflects what we saw with the loadings. More information is stored in PC1, as it has a visibly higher spread than PC2. Our scree plot shows two elbows, one at PC 1 and another at PC 6. However, taking only PCs 1 and 2 would leave less than 50% of the variance explained, so it would make more sense to take PCs 1-6.

Overall, it seems that PCA failed to group variables together as we would have liked. We started with 7 numerical variables, and if we want at least 90% of the variability explained, we would have to keep 6 PCs. So PCA did not reduce the dimensionality in any significant way. Therefore, it may make more sense to use the original 7 variables rather than apply PCA.

