In [1]:
# ============================================================
# PCA on California Housing Dataset (Google Colab Version)
# ============================================================

# Step 1: Upload the dataset to Google Colab
#from google.colab import files
import io
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# ------------------------------------------
# Upload CSV file from your computer
# ------------------------------------------
#uploaded = files.upload()
#filename = list(uploaded.keys())[0]  # get uploaded filename
#data = pd.read_csv(io.BytesIO(uploaded[filename]))

#print("File uploaded successfully!")
#print(f"Dataset shape: {data.shape}")
#print("Columns:", list(data.columns))

NameError: name 'files' is not defined

In [None]:
# ----------------------------------------------
# Step 2: Handle missing values
# ----------------------------------------------
print("\nChecking for missing values...")
print(data.isnull().sum())

# Fill missing numeric values with the mean of their column
data = data.fillna(data.mean(numeric_only=True))

print("\nMissing values handled (filled with column means).")

In [None]:
# ----------------------------------------------
# Step 3: Handle categorical data (ocean_proximity)
# ----------------------------------------------
# Convert the 'ocean_proximity' column (text) into numeric form
# using one-hot encoding (it creates separate columns for each category)
data_encoded = pd.get_dummies(data, columns=['ocean_proximity'], drop_first=True)

# Use the following 2 lines if you want to see 0 and 1 in the DataFrame instead of False/True. Convert only dummy columns to int
dummy_cols = data_encoded.columns[data_encoded.dtypes == 'bool']
data_encoded[dummy_cols] = data_encoded[dummy_cols].astype(int)

# Check dataframe now
data_encoded.head()

In [None]:
# ----------------------------------------------
# Step 4: Standardize numeric columns (Standardization)
# ----------------------------------------------
# Subtract mean and divide by standard deviation
data_scaled_df = (data_encoded - data_encoded.mean()) / data_encoded.std()

# Check the result
data_scaled_df.head()

In [None]:
# ----------------------------------------------
# Step 5: Split into train and test subsets
# ----------------------------------------------
train_data, test_data = train_test_split(data_scaled_df, test_size=0.2, random_state=42)

print(f"\nTraining data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")


In [None]:
# ------------------------------------------
# Step 6: Perform PCA
# ------------------------------------------
pca = PCA()
pca.fit(train_data)

# Transform both train and test sets
train_pca = pca.transform(train_data)
test_pca = pca.transform(test_data)


In [None]:
# ------------------------------------------
# Step 7: Show explained variance
# ------------------------------------------
explained_variance = pca.explained_variance_ratio_
cum_explained_variance = np.cumsum(explained_variance)

print("\nExplained Variance Ratio (first 10 PCs):")
for i, var in enumerate(explained_variance[:13]):
    print(f"PC{i+1}: {var:.4f}")

In [None]:
# ------------------------------------------
# Step 8: Scree Plot
# ------------------------------------------
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(explained_variance)+1), cum_explained_variance, 'bo-', linewidth=2)
plt.title('Scree Plot (Cumulative Explained Variance)')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# ------------------------------------------
# Step 9: Choose top components (optional)
# ------------------------------------------
# You can select components that explain, e.g., 95% variance:
n_components_95 = np.argmax(cum_explained_variance >= 0.95) + 1
print(f"\n Number of components explaining 95% variance: {n_components_95}")

In [None]:
# ------------------------------------------
# Step 10: Save PCA Loadings with Attribute Names (sorted)

# Loadings = eigenvectors * sqrt(eigenvalues)
# They show how strongly each feature influences each principal component.
# ------------------------------------------
# Create loadings DataFrame
loadings_df = pd.DataFrame(
    pca.components_[:n_components_95].T,
    columns=[f'PC{i+1}' for i in range(n_components_95)],
    index=train_data.columns
)

# Reset index to show attribute names as a column
loadings_df.reset_index(inplace=True)
loadings_df.rename(columns={'index': 'Attribute'}, inplace=True)

# Sort by absolute loading strength for the first principal component (PC1) — or change to any PC
sorted_loadings = loadings_df.reindex(
    loadings_df['PC1'].abs().sort_values(ascending=False).index
)

# Save both unsorted and sorted loadings
loadings_df.to_csv('pca_loadings_unsorted.csv', index=False)
sorted_loadings.to_csv('pca_loadings_sorted_PC1.csv', index=False)

# Download from Colab
from google.colab import files
files.download('pca_loadings_sorted_PC1.csv')

print("\nPCA loadings sorted by strongest contributors to PC1 and saved successfully!")

In [None]:
# STEP 11: Model Evaluation
# Compute variance of each principal component in test data

# axis = 0 means, compute variance column-wise i.e., compute variance of each PC across all observations
test_pc_variance = np.var(test_pca, axis=0)

# Compute fraction of total variance explained
test_explained_variance_ratio = test_pc_variance / np.sum(np.var(test_data, axis=0))
test_cum_variance = np.cumsum(test_explained_variance_ratio)

#  Display results
explained_df = pd.DataFrame({
    'PC': [f'PC{i+1}' for i in range(len(test_explained_variance_ratio))],
    'Explained_Variance_Ratio': test_explained_variance_ratio,
    'Cumulative_Explained_Variance': test_cum_variance
})

print("\n PCA Explained Variance on Test Data:")
print(explained_df.head(10))  # show first 10 PCs

# Optional: Scree plot


plt.figure(figsize=(8,5))
plt.plot(range(1, len(test_cum_variance)+1), test_cum_variance, 'ro-', linewidth=2)
plt.title('Scree Plot (Cumulative Explained Variance) - Test Data')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()