In [None]:
# Step 1: Import Required Libraries
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 2: Upload the ZIP file
# files.upload() opens the Colab upload widget; the keys of the mapping it
# returns are used below as paths to the uploaded files.
print("📂 Please upload your dataset ZIP file (e.g., archive.zip)")
uploaded = files.upload()

# Step 3: Extract ZIP file
# Create the target folder up front so the directory listing at the end
# cannot fail with FileNotFoundError when nothing was extracted.
extract_dir = "letter_data"
os.makedirs(extract_dir, exist_ok=True)

for zip_filename in uploaded:
    # Skip stray non-ZIP uploads with a message instead of crashing with
    # zipfile.BadZipFile halfway through the loop.
    if not zipfile.is_zipfile(zip_filename):
        print(f"Skipping {zip_filename}: not a valid ZIP archive.")
        continue
    print(f"Extracting {zip_filename}...")
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

print("\n✅ Files inside extracted folder:")
print(os.listdir(extract_dir))

# Step 4: Define Column Names
# One label column ("letter") followed by 16 feature columns.
col_names = [
    "letter", "x-box", "y-box", "width", "high", "onpix",
    "x-bar", "y-bar", "x2bar", "y2bar", "xybar", "x2ybr",
    "xy2br", "x-ege", "xegvy", "y-ege", "yegvx",
]

# Step 5: Load Dataset
# The CSV carries its own header row. header=0 makes pandas consume that row
# as the header and names= replaces it with the labels above, so the header
# never ends up as a data row and the feature columns are parsed as numbers
# instead of strings. The resulting index starts at 0.
df = pd.read_csv(
    "letter_data/letter-recognition.csv",
    header=0,
    names=col_names,
)

print("\n✅ Dataset Loaded Successfully")
print("Shape of dataset:", df.shape)
print("\nSample Data:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line.
df.info()

# Step 6: Separate features and labels
X = df.drop("letter", axis=1)
y = df["letter"]

# Convert every feature column to numeric in one pass; values that cannot be
# parsed become NaN (errors="coerce").
X = X.apply(pd.to_numeric, errors="coerce")

# The PCA steps further down reject NaN input, so rows that failed the
# conversion (or were missing to begin with) are removed here, from X and y
# together so features and labels stay aligned.
valid_rows = X.notna().all(axis=1)
if not valid_rows.all():
    n_dropped = int((~valid_rows).sum())
    print(f"Dropping {n_dropped} row(s) with missing or non-numeric feature values.")
    X = X.loc[valid_rows]
    y = y.loc[valid_rows]

# Step 7: Standardize the data
# Fit the scaler on the features and transform them in one call.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 8: Correlation Matrix BEFORE PCA
# Wrap the standardized array in a DataFrame so the heatmap axes are labelled
# with the feature column names, then compute pairwise correlations.
scaled_frame = pd.DataFrame(X_scaled, columns=X.columns)
corr_before = scaled_frame.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_before, cmap="coolwarm", annot=False)
plt.title("Feature Correlation BEFORE PCA")
plt.show()

# Step 9: Apply PCA
# PCA is built with its default arguments (no explicit n_components) and is
# fitted and applied to the standardized features in a single call.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Step 10: Correlation Matrix AFTER PCA
# Pairwise correlations between the transformed (principal-component) columns.
corr_after = pd.DataFrame(X_pca).corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_after, cmap="coolwarm", annot=False)
plt.title("Feature Correlation AFTER PCA")
plt.show()

# Step 11: Explained Variance Visualization
# Per-component share of the variance and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# x-axis: 1-based number of components kept.
component_counts = range(1, len(explained_variance) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("Explained Variance by PCA Components")
plt.grid()
plt.show()

# Step 12: Find number of components for 95% variance
# Smallest 1-based component count whose cumulative variance reaches the
# threshold; None (and no message) if the threshold is never reached.
variance_threshold = 0.95
n_components_needed = next(
    (
        count
        for count, total in enumerate(cumulative_variance, start=1)
        if total >= variance_threshold
    ),
    None,
)
if n_components_needed is not None:
    print(
        f"\n✅ {n_components_needed} components are sufficient to explain "
        f"{variance_threshold:.0%} of the variance."
    )

# Step 13: Reduce to 2D using PCA
# A separate 2-component PCA is fitted on the standardized features.
pca_2d = PCA(n_components=2)
X_reduced = pca_2d.fit_transform(X_scaled)

# Step 14: 2D Scatter Plot
# Points are colored by their label in y. "tab20" offers only 20 distinct
# colors, so with more classes than that (presumably 26 letters here) the
# palette repeats and several classes become indistinguishable. "husl" lets
# seaborn generate one evenly spaced color per class, whatever the class count.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=X_reduced[:, 0],
    y=X_reduced[:, 1],
    hue=y,
    palette="husl",
    s=30,
    legend=False,
)
plt.title("Dataset Reduced to 2D using PCA")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()

# Step 15: Print Explained Variance
# Emit the per-component ratios first, then their running total.
for heading, ratios in (
    ("\nExplained variance ratio per component:\n", explained_variance),
    ("\nCumulative variance ratio:\n", cumulative_variance),
):
    print(heading, ratios)