In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import prince
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.decomposition import PCA
import numpy as np
import seaborn as sns

In [None]:
# Read the input dataset from a CSV file located relative to the working directory.
INPUT_CSV = 'imputed_dataset_2.csv'
data = pd.read_csv(INPUT_CSV)

In [None]:
# List the distinct categories of the two columns that get one-hot encoded
# below, together with how many distinct values each one has.
workclass_values = data['workclass'].unique()
occupation_values = data['occupation'].unique()

print(f"Unique values in workclass: {workclass_values}: {workclass_values.size} unique values\n")
print(f"Unique values in occupation: {occupation_values}: {occupation_values.size} unique values")

**One-hot encoding (workclass and occupation)**

In [None]:
# Turn the two categorical columns into a dense indicator matrix, then wrap it
# in a DataFrame labelled with the feature names generated by the encoder.
encoder = OneHotEncoder(sparse_output=False)
categorical_features = data[['workclass', 'occupation']]
one_hot_encoded = encoder.fit_transform(categorical_features)
encoded_columns_df = pd.DataFrame(
    data=one_hot_encoded,
    columns=encoder.get_feature_names_out(),
)

In [None]:
# Pairwise correlation between the indicator columns, drawn as a heatmap.
correlation_matrix = encoded_columns_df.corr()

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, ax=ax)
ax.set_title('Correlation Matrix of One-Hot Encoded Features')
plt.show()

**Dimensionality reduction phase**<br>
Testing MCA vs. PCA

**MCA**


In [None]:
# Joint MCA over all indicator columns. one_hot=False because the frame passed
# in was already one-hot encoded above (see prince's MCA documentation).
mca = prince.MCA(n_components=23, one_hot=False)

# Fit the model and keep the row coordinates in the reduced space.
mca_coords = mca.fit_transform(encoded_columns_df)

In [None]:
# Display the per-component eigenvalue / variance summary table of the joint MCA.
mca.eigenvalues_summary

**MCA separately for workclass and occupation**<br>
Testing whether running MCA separately on the workclass features and on the occupation features improves descriptive performance.

In [None]:
# Split the indicator columns by the original feature they were derived from.
# OneHotEncoder names its outputs "<feature>_<category>", so match on that
# prefix instead of a substring that could also appear inside a category label.
workclass_cols = [col for col in encoded_columns_df.columns if col.startswith('workclass_')]
occupation_cols = [col for col in encoded_columns_df.columns if col.startswith('occupation_')]

# One MCA per original feature.
# one_hot=False is required here: the inputs are already 0/1 indicator columns.
# With prince's default (one_hot=True) every indicator column would be
# dummy-encoded a second time, which is inconsistent with the joint MCA fitted
# earlier in this notebook and distorts the reported inertia.
mca_workclass = prince.MCA(n_components=8, one_hot=False)
mca_occupation = prince.MCA(n_components=15, one_hot=False)

# Fit each model on its own subset of columns and keep the row coordinates.
workclass_transformed = mca_workclass.fit_transform(encoded_columns_df[workclass_cols])
occupation_transformed = mca_occupation.fit_transform(encoded_columns_df[occupation_cols])

# Compare the eigenvalue summaries of the two separate models.
print("Workclass Components:")
print(mca_workclass.eigenvalues_summary)
print("\nOccupation Components:")
print(mca_occupation.eigenvalues_summary)

**PCA**

In [None]:
# Number of principal components kept for the final dataset. Defined once so
# the model and the generated column names cannot drift apart.
N_PCA_COMPONENTS = 10

pca_net = PCA(n_components=N_PCA_COMPONENTS)
pca_result_net = pca_net.fit_transform(encoded_columns_df)

# Name the columns from the actual width of the result (1-based numbering).
pca_columns = [f'pca_component_{i}' for i in range(1, pca_result_net.shape[1] + 1)]

# Carry over the row index of the encoded frame so that a later column-wise
# concat lines the components up with the rows they were computed from.
pca_df = pd.DataFrame(pca_result_net, columns=pca_columns, index=encoded_columns_df.index)

In [None]:
# Fit a 22-component PCA on the indicator matrix to inspect how the variance
# is spread across components.
pca = PCA(n_components=22)
pca_result = pca.fit_transform(encoded_columns_df)

# Per-component share of variance and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

# Report at most the first 22 components, numbered from 1.
for idx in range(min(22, len(explained_variance_ratio))):
    var_pct = explained_variance_ratio[idx] * 100
    cum_pct = cumulative_variance_ratio[idx] * 100
    print(f"Component: {idx + 1}, Variance Explained: {var_pct:.2f}%, Cumulative Variance Explained: {cum_pct:.2f}%")

In [None]:
# The MCA summary stores percentages as formatted strings (e.g. "12.34%"):
# strip the sign, convert to numbers and rescale to the 0-1 range used by PCA.
eigenvalues_summary = mca.eigenvalues_summary
cumulative_variance = pd.to_numeric(eigenvalues_summary['% of variance (cumulative)'].str.rstrip('%')) / 100
pca_cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(10, 6))

# Color, marker and line style are passed as explicit keywords. The original
# combined a format string ('ro-' / 'bo-') with a linestyle keyword, which sets
# the line style twice and makes matplotlib warn that it is redundantly
# defined; the keyword won, so the rendered lines are unchanged.
ax.plot(range(1, len(eigenvalues_summary) + 1), cumulative_variance,
        color='r', marker='o', linestyle='--',
        linewidth=1, markersize=8, alpha=0.7, label='MCA')

ax.plot(range(1, len(pca_cumulative_variance) + 1), pca_cumulative_variance,
        color='b', marker='o', linestyle='-.',
        linewidth=1, markersize=8, alpha=0.7, label='PCA')

ax.set_xlabel('Number of Components', fontsize=15)
ax.set_ylabel('Cumulative Explained Variance Ratio', fontsize=15)
ax.set_title('Comparison of MCA and PCA Cumulative Explained Variance',
             fontsize=19, fontweight="bold")
ax.grid(True, alpha=0.9)
ax.legend(fontsize=12)
fig.tight_layout()
plt.show()

**Integrating component columns in the dataset**

In [None]:
# Replace the two original categorical columns with the PCA components.
# Any PCA columns already present are dropped too, and missing labels are
# ignored, so re-running this cell neither raises a KeyError on the
# already-removed columns nor appends a duplicate set of component columns.
replaced_columns = ['workclass', 'occupation', *pca_df.columns]
data = pd.concat(
    [data.drop(columns=replaced_columns, errors='ignore'), pca_df],
    axis=1,
)

In [None]:
# Persist the transformed dataset. index=False keeps the row index out of the
# file; otherwise it is written as an unnamed first column that comes back as
# a spurious "Unnamed: 0" feature when the file is loaded with pd.read_csv.
data.to_csv('dataset.csv', index=False)