In [13]:
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import statsmodels.api as sm
from sklearn.manifold import TSNE


In [14]:
# Read the preprocessed dataset from processed_data.parquet into a DataFrame
df = pd.read_parquet(path="processed_data.parquet")

## Feature Selection 

<br /> **Principal Component Analysis (PCA)**: A dimensionality reduction technique that projects a dataset onto fewer dimensions while preserving as much variance as possible. It does this by transforming the original variables into a new set of uncorrelated variables called principal components, which are linear combinations of the original variables. Requires numerical encoding!

<br /> **K-means clustering**: An unsupervised machine learning algorithm that groups data into clusters based on similarity in feature space. It partitions the data into k clusters, where each data point belongs to the cluster with the nearest mean (centroid). Requires numerical encoding!

<br /> **t-Distributed Stochastic Neighbor Embedding (t-SNE)**: A nonlinear technique for visualizing high-dimensional data in a lower-dimensional space, usually two or three dimensions. It preserves the local structure of the data, meaning that points that are similar in the high-dimensional space remain close to each other in the low-dimensional embedding. Requires numerical encoding!

<br /> **Latent Class Analysis (LCA)**: Identifies unobserved or latent subgroups within a population based on patterns of observed categorical variables. It belongs to the family of finite mixture models, where each latent class represents a distinct subgroup with its own characteristic response probabilities for the observed variables. Requires numerical encoding!

#### PCA

In [15]:
# Feature matrix: every column except the label. The intermediate DataFrame is
# kept so the loadings below are indexed by exactly the columns fed to PCA,
# regardless of where 'target' sits in df.
features_df = df.drop(columns=['target'])
X = features_df.values

# NOTE(review): PCA is variance-based, so features on larger scales dominate
# the components — confirm processed_data.parquet is already standardized.
pca = PCA(n_components=3)
pca.fit(X)

# pca.components_ has shape (n_components, n_features); transposing gives one
# row per feature and one column per principal component. Column labels are
# generated from the fitted component count so each component gets a unique name.
loading_columns = [f'PC{i + 1}_loading' for i in range(pca.n_components_)]
loadings_abs = pd.DataFrame(
    abs(pca.components_.T),
    columns=loading_columns,
    index=features_df.columns,
)

# Sum the absolute loadings across all retained principal components
loadings_abs['total_loading'] = loadings_abs.sum(axis=1)

# The 8 features with the largest combined absolute loading
top_features = loadings_abs.sort_values(by='total_loading', ascending=False).head(8).index.tolist()
top_features

['age_ind',
 'district_y',
 'crash_hour_ind',
 'posted_speed_limit_ind',
 'street_direction_ind_S',
 'street_direction_ind_N',
 'lighting_condition_ind_DAYLIGHT',
 'crash_day_of_week_ind']