# Reducing the dimensionnality of the datae

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sys
import os

# Dynamically add the 'utils' folder to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../source/')))

# Now, you can import the functions
from utils.pca import pca


In [2]:
df = pd.read_csv("../../data/cleaned_data/numeric_covariates.csv")

In [3]:
# Identify non-numeric columns
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
print(f"Non-numeric columns: {non_numeric_cols}")

# Drop non-numeric columns
data = df.drop(columns=non_numeric_cols)

Non-numeric columns: Index([], dtype='object')


In [4]:
# Verify if their are nan values 
print(data.isna().any(axis=1))

# Handle NaNs (e.g., replace with mean)
data = data.fillna(data.mean())

# Handle NaNs (e.g., replace with mean)
print(data.isna().any(axis=1))

0        True
1       False
2       False
3       False
4       False
        ...  
2126    False
2127    False
2128    False
2129    False
2130    False
Length: 2131, dtype: bool
0       False
1       False
2       False
3       False
4       False
        ...  
2126    False
2127    False
2128    False
2129    False
2130    False
Length: 2131, dtype: bool


In [5]:
X = df.to_numpy()
X

array([[0.0000e+00, 1.9980e+03, 4.4400e+02, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.0000e+00, 1.2859e+04, 1.8840e+03, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0000e+00, 5.6780e+03, 1.1970e+03, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [2.1280e+03, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.1290e+03, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.1300e+03, 7.6700e+02, 9.7000e+01, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

In [6]:
transformed_data, top_eigenvectors, explained_variance = pca(X, n_components=4)


Centered data : [[-1.06500000e+03             nan             nan ...  3.52885969e-01
  -5.67808541e-02 -7.03894885e-03]
 [-1.06400000e+03             nan             nan ...  3.52885969e-01
  -5.67808541e-02 -7.03894885e-03]
 [-1.06300000e+03             nan             nan ...  3.52885969e-01
  -5.67808541e-02 -7.03894885e-03]
 ...
 [ 1.06300000e+03             nan             nan ... -6.47114031e-01
  -5.67808541e-02 -7.03894885e-03]
 [ 1.06400000e+03             nan             nan ... -6.47114031e-01
  -5.67808541e-02 -7.03894885e-03]
 [ 1.06500000e+03             nan             nan ... -6.47114031e-01
  -5.67808541e-02 -7.03894885e-03]]
Covariance matrix : [[ 3.78607667e+05             nan             nan ... -1.71347887e+02
   2.39732394e+01  6.78403756e-01]
 [            nan             nan             nan ...             nan
              nan             nan]
 [            nan             nan             nan ...             nan
              nan             nan]
 ...
 [-1.713

In [7]:
transformed_data

array([[nan, nan, nan, nan],
       [nan, nan, nan, nan],
       [nan, nan, nan, nan],
       ...,
       [nan, nan, nan, nan],
       [nan, nan, nan, nan],
       [nan, nan, nan, nan]])