In [185]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [186]:
df = pd.read_excel('property.xlsx')
df

Unnamed: 0,price,rooms,bathrooms,age
0,35,4,2,1
1,110,12,6,2
2,5,2,1,3
3,60,8,4,4
4,75,10,5,5
5,175,20,9,5
6,15,2,1,1
7,10,4,2,6
8,5,8,4,15
9,50,10,6,10


In [187]:
X = df.drop(columns=['price'])
X['const'] = 1
X

Unnamed: 0,rooms,bathrooms,age,const
0,4,2,1,1
1,12,6,2,1
2,2,1,3,1
3,8,4,4,1
4,10,5,5,1
5,20,9,5,1
6,2,1,1,1
7,4,2,6,1
8,8,4,15,1
9,10,6,10,1


## Calculate VIF for each feature

<img src='other/vif.png'>

In [188]:
# Define the vif function
def vif(X:pd.DataFrame)->pd.DataFrame:
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

In [189]:
results = vif(X=X)
results

Unnamed: 0,Feature,VIF
0,rooms,42.723454
1,bathrooms,44.148326
2,age,1.258131
3,const,4.394436


## Remove multicolinearity - by removing the feature

In [190]:
X_new = X.drop(columns=['bathrooms']).copy()
X_new

Unnamed: 0,rooms,age,const
0,4,1,1
1,12,2,1
2,2,3,1
3,8,4,1
4,10,5,1
5,20,5,1
6,2,1,1
7,4,6,1
8,8,15,1
9,10,10,1


In [191]:
results = vif(X=X_new)
results

Unnamed: 0,Feature,VIF
0,rooms,1.047487
1,age,1.047487
2,const,4.25636


## Remove multicolinearity - using PCA

In [192]:
def pca(X: pd.DataFrame, n_components: int):
    # Standardize the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(X)

    # Apply PCA
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(scaled_data)  # Fit and transform at once

    # Get the explained variance ratio
    explained_variance_ratio = pca.explained_variance_ratio_

    # Display results
    print(f"Explained Variance Ratio: \n{explained_variance_ratio}")

In [193]:
pca_result = pca(X=X, n_components=1)

Explained Variance Ratio: 
[0.69849269]


In [194]:
pca(X=X, n_components=2)

Explained Variance Ratio: 
[0.69849269 0.2976552 ]


In [195]:
pca(X=X, n_components=3)

Explained Variance Ratio: 
[0.69849269 0.2976552  0.0038521 ]


In [196]:
'''
We see that with:
    - 1 component we explain ≈ 70%
    - 2 components we explain ≈ 70% and ≈ 30% 
    - 3 components we explain ≈ 70% and ≈ 30% and ≈ 0.3%  => one component is negligible!!!
'''

'\nWe see that with:\n    - 1 component we explain ≈ 70%\n    - 2 components we explain ≈ 70% and ≈ 30% \n    - 3 components we explain ≈ 70% and ≈ 30% and ≈ 0.3%  => one component is negligible!!!\n'

## Remove multicolinearity - using PCA without library

In [197]:
n_components=2

#### #1 Standardize data

<img src='other/stan.png'>

In [198]:
# Mean center the data
mean = np.mean(X, axis=0)
standardized_data = X - mean  # Subtract the mean from each column

#### #2 Compute covariance matrix

In [199]:
''' 
The fact that we transpose the matrix the rows become the columns = all combinations of products are captured!!!
'''

' \nThe fact that we transpose the matrix the rows become the columns = all combinations of products are captured!!!\n'

<img src='other/covariance.png'>

<img src='other/cov.png'>

In [None]:
# Calculate the covariance matrix
cov_matrix = np.cov(standardized_data.T)  # Covariance matrix
cov_matrix

array([[30.22222222, 14.        ,  5.11111111,  0.        ],
       [14.        ,  6.66666667,  3.11111111,  0.        ],
       [ 5.11111111,  3.11111111, 19.06666667,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

#### #3 Compute eigenvalues, eigenvectors

<img src='other/eigen.png'>

<img src='other/char_eq.png'>

In [201]:
# Calculate eigenvalues and eigenvectors of the covariance matrix
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

In [202]:
eigenvalues

array([38.55527611,  0.1247185 , 17.27556094,  0.        ])

In [203]:
eigenvectors

array([[-0.86492488, -0.4163175 , -0.2803296 ,  0.        ],
       [-0.40821478,  0.90847118, -0.08967061,  0.        ],
       [-0.29200281, -0.03687634,  0.95570628,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ]])

#### #4 Sort eigenvalues λ and eigenvectors

In [204]:
''' 
When we perform PCA, we are effectively slicing away the weakest covariance combinations.
The ones that contribute the least to the overall variance in the dataset.
'''

' \nWhen we perform PCA, we are effectively slicing away the weakest covariance combinations.\nThe ones that contribute the least to the overall variance in the dataset.\n'

<img src='other/slice.png'>

In [205]:
# Sort the eigenvalues in descending order and sort eigenvectors accordingly
sorted_indices = np.argsort(eigenvalues)[::-1]  # Sort eigenvalues in descending order
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

In [206]:
sorted_indices

array([0, 2, 1, 3])

In [207]:
# Sorted eigenvalues from the highest values
sorted_eigenvalues

array([38.55527611, 17.27556094,  0.1247185 ,  0.        ])

In [208]:
# Sorted columns in the matrix based on indices of eigenvalues
sorted_eigenvectors

array([[-0.86492488, -0.2803296 , -0.4163175 ,  0.        ],
       [-0.40821478, -0.08967061,  0.90847118,  0.        ],
       [-0.29200281,  0.95570628, -0.03687634,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ]])

In [209]:
sorted_eigenvalues_sliced = sorted_eigenvalues[:n_components]
sorted_eigenvalues_sliced

array([38.55527611, 17.27556094])

#### #5 Explained variance 

<img src='other/variance.png'>

In [210]:
# Explained Variance Ratio
explained_variance_ratio = sorted_eigenvalues_sliced/ np.sum(sorted_eigenvalues)

# Display Results
print(f"Explained Variance Ratio: \n{explained_variance_ratio}")

Explained Variance Ratio: 
[0.68903393 0.30873719]
