In [1]:
import itertools

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

In [2]:
import warnings
warnings.filterwarnings('ignore')

### 1. Loading the train and test data

In [3]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [4]:
X_train = df_train.drop(['TARGET'], axis = 1)
y_train = df_train['TARGET']

In [5]:
X_train.shape, df_test.shape

((76020, 370), (75818, 370))

##### We notice that there are a huge number of features, hence, we proceed with Feature Selection to choose only the ones that contribute to the need.

### 2. Feature Selection

With the dataset with so many features, we might run into problems such as:

<ol>
<li><b>Constant features:</b></li>
<ul>
    <li>Features that show the same value, one value, for all the observations of the dataset.</li>
    <li>They provide no information that allows a machine learning model to predict the target.</li>
    <li>To identify constant features, we are using VarianceThreshold from sklearn and remove them.</li>
</ul>
    
<h6></h6>
    
<li><b>Quasi-Constant features:</b></li>
<ul>
    <li>Quasi-constant features have the same value for the majority of observations in a dataset. </li>
    <li>These types of features typically do not provide much information for machine learning models to predict or classify a target.</li>
    <li>To identify quasi-constant features, we are again going to use VarianceThreshold from sklearn and remove them.</li>
</ul>

In [6]:
#Fitting the VarianceThreshold

vthres = VarianceThreshold(threshold = 0.01)
vthres.fit(X_train)

In [7]:
#Number of features that are non-constant

print("Originial Features: ",X_train.shape[1])
print("Non-Constant Features: ", sum(vthres.get_support()))

Originial Features:  370
Non-Constant Features:  273


In [8]:
# printing the constant features

print(len([
        x for x in X_train.columns
        if x not in X_train.columns[vthres.get_support()]
    ]))

[x for x in X_train.columns if x not in X_train.columns[vthres.get_support()]]

97


['ind_var1',
 'ind_var2_0',
 'ind_var2',
 'ind_var6_0',
 'ind_var6',
 'ind_var13_largo',
 'ind_var13_medio_0',
 'ind_var13_medio',
 'ind_var14',
 'ind_var17_0',
 'ind_var17',
 'ind_var18_0',
 'ind_var18',
 'ind_var19',
 'ind_var20_0',
 'ind_var20',
 'ind_var27_0',
 'ind_var28_0',
 'ind_var28',
 'ind_var27',
 'ind_var29_0',
 'ind_var29',
 'ind_var30_0',
 'ind_var31_0',
 'ind_var31',
 'ind_var32_cte',
 'ind_var32_0',
 'ind_var32',
 'ind_var33_0',
 'ind_var33',
 'ind_var34_0',
 'ind_var34',
 'ind_var40',
 'ind_var41',
 'ind_var39',
 'ind_var44_0',
 'ind_var44',
 'ind_var46_0',
 'ind_var46',
 'num_var6_0',
 'num_var6',
 'num_var13_medio_0',
 'num_var13_medio',
 'num_var18_0',
 'num_var18',
 'num_var27_0',
 'num_var28_0',
 'num_var28',
 'num_var27',
 'num_var29_0',
 'num_var29',
 'num_var33',
 'num_var34_0',
 'num_var34',
 'num_var41',
 'num_var46_0',
 'num_var46',
 'saldo_var28',
 'saldo_var27',
 'saldo_var41',
 'saldo_var46',
 'imp_amort_var18_hace3',
 'imp_amort_var34_hace3',
 'imp_reemb

> We can see the above 34 columns have constant value hence we are going to get rid of them, as they make no contributions.

In [9]:
# Dropping the columns from train and test sets

X_train = vthres.transform(X_train)
df_test = vthres.transform(df_test)

In [10]:
# Checking the updated shape of train and test data

X_train.shape, df_test.shape

((76020, 273), (75818, 273))

<span style = 'font-size:16px; font-family:TimesNewRoman'> By removing constant and quasi-constant features, we reduced the feature space from 370 to 273. We can see that 97 features were removed from the present dataset.</span>

### PCA

PCA stands for Principal Component Analysis. It is a statistical technique used to reduce the dimensionality of a dataset while retaining as much of the variation in the data as possible.

PCA works by transforming a dataset consisting of many variables into a smaller set of variables called principal components. The principal components are linear combinations of the original variables that capture the maximum amount of variation in the data.

The first principal component captures the most variation in the data, the second principal component captures the second most variation, and so on. By retaining only the top principal components, PCA can reduce the number of variables in the dataset while still preserving most of the information.

PCA is often used in data preprocessing, data visualization, and machine learning. It can be used to identify patterns and relationships in the data, to reduce noise, and to create more parsimonious models.

In [61]:
# Create a PCA object and fit your data
pca = PCA(whiten = True, svd_solver='auto', iterated_power='auto', n_oversamples=10, power_iteration_normalizer='auto', random_state=None).fit(X_train)

# Get the explained variance ratio for each component
variance_ratio = pca.explained_variance_ratio_
#print(variance_ratio)

# Get the cumulative sum of explained variance ratio
cumulative_variance_ratio = np.cumsum(variance_ratio)
#print(cumulative_variance_ratio)

# Find the number of components that explain 95% of the variance
n_components = np.argmax(cumulative_variance_ratio >= 0.95)

# Transform the data to the selected number of components
X_transformed = PCA(n_components=n_components).fit_transform(X_train)

# Print the selected number of components
print("Number of components selected:", n_components)

Number of components selected: 5
