In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet, Ridge, Lasso
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

from utils_milestone2 import get_numeric_non_infinite_cols, add_pca_cols, scale_train_test, pca_train_test


### Load the Data

In [2]:
# Load the pre-split train/test features and targets.
# Forward slashes are used so the relative paths resolve on Windows, Linux and
# macOS alike (a literal backslash is not a path separator on POSIX systems).
# NOTE: pd.read_pickle can execute arbitrary code while unpickling — only load
# files produced by this project's own pipeline.
X_train = pd.read_pickle("data/milestone_data_X_train.pkl")
y_train = pd.read_pickle("data/milestone_data_y_train.pkl")
X_test = pd.read_pickle("data/milestone_data_X_test.pkl")
y_test = pd.read_pickle("data/milestone_data_y_test.pkl")

In [3]:
# Load the validation features. A forward slash keeps the relative path
# portable across Windows and POSIX systems.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling — only load
# files produced by this project's own pipeline.
X_val = pd.read_pickle("data/milestone_data_X_val.pkl")

In [4]:
# Report the shape of every loaded split, one "<name> shape <tuple>" line each.
print(
    "\n".join(
        f"{split_name} shape {split.shape}"
        for split_name, split in (
            ("X_train", X_train),
            ("y_train", y_train),
            ("X_test", X_test),
            ("y_test", y_test),
        )
    )
)

X_train shape (221122, 1095)
y_train shape (221122,)
X_test shape (32257, 1095)
y_test shape (32257,)


### Data Preprocessing
Check that the training features contain no null values (an empty list below means no column has any).

In [5]:
# Boolean frame flagging every missing entry in the training features.
is_NaN = X_train.isna()
# Labels of the columns that hold at least one missing value.
col_has_NaN = X_train.columns[is_NaN.any(axis=0).to_numpy()].to_list()
col_has_NaN

[]

Keep only the numeric columns that contain no infinite values, then restrict the test set to the same columns.

In [6]:
%%time
# Select columns on the training frame only, then reuse the returned `cols`
# so the test frame ends up with the same column selection.
# NOTE(review): presumably `cols` is a list of column labels — confirm in utils_milestone2.
X_train_numeric, cols = get_numeric_non_infinite_cols(X_train)
X_test_numeric = X_test[cols]

Wall time: 1.21 s


### Normalization

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# NOTE(review): disabled inline scaling, apparently superseded by the
# scale_train_test helper called in the next cell — confirm the helper is
# equivalent (see utils_milestone2) before deleting this cell.
# %%time
# scaler = StandardScaler()
# # fit on the train data only
# scaler.fit(X_train_numeric)
# # transform train and test
# X_train_numeric_scaled = scaler.transform(X_train_numeric)
# X_test_numeric_scaled = scaler.transform(X_test_numeric)

In [9]:
%%time
# Scale the numeric train and test features with the project helper.
# NOTE(review): presumably the scaler is fit on the train split only, as in the
# commented-out cell above — confirm in utils_milestone2.
X_train_numeric_scaled, X_test_numeric_scaled = scale_train_test(X_train_numeric, X_test_numeric)

Wall time: 5.95 s


### Dimensionality Reduction

In [10]:
from sklearn.decomposition import PCA

In [11]:
# Shape of the numeric training features going into PCA.
X_train_numeric.shape

(221122, 1089)

In [12]:
# NOTE(review): disabled inline PCA, apparently superseded by the
# pca_train_test helper called in the next cell — confirm the helper is
# equivalent (see utils_milestone2) before deleting this cell.
# %%time
# pca = PCA(n_components=200, random_state=2021)
# # fit on scaled train data
# pca.fit(X_train_numeric_scaled)
# # transform scaled train and scaled test
# X_train_pca = pca.transform(X_train_numeric_scaled)
# X_test_pca = pca.transform(X_test_numeric_scaled)
# print("Total Explained", sum(pca.explained_variance_ratio_))

In [13]:
%%time
# Reduce the scaled features to 200 components with a fixed random_state.
# NOTE(review): presumably the PCA is fit on the scaled train split only and the
# helper itself prints the "Total Explained" line — confirm in utils_milestone2.
X_train_pca, X_test_pca = pca_train_test(X_train_numeric_scaled, X_test_numeric_scaled, num_components=200, random_state=2021)

Total Explained 0.9995072361165935
Wall time: 21.2 s


In [14]:
# Size of the second dimension of the PCA output for the training data.
X_train_pca.shape[1]

200

In [15]:
# Re-display the numeric training feature shape, for comparison with the
# combined frame built in the next cell.
X_train_numeric.shape

(221122, 1089)

In [16]:
%%time
# Combine each split's numeric features with its PCA output via add_pca_cols,
# train first and then test.
X_train_numeric_plus_pca, X_test_numeric_plus_pca = (
    add_pca_cols(numeric_part, pca_part)
    for numeric_part, pca_part in (
        (X_train_numeric, X_train_pca),
        (X_test_numeric, X_test_pca),
    )
)

Wall time: 1.23 s


In [17]:
# Shape of the combined training frame returned by add_pca_cols.
X_train_numeric_plus_pca.shape

(221122, 1289)