In [1]:
from helpers import load_csv_data
from implementations import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Folder holding the CSV inputs (x_train.csv, y_train.csv, x_test.csv) read by the cells below.
data_folder = './data/'

# Loading data from CSV files

In [5]:
# Load the feature matrices, labels, sample ids and feature metadata through
# the project's CSV helper.
# NOTE(review): sub_sample=True presumably loads only a reduced subset of the
# rows -- confirm in helpers.load_csv_data.
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
    feature_names,
    default_values,
) = load_csv_data(data_folder, sub_sample=True)

# Report the dimensions of what was loaded (rows per split, columns per row).
print(
    "Number of training samples: ", len(x_train),
    "\nNumber of test samples: ", len(x_test),
    "\nNumber of features: ", x_train.shape[1],
)

Number of training samples:  500 
Number of test samples:  500 
Number of features:  321


In [3]:
# Pandas version of the data loading.
# The 'Id' column is dropped right after each file is read, so the frames
# hold only the remaining columns.
df_x_train = pd.read_csv(data_folder + 'x_train.csv').drop(columns=['Id'])
df_y_train = pd.read_csv(data_folder + 'y_train.csv').drop(columns=['Id'])
df_x_test = pd.read_csv(data_folder + 'x_test.csv').drop(columns=['Id'])

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly: wrapping it in print() only appended a stray "None"
# after each summary.
df_x_train.info()
df_x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328135 entries, 0 to 328134
Columns: 321 entries, _STATE to _AIDTST3
dtypes: float64(244), int64(77)
memory usage: 803.6 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109379 entries, 0 to 109378
Columns: 321 entries, _STATE to _AIDTST3
dtypes: float64(237), int64(84)
memory usage: 267.9 MB
None


# Preprocessing

## Replace default values in the dataset with NaN

In [None]:
# Boolean selector for the _PRACE1 column, reused to show two samples before
# and after the replacement.
prace_selector = feature_names == '_PRACE1'

print("Default values for _PRACE1:", default_values['_PRACE1'])
print("Value of _PRACE1 for sample 9:", x_train[9, prace_selector])
print("Value of _PRACE1 for sample 202:", x_train[202, prace_selector])

# For every feature, overwrite each of its listed default values with NaN in
# both the training and the test matrix (in place).
for col_idx, name in enumerate(feature_names):
    for sentinel in default_values[name]:
        for matrix in (x_train, x_test):
            rows_with_default = matrix[:, col_idx] == sentinel
            matrix[rows_with_default, col_idx] = np.nan

print("Value of _PRACE1 for sample 9:", x_train[9, prace_selector])
print("Value of _PRACE1 for sample 202:", x_train[202, prace_selector])

In [None]:
# With Pandas: replace each feature's default values with NaN in both frames.
print("Value of _PRACE1 for sample 9:", df_x_train.loc[9, '_PRACE1'])
print("Value of _PRACE1 for sample 202:", df_x_train.loc[202, '_PRACE1'])

for feature in feature_names:
    # Materialize the defaults once so they can be passed to isin() as a list.
    defaults = list(default_values[feature])
    if not defaults:
        # Nothing to replace for this feature.
        continue
    for frame in (df_x_train, df_x_test):
        column = frame[feature]
        # mask() returns a new column with NaN wherever the value is one of the
        # defaults, upcasting integer columns as needed. Writing NaN into an
        # int64 column through .loc is an incompatible-dtype assignment that
        # recent pandas versions warn about, so the whole column is replaced.
        frame[feature] = column.mask(column.isin(defaults))

print("Value of _PRACE1 for sample 9:", df_x_train.loc[9, '_PRACE1'])
print("Value of _PRACE1 for sample 202:", df_x_train.loc[202, '_PRACE1'])

In [None]:
# Rescale both matrices with the project's min_max_normalize helper (brought in
# by `from implementations import *`).
# NOTE(review): the result overwrites x_train / x_test, so re-running this cell
# feeds already-normalized data back into the helper -- confirm that is harmless.
# NOTE(review): x_train / x_test may contain NaN at this point (default values
# were replaced above) -- confirm min_max_normalize handles NaN as intended.
x_train, x_test = min_max_normalize(x_train, x_test)

In [None]:
# Pairwise feature correlations computed by pandas on the training frame,
# drawn as a heatmap with seaborn.
corr_matrix = df_x_train.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    fmt=".2f",
    cbar=True,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# List every pair of distinct features whose absolute correlation exceeds
# `threshold`. Pairs are stored as (row, column) positions in corr_matrix.
threshold = 0.9
high_corr_pairs = []

# corr_matrix is a DataFrame, so corr_matrix[i, j] would be a label lookup with
# a tuple key and raise KeyError. Work on the underlying array for positional
# access, and take names from the matrix's own columns so positions and names
# always refer to the same feature.
corr_values = corr_matrix.to_numpy()
corr_names = corr_matrix.columns
n_features = corr_values.shape[0]

for i in range(n_features):
    # Start at i + 1: the matrix is symmetric and the diagonal is trivially 1.
    for j in range(i + 1, n_features):
        # NaN correlations compare as False here and are therefore skipped.
        if abs(corr_values[i, j]) > threshold:
            high_corr_pairs.append((i, j))

            print(f"Features {corr_names[i]} and {corr_names[j]} have correlation {corr_values[i, j]:.2f}")
    

# Testing the algorithms on a linear regression between two features

In [None]:
# We observe that features 1 and 2 are correlated
# Let's test our algorithms on these two features only

# Explicit import of the three solvers compared below (numpy and matplotlib are
# already imported in the first cell).
from implementations import mean_squared_error_gd, mean_squared_error_sgd, least_squares

# Feature 1 is the regressor, feature 2 the target of this sanity check.
feature_x = x_train[:, 1]
feature_y = x_train[:, 2]

# Keep only rows where both values are finite. An earlier cell replaces default
# values with NaN, and any NaN left in these columns would propagate through
# tx @ w and make the fitted weights NaN. When no NaN is present this filter
# keeps every row.
valid_rows = np.isfinite(feature_x) & np.isfinite(feature_y)
feature_x = feature_x[valid_rows]
feature_y = feature_y[valid_rows]

# Design matrix with a bias column of ones and the regressor as second column.
tx = np.ones((feature_x.shape[0], 2))
tx[:, 1] = feature_x

# Fit the same model with the three solvers, starting GD/SGD from zero weights.
# NOTE(review): SGD is stochastic and no seed is set here, so w_sgd may differ
# between runs -- seed the generator used by mean_squared_error_sgd if
# reproducibility matters.
w_gd, _ = mean_squared_error_gd(feature_y, tx, np.array([0., 0.]), max_iters=1000, gamma=0.1)
w_sgd, _ = mean_squared_error_sgd(feature_y, tx, np.array([0., 0.]), max_iters=10000, gamma=0.1)
w_ls, _ = least_squares(feature_y, tx)

print("Weights from GD: ", w_gd)
print("Weights from SGD: ", w_sgd)
print("Weights from LS: ", w_ls)

# Data points plus the three fitted lines; LS is dashed so it stays visible
# when it overlaps the GD line.
plt.scatter(feature_x, feature_y, alpha=0.2)
plt.plot(feature_x, tx @ w_gd, label='GD', color='orange')
plt.plot(feature_x, tx @ w_sgd, label='SGD', color='green')
plt.plot(feature_x, tx @ w_ls, label='LS', color='red', linestyle='dashed')
plt.xlabel(str(feature_names[1]))
plt.ylabel(str(feature_names[2]))
plt.title('Linear fit of feature 2 on feature 1')
plt.legend()
plt.show()