In [1]:
from helpers import *
from implementations import *
from preprocessing import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Folder that contains the dataset files.
# Each collaborator can point the notebook at their own copy by setting the
# MILALYON_DATA_FOLDER environment variable (for example './data/') instead of
# editing this cell. The fallback is the path this notebook was last run with.
import os

data_folder = (
    os.environ.get("MILALYON_DATA_FOLDER")
    or "C:/Users/plane/OneDrive/Bureau/MilaLyon/MilaLyon/data/"
)
# Every path used so far ends with a separator, so keep that invariant for
# overridden values too.
# NOTE(review): presumably load_csv_data appends file names directly to this string — confirm.
if not data_folder.endswith(("/", "\\")):
    data_folder += "/"

In [3]:
# Load the dataset as a dictionary, together with the zero_values and
# default_values objects the loader returns alongside it.
# NOTE(review): max_rows=10000 presumably caps the number of rows read — confirm in helpers.
csv_data, zero_values, default_values = load_csv_data(
    data_folder,
    max_rows=10000,
    dictionnary=True,
)

# Clean the raw feature matrices before any preprocessing: default values are
# handled first, then the zero_values entries.
# NOTE(review): both return values are discarded, so these helpers presumably
# modify x_train / x_test in place — confirm.
replace_default_with_nan(
    csv_data['x_train'],
    csv_data['x_test'],
    csv_data['feature_names'],
    default_values,
)
replace_by_zero(
    csv_data['x_train'],
    csv_data['x_test'],
    zero_values,
)

def print_shapes(data):
    """Print one summary line per entry of ``data``.

    Each line shows the key, the type of the value and, for ``np.ndarray``
    values, the array shape. Any other value is reported with shape 'N/A'.

    Args:
        data: mapping whose ``items()`` are iterated in order.
    """
    for name, entry in data.items():
        if isinstance(entry, np.ndarray):
            shape_text = entry.shape
        else:
            shape_text = 'N/A'
        print(f"{name}: {type(entry)} with shape {shape_text}")
# Show the type and array shape of every entry of the freshly loaded csv_data.
print_shapes(csv_data)


x_train: <class 'numpy.ndarray'> with shape (10000, 321)
x_test: <class 'numpy.ndarray'> with shape (10000, 321)
y_train: <class 'numpy.ndarray'> with shape (10000,)
train_ids: <class 'numpy.ndarray'> with shape (10000,)
test_ids: <class 'numpy.ndarray'> with shape (10000,)
feature_names: <class 'numpy.ndarray'> with shape (321,)
useless: <class 'numpy.ndarray'> with shape (321,)
health_related: <class 'numpy.ndarray'> with shape (321,)
better_elsewhere: <class 'numpy.ndarray'> with shape (321,)
bad_format_no_better: <class 'numpy.ndarray'> with shape (321,)
binary: <class 'numpy.ndarray'> with shape (321,)
one_hot: <class 'numpy.ndarray'> with shape (321,)


In [4]:
def preprocess_data(data, nan_drop_threshold=0.2, correlation_threshold=0.02, n_std=3, only_health_related=True):
    """Run the full preprocessing pipeline on ``data``, modifying it in place.

    Steps, in order:
      1. drop the features reported by ``identify_too_many_missing``;
      2. fill the remaining NaNs with ``replace_nan``;
      3. optionally drop every feature whose ``health_related`` flag is False;
      4. one-hot encode with ``one_hot_encode``;
      5. drop the features reported by ``identify_low_correlation``;
      6. clip outliers with ``clip_outliers``;
      7. rescale ``x_train`` / ``x_test`` with ``min_max_normalize``.

    A short progress message is printed after steps 1, 3, 4 and 5.

    Args:
        data: dictionary with at least the keys 'x_train', 'x_test',
            'y_train', 'feature_names' and 'health_related'.
        nan_drop_threshold: forwarded as ``threshold`` to
            ``identify_too_many_missing``.
        correlation_threshold: forwarded as ``threshold`` to
            ``identify_low_correlation``.
        n_std: forwarded to ``clip_outliers``.
        only_health_related: when true, step 3 is executed.

    Returns:
        None. The result is stored back into ``data``.
    """
    # Step 1: remove features with too many missing values.
    sparse_columns = identify_too_many_missing(
        data["x_train"],
        data["feature_names"],
        threshold=nan_drop_threshold,
    )
    drop_features_from_dictionnary(data, sparse_columns)
    print(len(sparse_columns), "features with too many missing values dropped.")

    # Step 2: fill whatever NaNs are left (mean or most frequent value).
    replace_nan(data["x_train"], data["x_test"])

    # Step 3: optionally restrict to health-related features.
    if only_health_related:
        all_names = data['feature_names']
        health_mask = data['health_related']
        unrelated_columns = all_names[~health_mask].tolist()
        drop_features_from_dictionnary(data, unrelated_columns)
        print(len(unrelated_columns), "non health-related features dropped.")

    # Step 4: one-hot encode categorical features and report the width change.
    width_before = data['x_train'].shape[1]
    one_hot_encode(data)
    width_after = data['x_train'].shape[1]
    print(f"One-hot encoding completed. Number of features increased from {width_before} to {width_after}.")

    # Step 5: remove features weakly correlated with the target.
    # The helper returns a pair; only its first element is used here.
    weak_columns, _ignored = identify_low_correlation(
        data["x_train"],
        data["y_train"],
        data["feature_names"],
        threshold=correlation_threshold,
    )
    drop_features_from_dictionnary(data, weak_columns)
    print(len(weak_columns), "features with low correlation dropped.")

    # Step 6: clip outliers.
    clip_outliers(data['x_train'], data['x_test'], n_std=n_std)

    # Step 7: normalize and store the rescaled matrices back into the dictionary.
    rescaled = min_max_normalize(data['x_train'], data['x_test'])
    data['x_train'], data['x_test'] = rescaled


In [5]:
# Preprocess an independent copy so the raw csv_data stays available and this
# cell can be re-run with the same result.
# dict(csv_data) would only copy the dictionary itself: its NumPy arrays would
# still be shared with csv_data, and preprocess_data passes them to helpers
# whose return values it discards (replace_nan, clip_outliers), i.e. helpers
# that presumably modify the arrays in place. Copying each array removes that
# aliasing; non-array values are kept as they are.
data_copy = {
    key: (value.copy() if isinstance(value, np.ndarray) else value)
    for key, value in csv_data.items()
}
preprocess_data(data_copy, nan_drop_threshold=0.9, correlation_threshold=0.01, n_std=3, only_health_related=False)

99 features with too many missing values dropped.
One-hot encoding completed. Number of features increased from 222 to 493.


  c /= stddev[:, None]
  c /= stddev[None, :]


169 features with low correlation dropped.
Clipped 61283 values in x_train (1.89% of all entries)
Clipped 62782 values in x_test (1.94%)


In [6]:
# Show the type and array shape of every entry of data_copy after preprocessing.
print_shapes(data_copy)

x_train: <class 'numpy.ndarray'> with shape (10000, 324)
x_test: <class 'numpy.ndarray'> with shape (10000, 324)
y_train: <class 'numpy.ndarray'> with shape (10000,)
train_ids: <class 'numpy.ndarray'> with shape (10000,)
test_ids: <class 'numpy.ndarray'> with shape (10000,)
feature_names: <class 'numpy.ndarray'> with shape (324,)
useless: <class 'numpy.ndarray'> with shape (324,)
health_related: <class 'numpy.ndarray'> with shape (324,)
better_elsewhere: <class 'numpy.ndarray'> with shape (324,)
bad_format_no_better: <class 'numpy.ndarray'> with shape (324,)
binary: <class 'numpy.ndarray'> with shape (324,)
one_hot: <class 'numpy.ndarray'> with shape (324,)
