In [None]:
from helpers import *
from implementations import *
from preprocessing import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Folder holding the CSV files read by the loading cells. Keep it relative so the
# notebook runs on any machine; edit this single line to point at a local copy.
data_folder = './data/'

# Loading data from CSV files

The file `data/default_values.csv` describes each feature with the following columns:

* **feature**: name of the feature

* **Value for zero**: the code that stands for zero in a numerical feature and must therefore be replaced by 0 (e.g. for `CHILDREN`, 88 means 0 children)

* **Combination of other indicators**: 1 if the feature is just a combination of other features (ex: `_RFHLTH` is 1 if `GENHLTH` = 1, 2 or 3 and 2 if `GENHLTH` = 4 or 5)

* **Health related**: 1 if the feature is health related

* **Bad format, better format elsewhere**: 1 if the feature is in a bad format but parsed in another feature

* **Bad format, no better**: 1 if the feature is in a bad format and not parsed in another feature

* **Values for no response**: values that indicate no response

In [None]:
# Load the arrays and the per-feature metadata in one call.
# max_rows=1000 presumably caps the number of rows read — confirm in the helper.
_data = load_csv_data(data_folder, max_rows=1000)
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
    feature_names,
    zero_values,
    default_values,
    useless,
    health_related,
    better_elsewhere,
    bad_format_no_better,
) = _data

# Quick sanity check of what was loaded.
print(
    "Number of training samples: ", x_train.shape[0],
    "\nNumber of test samples: ", x_test.shape[0],
    "\nNumber of features: ", x_train.shape[1],
)

In [None]:
# Pandas version of the loading step.
# The 'Id' column is dropped from every frame so that only feature / label
# columns remain.
# NOTE(review): load_csv_data is called with max_rows=1000 in the NumPy loading
# cell while these reads load every row — confirm both versions are meant to
# cover the same rows.
df_x_train = pd.read_csv(data_folder + 'x_train.csv').drop(columns=['Id'])
df_y_train = pd.read_csv(data_folder + 'y_train.csv').drop(columns=['Id'])
df_x_test = pd.read_csv(data_folder + 'x_test.csv').drop(columns=['Id'])

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" after each summary.
df_x_train.info()
df_x_test.info()

# Preprocessing

## Replace default values in the dataset with NaN

Custom implementation

In [None]:
# Show a few rows of _PRACE1 before and after the default codes are replaced.
sample_rows = (9, 101, 202)

print("Default values for _PRACE1:", default_values['_PRACE1'])

print()
print("Row index | Value of _PRACE1 before replacing defaults with NaN")
for row in sample_rows:
    # Left-align the row index on 10 characters so it lines up with the header.
    print(f"{row:<10}|", x_train[row, feature_names == '_PRACE1'])

# The return value is not used; the arrays are presumably modified in place
# (the values printed below are read from the same x_train).
replace_default_with_nan(x_train, x_test, feature_names, default_values)

print()
print("Row index | Value of _PRACE1 after replacing defaults with NaN")
for row in sample_rows:
    print(f"{row:<10}|", x_train[row, feature_names == '_PRACE1'])

Pandas implementation

In [None]:
# Pandas counterpart of the default-value replacement, shown on the same rows.
sample_rows = (9, 101, 202)

print()
print("Row index | Value of _PRACE1 before replacing defaults with NaN")
for row in sample_rows:
    # Left-align the row index on 10 characters so it lines up with the header.
    print(f"{row:<10}|", df_x_train.loc[row, '_PRACE1'])

# For each feature, turn every listed default code into NaN in one vectorised
# pass per frame: isin() flags all codes at once and mask() replaces the flagged
# entries with NaN (an empty code list leaves the column unchanged).
for feature in feature_names:
    codes = default_values[feature]
    df_x_train[feature] = df_x_train[feature].mask(df_x_train[feature].isin(codes))
    df_x_test[feature] = df_x_test[feature].mask(df_x_test[feature].isin(codes))

print()
print("Row index | Value of _PRACE1 after replacing defaults with NaN")
for row in sample_rows:
    print(f"{row:<10}|", df_x_train.loc[row, '_PRACE1'])

## Identify features type (binary, categorical, continuous)

In [None]:
feature_types = detect_feature_type(x_train)

# One line per feature: index, name, detected type, then every metadata tag
# that applies to it.
for i, feature in enumerate(feature_names):
    tags = []
    if useless[i]:
        tags.append(" (useless)")
    if health_related[i]:
        tags.append(" (health related)")
    if better_elsewhere[i]:
        tags.append(" (better elsewhere)")
    if bad_format_no_better[i]:
        tags.append(" (bad format, no better)")
    # Identity test: `!= None` would be evaluated element-wise (and be ambiguous
    # inside an `if`) should the stored value ever be an array.
    if zero_values[feature] is not None:
        tags.append(f" (zero value: {zero_values[feature]})")
    if len(default_values[feature]) > 0:
        tags.append(f" (default values: {default_values[feature]})")
    print(f"{i}: {feature} - {feature_types[i]}" + "".join(tags))

## Plot the number of missing values per feature

In [None]:
# Share of NaN entries in each feature (column), in percent.
nan_percentage = np.isnan(x_train).mean(axis=0) * 100
# Complementary cumulative distribution (CCDF): the percentages are sorted in
# increasing order and paired with the counts n, n-1, ..., 1.
sorted_nan_percentage = np.sort(nan_percentage)
remaining_features = np.arange(len(sorted_nan_percentage), 0, -1)

fig, ax = plt.subplots()
ax.plot(sorted_nan_percentage, remaining_features)
ax.set_xlabel('Percentage of NaN values per feature')
ax.set_ylabel('Number of features with more than x% NaN values')
ax.set_title('CCDF of NaN percentages per feature')
ax.grid()
plt.show()

In [None]:
# Pandas version of the CCDF of NaN percentages.
# isna().mean() gives the fraction of missing entries in each column.
nan_percentage = df_x_train.isna().mean() * 100
sorted_nan_percentage = np.sort(nan_percentage)
remaining_features = np.arange(len(sorted_nan_percentage), 0, -1)

fig, ax = plt.subplots()
ax.plot(sorted_nan_percentage, remaining_features)
ax.set_xlabel('Percentage of NaN values per feature')
ax.set_ylabel('Number of features with more than x% NaN values')
ax.set_title('CCDF of NaN percentages per feature')
ax.grid()
plt.show()


## Replace missing values with the mean of the feature

In [None]:
# NumPy arrays: imputation is delegated to the project helper (its return value
# is not used here).
mean_imputation(x_train, x_test)

# Pandas frames: the training frame is filled first, then the test frame, and
# both use the column means of the *training* frame.
for frame in (df_x_train, df_x_test):
    frame.fillna(df_x_train.mean(), inplace=True)

In [None]:
# Same pandas imputation as in the previous cell: fill the training frame, then
# the test frame, each time with the training-frame column means.
for frame in (df_x_train, df_x_test):
    frame.fillna(df_x_train.mean(), inplace=True)

## Correlation between features

In [None]:
# Correlation coefficient between every pair of features; rowvar=False tells
# NumPy that each column is a variable and each row an observation.
corr = np.corrcoef(x_train, rowvar=False)

# Heatmap on a fixed [-1, 1] colour scale, with the ticks hidden.
fig, ax = plt.subplots()
im = ax.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="Correlation")
ax.set_xticks([])
ax.set_yticks([])
ax.set_title("Feature Correlation Matrix")
fig.tight_layout()
plt.show()

In [None]:
# Pandas version: DataFrame.corr() returns the pairwise correlation of the
# columns, i.e. one row and one column per feature.
corr_pd = df_x_train.corr()

# Same colour scale as the NumPy heatmap; tick labels are switched off.
heatmap_ax = sns.heatmap(
    corr_pd,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    xticklabels=False,
    yticklabels=False,
)
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()

In [None]:
# List every feature pair whose absolute correlation exceeds the threshold.
threshold = 0.9
print(f"\nHighly correlated features (|corr| > {threshold}):")

# Indices strictly above the diagonal, in row-major order, so each unordered
# pair (i, j) with i < j is visited exactly once.
upper_rows, upper_cols = np.triu_indices(corr.shape[0], k=1, m=corr.shape[1])
for i, j in zip(upper_rows, upper_cols):
    if abs(corr[i, j]) > threshold:
        print(f"  {feature_names[i]} ↔ {feature_names[j]} : {corr[i, j]:.2f}")

## Correlation between features and target

In [None]:
# Correlation of each feature column with the target; entry [0, 1] of the 2x2
# matrix returned by np.corrcoef is the cross-correlation.
corr_with_target = np.array([
    np.corrcoef(x_train[:, col], y_train)[0, 1]
    for col in range(x_train.shape[1])
])
# Feature indices ordered by decreasing |correlation|.
correlation_ranked = np.argsort(np.abs(corr_with_target))[::-1]

excluded_features = []

for idx in correlation_ranked:
    value = corr_with_target[idx]
    if np.isnan(value):
        # No usable correlation for this feature: list it instead of printing it.
        excluded_features.append(feature_names[idx])
        continue
    print(f"{feature_names[idx]}: {value:.4f}")
print("\nExcluded features due to NaN correlation with target:", excluded_features)

In [None]:
# Pandas version: correlation of every feature column with the target column.
target = df_y_train['_MICHD']
corr_with_target_pd = df_x_train.apply(lambda col: col.corr(target))

# Order the features by decreasing |correlation| but keep the *signed* values,
# so the printed numbers show the direction of the relationship, like the NumPy
# version does. sort_values() places NaN entries last.
ranking = corr_with_target_pd.abs().sort_values(ascending=False).index
correlation_ranked_pd = corr_with_target_pd.reindex(ranking)

excluded_features = []

for feature, corr_value in correlation_ranked_pd.items():
    if np.isnan(corr_value):
        # No usable correlation for this feature: list it instead of printing it.
        excluded_features.append(feature)
    else:
        print(f"{feature}: {corr_value:.4f}")

print("\nExcluded features due to NaN correlation with target:", excluded_features)
    

## Principal Component Analysis (PCA)

In [None]:
# Project helper; its three results are unpacked into the reduced data, the
# eigenvectors and the per-component explained variance (judging by the names).
x_train_pca, eigvecs, explained_variance = pca_reduce(x_train, variance_threshold=1-1e-6)

# One bar per principal component, on a log scale so small ratios stay visible.
component_numbers = np.arange(1, len(explained_variance) + 1)
fig, ax = plt.subplots()
ax.bar(component_numbers, explained_variance)
ax.set_yscale('log')
ax.set_xlabel('Principal Components')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('PCA Explained Variance')
ax.grid()
plt.show()

In [None]:
# Running total of the explained variance over the components.
cumulative_variance = np.cumsum(explained_variance)
component_numbers = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_numbers, cumulative_variance, 'o-', linewidth=2)
# Horizontal reference line at 0.95.
ax.axhline(y=0.95, color='r', linestyle='--', label='95% variance threshold')
ax.set_title("Cumulative Explained Variance")
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# PCA with pandas and scikit-learn, keeping the first 10 components.
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
# The projected data gets its own name so that the `pca_reduce` helper called by
# the custom PCA cell is not overwritten (rebinding it made that cell fail when
# re-run). NaNs are filled with the column means before fitting.
x_train_pca_sklearn = pca.fit_transform(df_x_train.fillna(df_x_train.mean()))

# Explained variance ratio of each component, on a log-scaled y axis.
plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_)
plt.yscale('log')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance Ratio of Principal Components')
plt.show()

# Logistic Regression

Still in progress...

In [None]:
# 0/1 version of the labels: 1 where y_train equals 1, 0 everywhere else.
y_train_bin = (y_train == 1).astype(int)

# Add a constant bias feature (column of ones) to both feature matrices.
# NOTE: x_train / x_test are rebound here, so running this cell again appends
# one more column of ones each time.
const_train = np.ones((x_train.shape[0], 1))
const_test = np.ones((x_test.shape[0], 1))
x_train = np.hstack((x_train, const_train))
x_test = np.hstack((x_test, const_test))

max_iters = 1000      # number of gradient descent steps
gamma = 0.0001       # learning rate

initial_w = np.zeros(x_train.shape[1])
# Train on the same 0/1 labels that predict_labels outputs and that the accuracy
# is measured against. y_train itself presumably uses another encoding (the
# submission cell maps predictions back with 2*p - 1) — confirm.
w, loss = logistic_regression(y_train_bin, x_train, initial_w, max_iters, gamma)
print(f"Final training loss: {loss:.4f}")

In [None]:
def predict_labels(tx, w, threshold=0.5):
    """Return integer 0/1 predictions for the rows of ``tx``.

    Args:
        tx: feature matrix, multiplied by ``w`` with ``@``.
        w: weight vector.
        threshold: cut-off applied to ``sigmoid(tx @ w)``; values greater than
            or equal to it become 1, the others 0.

    Returns:
        The thresholded values cast to ``int``.
    """
    scores = sigmoid(tx @ w)
    is_positive = scores >= threshold
    return is_positive.astype(int)

In [None]:
# 0/1 predictions of the fitted weights `w` on the training features.
y_pred_train = predict_labels(x_train, w)

def compute_accuracy(y_true, y_pred):
    """Return the fraction of positions at which ``y_true`` equals ``y_pred``."""
    matches = y_true == y_pred
    return np.mean(matches)

# Compare the training predictions with the 0/1 labels.
acc_train = compute_accuracy(y_train_bin, y_pred_train)

# Reported as a percentage with two decimals.
print(f"Training accuracy: {acc_train*100:.2f}%")


In [None]:
# Predict on the test set and write the submission file.
# predict_labels returns 0/1; 2*p - 1 maps 0 to -1 and 1 to +1.
test_labels_01 = predict_labels(x_test, w)
y_pred_test = 2 * test_labels_01 - 1
create_csv_submission(test_ids, y_pred_test, 'logistic_regression_submission.csv')

In [None]:
# Logistic regression with scikit-learn.
from sklearn.linear_model import LogisticRegression

# Add the constant column to both frames so train and test keep the same columns.
for frame in (df_x_train, df_x_test):
    frame['_CONST'] = 1

# Flatten the one-column label frame into a 1-D array.
train_labels = df_y_train.values.ravel()

model = LogisticRegression(max_iter=1000)
model.fit(df_x_train, train_labels)

# Share of training rows whose predicted label equals the true label.
y_pred_train = model.predict(df_x_train)
accuracy = (y_pred_train == train_labels).mean()
print(f"Training accuracy: {accuracy*100:.2f}%")

In [None]:
# Neural network (two hidden layers of 50 and 20 units) with scikit-learn.
from sklearn.neural_network import MLPClassifier
# random_state fixes the weight initialisation and the batch shuffling, so a
# Restart & Run All reproduces the same model and the same accuracy.
model = MLPClassifier(hidden_layer_sizes=(50, 20), max_iter=1000, random_state=42)
model.fit(df_x_train, df_y_train.values.ravel())

# Performance on the training set: share of rows predicted correctly.
y_pred_train = model.predict(df_x_train)
accuracy = np.mean(y_pred_train == df_y_train.values.ravel())
print(f"Training accuracy: {accuracy*100:.2f}%")

In [None]:
# Support vector classifier with an RBF kernel.
from sklearn.svm import SVC

# Flatten the one-column label frame into a 1-D array.
train_labels = df_y_train.values.ravel()

model = SVC(kernel='rbf', C=1.0, gamma='scale')
model.fit(df_x_train, train_labels)

# Share of training rows whose predicted label equals the true label.
y_pred_train = model.predict(df_x_train)
accuracy = (y_pred_train == train_labels).mean()
print(f"Training accuracy: {accuracy*100:.2f}%")

In [None]:
# Predict on the test set with the most recently fitted scikit-learn estimator.
# `model` is rebound by each sklearn cell (it is the SVC when the notebook is run
# top to bottom), so the file is named after the estimator's class instead of
# always claiming to hold logistic-regression predictions.
y_pred_test = model.predict(df_x_test)
submission_name = f"{type(model).__name__}_sklearn_submission.csv"
# NOTE(review): test_ids comes from load_csv_data(..., max_rows=1000) while
# df_x_test holds every row of x_test.csv — confirm the two lengths match.
create_csv_submission(test_ids, y_pred_test, submission_name)