In [None]:
from helpers import *
from implementations import *
import numpy as np
import matplotlib.pyplot as plt
import os
from missing_codes import DEFAULT_MISSING, EXCEPTIONS
import numpy as np

In [None]:
# Dataset location. It can be overridden through the ML_DATA_DIR environment variable so
# the notebook is not tied to one machine; the original path remains the fallback.
# NOTE(review): the fallback ends with a separator; keep one in any override in case
# load_csv_data concatenates file names onto this string - confirm in helpers.
data_folder = os.environ.get(
    "ML_DATA_DIR",
    'C:/Users/ACER/OneDrive - epfl.ch/Desktop/ML/dataset/',
)

# sub_sample=True is forwarded unchanged to load_csv_data.
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
    train_columns,
    test_columns,
) = load_csv_data(data_folder, sub_sample=True)

In [None]:
# Apply the same missing-code replacement to the training split and then the test split.
x_train_clean, x_test_clean = (
    replace_missing(split, DEFAULT_MISSING, EXCEPTIONS)
    for split in (x_train, x_test)
)

In [None]:
# Both lines count NaNs in the test matrix: first as loaded, then after replace_missing.
# The labels now name the arrays that are actually inspected.
print("Number of NaNs in raw x_test:", np.isnan(x_test).sum())
print("Number of NaNs in cleaned x_test:", np.isnan(x_test_clean).sum())


In [None]:
# Display the shapes of the loaded training features, test features and training labels.
x_train.shape, x_test.shape, y_train.shape


In [None]:
# Drop columns from the cleaned train/test matrices; kept_cols is used below to select the
# matching entries of train_columns.
# NOTE(review): threshold=0.9 is presumably the maximum tolerated fraction of missing
# values per column - confirm against drop_too_many_missing.
x_train_reduced, x_test_reduced, kept_cols = drop_too_many_missing(x_train_clean, x_test_clean, train_columns,threshold=0.9)


In [None]:
# Impute the reduced matrices; the column names are restricted with kept_cols so they line up
# with the reduced feature set.
x_train_imputed, x_test_imputed = mean_imputation(x_train_reduced, x_test_reduced, np.array(train_columns)[kept_cols])


In [None]:
# Normalize the imputed train/test matrices with min_max_normalize.
# NOTE: x_train_norm / x_test_norm are rebound later in this notebook by the `normalize(...)`
# cell, so cells below that point see different data under the same names.
x_train_norm, x_test_norm = min_max_normalize(x_train_imputed, x_test_imputed)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Compute the feature-by-feature correlation matrix (rowvar=False: columns are the variables)
corr = np.corrcoef(x_train_norm, rowvar=False)

# Plot heatmap
plt.figure(figsize=(10, 8))
im = plt.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar(im, fraction=0.046, pad=0.04, label="Correlation")
plt.xticks([])
plt.yticks([])
plt.title("Feature Correlation Matrix")
plt.tight_layout()
plt.show()

# x_train_norm only contains the columns retained by drop_too_many_missing, so the names must
# be restricted with kept_cols too (the same selection passed to mean_imputation). Indexing
# the full train_columns with positions of the reduced matrix would print the wrong names.
reduced_columns = np.array(train_columns)[kept_cols]

# Optionally print highly correlated pairs (upper triangle only, so each pair appears once)
threshold = 0.9
print(f"\nHighly correlated features (|corr| > {threshold}):")
for i in range(corr.shape[0]):
    for j in range(i + 1, corr.shape[1]):
        if abs(corr[i, j]) > threshold:
            print(f"  {reduced_columns[i]} ↔ {reduced_columns[j]} : {corr[i, j]:.2f}")


In [None]:
# x_train_norm / x_test_norm hold only the columns kept by drop_too_many_missing, so the
# feature names are restricted with kept_cols to stay aligned with those matrices.
# The selection returned here gets its own name: rebinding kept_cols would break the
# train_columns lookup above and make this cell give different results when re-run.
# NOTE(review): assumes drop_highly_correlated accepts an array of names - confirm in helpers.
x_train_final, x_test_final, kept_cols_corr, dropped_features = drop_highly_correlated(
    x_train_norm,
    x_test_norm,
    feature_names=np.array(train_columns)[kept_cols],
    threshold=0.9
)


In [None]:
# Display the shape of the training matrix returned by drop_highly_correlated.
x_train_final.shape

In [None]:
import numpy as np

def clip_outliers(x_train, x_test=None, n_std=3):
    """
    Clip each feature to mean ± n_std * std and report how many values were clipped.

    The mean and standard deviation are computed per column on x_train only; the
    same bounds are then applied to x_test when it is given.

    Args:
        x_train (np.array): shape=(N,D) training feature matrix
        x_test (np.array, optional): shape=(M,D) test feature matrix
        n_std (float): number of standard deviations for clipping

    Returns:
        tuple: the length depends on whether x_test is provided.
            Without x_test: (x_train_clipped, n_clipped)
            With x_test:    (x_train_clipped, x_test_clipped, n_clipped, n_clipped_test)
        where n_clipped / n_clipped_test are the numbers of entries outside the
        bounds in the training / test matrix.
    """
    x_train = np.asarray(x_train)

    mean = np.mean(x_train, axis=0)
    std = np.std(x_train, axis=0)

    clip_min = mean - n_std * std
    clip_max = mean + n_std * std

    def _count_and_clip(x):
        # Count out-of-range entries first, then clip with the training bounds.
        n_outside = np.sum((x < clip_min) | (x > clip_max))
        return np.clip(x, clip_min, clip_max), n_outside

    def _percent(count, total):
        # An empty matrix has nothing to clip; avoid dividing by zero.
        return count / total * 100 if total else 0.0

    x_train_clipped, n_clipped = _count_and_clip(x_train)
    print(f"Clipped {n_clipped} values in x_train "
          f"({_percent(n_clipped, x_train.size):.2f}% of all entries)")

    if x_test is not None:
        x_test = np.asarray(x_test)
        x_test_clipped, n_clipped_test = _count_and_clip(x_test)
        print(f"Clipped {n_clipped_test} values in x_test "
              f"({_percent(n_clipped_test, x_test.size):.2f}%)")
        return x_train_clipped, x_test_clipped, n_clipped, n_clipped_test

    return x_train_clipped, n_clipped


In [None]:
def pca_reduce(x_train, x_test=None, variance_threshold=0.95):
    """
    Perform PCA and reduce dimensionality to preserve given variance.

    The principal directions are the eigenvectors of the covariance matrix of
    x_train; the smallest number of leading components whose cumulative
    explained-variance ratio reaches variance_threshold is kept.

    Args:
        x_train (np.array): training data, shape (N, D)
        x_test (np.array, optional): test data, shape (M, D)
        variance_threshold (float): fraction of variance to keep (e.g. 0.95)

    Returns:
        With x_test:    (x_train_pca, x_test_pca, eigvecs, explained_variance)
        Without x_test: (x_train_pca, eigvecs, explained_variance)
        where eigvecs has shape (D, k) (kept principal directions) and
        explained_variance holds the explained-variance ratio of each kept component.
    """
    # Covariance of the features (columns). atleast_2d keeps the single-feature case,
    # where np.cov returns a 0-d array, usable by eigh.
    cov = np.atleast_2d(np.cov(x_train, rowvar=False))

    # Eigen decomposition (eigh: the covariance matrix is symmetric)
    eigvals, eigvecs = np.linalg.eigh(cov)

    # Sort by descending eigenvalue
    idx = np.argsort(eigvals)[::-1]
    eigvals, eigvecs = eigvals[idx], eigvecs[:, idx]

    # Compute explained variance
    explained_variance = eigvals / np.sum(eigvals)
    cumulative_variance = np.cumsum(explained_variance)

    # Determine number of components. searchsorted returns len(cumulative_variance) when the
    # threshold is never reached (e.g. rounding leaves the total just below 1.0), so k is
    # clamped to the number of available components.
    k = min(int(np.searchsorted(cumulative_variance, variance_threshold)) + 1, len(eigvals))
    print(f"Keeping {k} components explaining {cumulative_variance[k-1]*100:.2f}% variance")

    components = eigvecs[:, :k]

    # Project data. The training projection must use x_train (it previously used x_test,
    # which returned the wrong rows and failed when x_test was None).
    # NOTE(review): the data is projected without subtracting the training mean; np.cov
    # centers internally, so the directions are unaffected but the projected coordinates
    # are offset - confirm this is intended.
    x_train_pca = np.dot(x_train, components)

    if x_test is not None:
        x_test_pca = np.dot(x_test, components)
        return x_train_pca, x_test_pca, components, explained_variance[:k]

    return x_train_pca, components, explained_variance[:k]


In [None]:
# Run PCA on the normalized matrices, keeping enough components for 95% of the variance.
# NOTE(review): this uses x_train_norm / x_test_norm, not the x_train_final / x_test_final
# produced by drop_highly_correlated above - confirm which input is intended.
x_train_pca, x_test_pca, eigvecs, var_ratio = pca_reduce(
    x_train_norm,
    x_test_norm,
    variance_threshold=0.95
)


In [None]:
# Explained-variance ratio of each kept principal component; components are numbered from 1.
plt.figure(figsize=(8, 5))
plt.plot(np.arange(1, len(var_ratio) + 1), var_ratio, 'o-', linewidth=2)
plt.title("Explained Variance per Component")
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance Ratio")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Running total of the explained-variance ratios returned by pca_reduce.
cumulative_variance = np.cumsum(var_ratio)

plt.figure(figsize=(8, 5))
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance, 'o-', linewidth=2)
# Horizontal reference line at the 0.95 threshold passed to pca_reduce.
plt.axhline(y=0.95, color='r', linestyle='--', label='95% variance threshold')
plt.title("Cumulative Explained Variance")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Binary targets: 1 where the label equals 1, 0 for every other label value.
y_train_bin = (y_train == 1).astype(int)

In [None]:
# Hyperparameters for the logistic-regression run below.
max_iters = 1000      # number of gradient descent steps
gamma = 0.01          # learning rate
# Start from the zero vector, one weight per PCA component (column of x_train_pca).
initial_w = np.zeros(x_train_pca.shape[1])

In [None]:
# Fit logistic regression on the PCA features; the call is unpacked into weights and a loss value.
w, loss = logistic_regression(y_train_bin, x_train_pca, initial_w, max_iters, gamma)
print(f"Final training loss: {loss:.4f}")

In [None]:
def predict_labels(tx, w, threshold=0.5):
    """
    Convert linear scores into 0/1 labels.

    Args:
        tx: feature matrix, multiplied with w as `tx @ w`
        w: weight vector
        threshold: a sample is labelled 1 when sigmoid(tx @ w) >= threshold

    Returns:
        Integer array of 0/1 predictions.
    """
    probabilities = sigmoid(tx @ w)
    is_positive = probabilities >= threshold
    return is_positive.astype(int)

In [None]:
# Predicted 0/1 labels on the training set, using predict_labels' default threshold.
y_pred_train = predict_labels(x_train_pca, w)

def compute_accuracy(y_true, y_pred):
    """
    Fraction of predictions that equal the true labels.

    Args:
        y_true: array-like of true labels
        y_pred: array-like of predicted labels

    Returns:
        Mean of the element-wise equality between y_true and y_pred.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # A column vector (N, 1) compared with a flat vector (N,) would broadcast to an N x N
    # matrix and return a meaningless score. When both hold the same number of labels
    # but differ in shape, compare them element by element instead.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true, y_pred = y_true.ravel(), y_pred.ravel()
    return np.mean(y_true == y_pred)

# Accuracy on the training data itself (no held-out set is used here).
acc_train = compute_accuracy(y_train_bin, y_pred_train)

print(f"Training accuracy: {acc_train*100:.2f}%")


In [None]:
import pandas as pd

# Read x_train.csv from data_folder into a DataFrame (used further down via df.values)
df = pd.read_csv(os.path.join(data_folder, "x_train.csv"))

## Test of the algorithms on a linear regression between 2 features

In [None]:
# We observe that features 1 and 2 are correlated
# Let's test our algorithms on these two features only

import numpy as np
from matplotlib import pyplot as plt
from implementations import mean_squared_error_gd, mean_squared_error_sgd, least_squares

# Design matrix: a column of ones (intercept) plus column 1 of the raw x_train.
tx = np.ones((x_train.shape[0], 2))
tx[:, 1] = x_train[:, 1]

# Regress column 2 of x_train on column 1 with the three solvers; GD and SGD start from zeros.
w_gd, _ = mean_squared_error_gd(x_train[:,2], tx, np.array([0., 0.]), max_iters=100, gamma=0.1)
w_sgd, _ = mean_squared_error_sgd(x_train[:,2], tx, np.array([0., 0.]), max_iters=10000, gamma=0.1)
w_ls, _ = least_squares(x_train[:,2], tx)

print("Weights from GD: ", w_gd)
print("Weights from SGD: ", w_sgd)
print("Weights from LS: ", w_ls)

# Scatter of the two columns with the three fitted lines drawn on top.
plt.scatter(x_train[:, 1], x_train[:, 2], alpha=0.2)
plt.plot(x_train[:, 1], tx @ w_gd, label='GD', color='orange')
plt.plot(x_train[:, 1], tx @ w_sgd, label='SGD', color='green')
plt.plot(x_train[:, 1], tx @ w_ls, label='LS', color='red', linestyle='dashed')
plt.legend()

# Preprocessing


Dealing with NaN values, correlated columns and categorical variables.
Handling outliers, encoding categorical features and normalizing features.


In [None]:
# True for every column of the raw x_train that contains no NaN at all.
non_nan_mask = ~np.isnan(x_train).any(axis = 0)
non_nan_indices = np.where(non_nan_mask)[0]

print("Indices of features with no NaN values:", non_nan_indices)
# The total is read from the data instead of the hard-coded 321.
print(f"Number of features without NaNs out of {x_train.shape[1]}:", non_nan_mask.sum())

### Mean Imputation

In [None]:
# Mean-impute the raw matrices directly (no missing-code replacement or column dropping first).
x_train_imp, x_test_imp = mean_imputation(x_train, x_test, train_columns)


In [None]:
# Same NaN check as before imputation, now on the imputed training matrix.
non_nan_mask = ~np.isnan(x_train_imp).any(axis = 0)
non_nan_indices = np.where(non_nan_mask)[0]

print("Indices of features with no NaN values:", non_nan_indices)
# The original feature count is read from x_train instead of the hard-coded 321.
print(f"Number of features without NaNs out of {x_train.shape[1]}:", non_nan_mask.sum())
print("Number of dropped features", x_train.shape[1] - non_nan_mask.sum())

In [None]:
# NOTE: this rebinds x_train_norm / x_test_norm, which an earlier cell set from
# min_max_normalize; every cell below works on the `normalize` output of the imputed raw data.
x_train_norm, x_test_norm = normalize(x_train_imp, x_test_imp)

In [None]:
# Make boxplot with colors: one notched, patch-filled box per column of x_train_norm
box = plt.boxplot(x_train_norm, patch_artist=True, notch=True, widths=0.6)



# Labels
plt.title("Boxplot of All Features (Normalized)", fontsize=16)
plt.xlabel("Feature Index")
plt.ylabel("Value")

# Add grid
plt.grid(True, linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()


In [None]:
# Histogram (30 bins) of the first column of the normalized training matrix.
plt.hist(x_train_norm[:, 0], bins=30)
plt.title("Feature 0 distribution after normalization")
plt.show()


In [None]:
# Step 1: Correlation between features (rowvar=False: each column is a variable)
corr_matrix = np.corrcoef(x_train_norm, rowvar=False)  # shape (D, D)

# Step 2: Plot heatmap
plt.figure(figsize=(8, 6))
# Color scale fixed to the full correlation range [-1, 1]
plt.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='Correlation')
plt.title("Feature Correlation Heatmap")
plt.xlabel("Features")
plt.ylabel("Features")



In [None]:
# Collect and print every feature pair whose absolute correlation exceeds the threshold.
threshold = 0.9
high_corr_pairs = []

# Only the upper triangle (j > i) is scanned, so each pair is reported once.
# NOTE(review): names are read from train_columns by position - this assumes x_train_norm
# still has one column per entry of train_columns; confirm mean_imputation/normalize keep all columns.
for i in range (corr_matrix.shape[0]):
    for j in range (i+1, corr_matrix.shape[0]):
        if abs(corr_matrix[i,j]) > threshold:
            high_corr_pairs.append((i,j))
            
            print(f"Features {train_columns[i]} and {train_columns[j]} have correlation {corr_matrix[i,j]:.2f}")
    

In [None]:
# Count the entries whose absolute correlation is 1 (the diagonal is included).
# The absolute value must be taken before the comparison: `np.abs(corr_matrix == 1)` only
# matched +1 and never -1. np.isclose is used because correlations are floating-point
# results that can miss 1.0 by rounding error.
np.sum(np.isclose(np.abs(corr_matrix), 1.0))

In [None]:
# Same scan as for high_corr_pairs, with a fixed 0.99 cut-off to list near-perfectly correlated pairs.
one_corr_pairs = []

# Upper triangle only (j > i): each pair is reported once.
for i in range (corr_matrix.shape[0]):
    for j in range (i+1, corr_matrix.shape[0]):
        if abs(corr_matrix[i,j]) > 0.99:
            one_corr_pairs.append((i,j))
            
            print(f"Features {train_columns[i]} and {train_columns[j]} have correlation {corr_matrix[i,j]:.2f}")
    

## Dictionary

I have reached index 145 (ASTHMAGE), which still needs to be done.

In [None]:
# Look up the name of the feature at index 293.
train_columns[293]

In [None]:
# Default missing codes for all features.
# NOTE: DEFAULT_MISSING and EXCEPTIONS defined here replace the objects of the same name
# imported from missing_codes at the top of the notebook.
DEFAULT_MISSING = [7, 9]

# (feature indices, feature-specific codes), applied in this order.
# When an index appears in more than one group, the LAST group wins
# (e.g. 148 ends up with [88, 98, 99]; 60, 75, 78 and 80 are repeated with the same codes).
_EXCEPTION_GROUPS = [
    (range(0, 24), []),
    ([216, 217, 218, 248, 249, 250, 251, 253, 254, 266, 267, 268, 269, 270, 271,
      276, 277, 285, 286, 291, 292, 295, 296, 299, 300, 301, 302, 303, 304], []),
    ([252], [99999]),  # don't know
    ([262], [900]),  # don't know
    ([264, 287, 288, 293, 294, 297], [99900]),  # don't know
    ([242], [9]),  # don't know
    ([246], [14]),  # don't know
    ([247], [3]),  # don't know
    ([88], [98, 77, 99]),  # 98 other, 77 don't know, 99 refused
    ([60, 78, 80, 98, 119, 122, 168, 224, 225, 239, 240], [77, 99]),  # 77 don't know, 99 refused
    ([27, 28, 29, 79, 112, 114, 206, 207, 208, 209, 210, 211, 212, 213],
     [77, 88, 99]),  # 77 don't know, 88 None, 99 refused
    ([33, 58, 99, 115, 118, 132, 151, 152, 153, 154, 192, 193],
     [7, 8, 9]),  # 8 never and not able to work or not applicable
    ([147, 148], [88, 98]),  # 88 none, 98 don't know
    ([195, 197], [97, 98, 99]),  # 97 don't know, 98 zero, 99 refused
    ([49, 145], [98, 99]),  # 98 don't know, 99 refused
    ([59], [88, 99]),  # 88 None, 99 refused
    ([62, 63], [7777, 9999]),  # 7777 don't know, 9999 refused
    ([75], [8, 77, 99]),  # 8 Never, 77 don't know, 99 refused
    ([77, 94, 110, 150], [777, 888, 999]),  # 777 don't know, 888 no drinks / never, 999 refused
    (range(81, 87), [555, 777, 999]),  # 555 Never, 777 don't know, 999 refused
    ([75, 130], [8, 77, 99]),  # 8 Never / not applicable, 77 don't know, 99 refused
    ([60, 78, 80, 102, 106], [77, 99]),  # 77 don't know, 99 refused
    ([89, 90, 92, 93], [777, 999]),  # 777 don't know, 999 refused
    ([91, 113], [88, 98, 77, 99]),  # 88 No other activity, 98 other / never heard, 77 don't know, 99 refused
    ([101, 105], [777777, 999999]),  # 777777 don't know, 999999 refused
    ([111, 143], [555, 777, 888, 999]),  # 555 No feet, 777 don't know, 888 never, 999 refused
    ([127, 128], [7, 8]),  # 7 don't know, 8 not applicable
    ([129, 137, 138, 139, 140], [5, 7, 9]),  # 5 never, 7 don't know, 9 refused
    ([131], [5, 7, 8, 9]),  # 5 never, 7 don't know, 8 not applicable, 9 refused
    ([148, 149], [88, 98, 99]),  # 88 none, 98 don't know, 99 refused
]

EXCEPTIONS = {}
for _indices, _codes in _EXCEPTION_GROUPS:
    for _feature in _indices:
        # A fresh list per feature, so no two entries share the same list object.
        EXCEPTIONS[_feature] = list(_codes)

In [None]:
def replace_missing_with_exceptions(x, feature_index, EXCEPTIONS):
    """
    Return a float copy of x in which every missing-value code is NaN.

    The codes are the module-level DEFAULT_MISSING list plus, when feature_index
    is a key of EXCEPTIONS, the codes stored under that key.

    Args:
        x: array of raw values for one feature (not modified)
        feature_index: key looked up in EXCEPTIONS
        EXCEPTIONS: mapping feature index -> list of extra missing codes

    Returns:
        Float array shaped like x with the matching entries set to NaN.
    """
    cleaned = x.astype(float)  # astype returns a copy, so the caller's array is untouched

    extra_codes = EXCEPTIONS[feature_index] if feature_index in EXCEPTIONS else []
    missing_codes = [*DEFAULT_MISSING, *extra_codes]

    # One vectorized membership test instead of one pass per code.
    cleaned[np.isin(cleaned, missing_codes)] = np.nan
    return cleaned


In [None]:
def preprocess_dataset(X, EXCEPTIONS):
    """
    Clean a whole feature matrix column by column.

    Each column j of X is passed to replace_missing_with_exceptions together
    with its index and the EXCEPTIONS mapping.

    Args:
        X: 2-D array, one column per feature
        EXCEPTIONS: mapping feature index -> list of extra missing codes

    Returns:
        Float array shaped like X holding the cleaned columns.
    """
    cleaned = np.empty_like(X, dtype=float)

    for col_idx in range(X.shape[1]):
        raw_column = X[:, col_idx]
        cleaned[:, col_idx] = replace_missing_with_exceptions(raw_column, col_idx, EXCEPTIONS)

    return cleaned


## Cleaning Test

IDATE can be discarded (we already have IDAY, IMONTH and IYEAR).
LADULT, NUMADULT, NUMMEN and NUMWOMEN are continuous (9 is a valid value there, but it does not appear often).

In [None]:

# Codes that the helper functions below replace with NaN and ignore when counting valid values.
SPECIAL_CODES = [7, 9, 77, 99, 777, 999]

In [None]:
def detect_feature_type(x, cat_threshold=11):
    """
    Classify a feature column by how many distinct valid values it holds.

    Values listed in SPECIAL_CODES and NaNs are ignored when counting.

    - x: 1D numpy array (feature column)
    - cat_threshold: largest number of distinct valid values still treated as categorical
    Returns: 'categorical' or 'continuous'
    """
    values = x.astype(float)

    # A value is discarded when it is a special code or NaN.
    is_special = np.isin(values, SPECIAL_CODES)
    is_nan = np.isnan(values)
    valid_vals = values[~(is_special | is_nan)]

    n_distinct = np.unique(valid_vals).size
    return "categorical" if n_distinct <= cat_threshold else "continuous"


In [None]:
def replace_special_codes_with_nan(x):
    """
    Return a float copy of x where every value listed in SPECIAL_CODES is np.nan.
    """
    cleaned = x.astype(float)  # float dtype is required to store NaN; astype also copies
    cleaned[np.isin(cleaned, SPECIAL_CODES)] = np.nan
    return cleaned

In [None]:
def build_feature_dictionary(X, cat_threshold=20):
    """
    Describe every column of X and build a cleaned copy of the data.

    Args:
        X: 2-D array, one column per feature
        cat_threshold: forwarded to detect_feature_type

    Returns:
        (feature_info, X_clean) where
        - feature_info maps each column index to
          {"type": 'categorical' or 'continuous', "invalid": special codes found in the raw column}
        - X_clean is a float array shaped like X with special codes replaced by NaN
    """
    X_clean = np.empty_like(X, dtype=float)
    feature_info = {}

    for j in range(X.shape[1]):
        raw_column = X[:, j]

        # Cleaned column goes into the output matrix and drives the type detection.
        cleaned_column = replace_special_codes_with_nan(raw_column)
        X_clean[:, j] = cleaned_column

        feature_info[j] = {
            "type": detect_feature_type(cleaned_column, cat_threshold),
            # Presence is checked on the raw column, before the codes were turned into NaN.
            "invalid": [code for code in SPECIAL_CODES if code in raw_column],
        }

    return feature_info, X_clean


In [None]:

# Build the per-feature description from the values of the DataFrame read from x_train.csv.
feature_info, X_clean = build_feature_dictionary(df.values)

print("Feature types & invalid codes:")
for k, v in feature_info.items():
    print(k, v)

# NOTE: this prints the whole cleaned matrix (numpy abbreviates large arrays).
print("\nCleaned dataset:")
print(X_clean)

7, 77: don't know \
9, 99: refused \
PHYSHLTH, POORHLTH, etc.: 8 / 88 mean "None" (still to be dealt with) \
DIABAGE2: 98 means "none"... \
EMPLOY1: 8 does not mean "don't know" but "unable to work"


In [None]:
# Per-column rules keyed by column index of the raw x_train.
feature_rules = {}

for col in range(x_train.shape[1]):
    # Uses detect_feature_type's default cat_threshold (11), not the 20 used by build_feature_dictionary.
    col_type = detect_feature_type(x_train[:, col])
    
    if col_type == "continuous":
        # Continuous columns get no invalid codes.
        feature_rules[col] = {"type": "continuous", "invalid": []}
    else:
        # Categorical columns all get the same fixed list of invalid codes.
        feature_rules[col] = {"type": "categorical", "invalid": [7, 9, 77, 99]}

In [None]:
# Look up the name of the feature at index 145.
train_columns[145]


### Another attempt


In [None]:

# Default missing codes for all features.
# NOTE: running this cell replaces any DEFAULT_MISSING / EXCEPTIONS defined or imported
# earlier in the notebook.
DEFAULT_MISSING = [7, 9]

# (feature indices, feature-specific codes), applied in this order; when an index is
# repeated (60, 75, 78, 80) the last group wins, here always with the same codes.
_EXCEPTION_GROUPS = [
    (range(0, 24), []),
    ([88], [98, 77, 99]),  # 98 other, 77 don't know, 99 refused
    ([60, 78, 80, 98, 119, 122], [77, 99]),  # 77 don't know, 99 refused
    ([27, 28, 29, 79, 112, 114], [77, 88, 99]),  # 77 don't know, 88 None, 99 refused
    ([33, 58, 99, 115, 118, 132], [7, 8, 9]),  # 8 never and not able to work
    (range(49, 50), [98, 99]),  # 98 don't know, 99 refused
    ([59], [88, 99]),  # 88 None, 99 refused
    ([62, 63], [7777, 9999]),  # 7777 don't know, 9999 refused
    ([75], [8, 77, 99]),  # 8 Never, 77 don't know, 99 refused
    ([77, 94, 110], [777, 888, 999]),  # 777 don't know, 888 no drinks / never, 999 refused
    (range(81, 87), [555, 777, 999]),  # 555 Never, 777 don't know, 999 refused
    ([75, 130], [8, 77, 99]),  # 8 Never / not applicable, 77 don't know, 99 refused
    ([60, 78, 80, 102, 106], [77, 99]),  # 77 don't know, 99 refused
    ([89, 90, 92, 93], [777, 999]),  # 777 don't know, 999 refused
    ([91, 113], [88, 98, 77, 99]),  # 88 No other activity, 98 other / never heard, 77 don't know, 99 refused
    ([101, 105], [777777, 999999]),  # 777777 don't know, 999999 refused
    ([111, 143], [555, 777, 888, 999]),  # 555 No feet, 777 don't know, 888 never, 999 refused
    ([127, 128], [7, 8]),  # 7 don't know, 8 not applicable
    ([129, 137, 138, 139, 140], [5, 7, 9]),  # 5 never, 7 don't know, 9 refused
    ([131], [5, 7, 8, 9]),  # 5 never, 7 don't know, 8 not applicable, 9 refused
]

EXCEPTIONS = {}
for _indices, _codes in _EXCEPTION_GROUPS:
    for _feature in _indices:
        # A fresh list per feature, so no two entries share the same list object.
        EXCEPTIONS[_feature] = list(_codes)


In [None]:
def replace_missing_with_nan(x, feature_index):
    """
    Return a float copy of x in which every missing-value code is NaN.

    The codes are the module-level DEFAULT_MISSING list plus, when feature_index
    is a key of the module-level EXCEPTIONS mapping, the codes stored under it.
    """
    cleaned = x.astype(float)  # astype returns a copy, so the caller's array is untouched

    # Default codes first, then any codes registered for this feature.
    missing_codes = list(DEFAULT_MISSING) + list(EXCEPTIONS.get(feature_index, []))

    cleaned[np.isin(cleaned, missing_codes)] = np.nan
    return cleaned


In [None]:
def preprocess_with_exceptions(X):
    """
    Clean a whole feature matrix column by column.

    Each column j of X is passed to replace_missing_with_nan together with its
    index; the results are gathered in a float array shaped like X.
    """
    cleaned = np.empty_like(X, dtype=float)

    for col_idx in range(X.shape[1]):
        raw_column = X[:, col_idx]
        cleaned[:, col_idx] = replace_missing_with_nan(raw_column, col_idx)

    return cleaned
