In [1]:
import os
import sys

# Project root. Set the ML_PROJECT1_DIR environment variable to point at the
# checkout on another machine; the default is the original author's location.
PROJECT_DIR = os.environ.get(
    'ML_PROJECT1_DIR',
    'c:\\Users\\utilisateur\\Documents\\GitHub\\ML_Project1\\ML_project1',
)

# Only change directory when the configured root exists, so the notebook still
# runs when it is launched directly from the project folder elsewhere.
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)

# Make modules in the working directory importable; skip the append when the
# entry is already present so re-running the cell does not grow sys.path.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

In [2]:
# Enable autoreload
%load_ext autoreload
%autoreload 2

import numpy as np
from helpers import *
from Preprocessing_functions import *

## Load Dataset

In [3]:
# Load the dataset from the `dataset` folder (relative to the working directory).
# os.path.join with an empty last component appends the platform's separator,
# so this yields 'dataset\\' on Windows and 'dataset/' elsewhere.
file_path = os.path.join('dataset', '')
# NOTE(review): the meaning of each returned value is inferred from the names
# it is unpacked into; confirm against load_csv_data.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(file_path)

In [4]:
# Load the column headers of x_train.csv; the path is built with os.path.join
# so it resolves on any operating system.
x_train_headers = extract_headers(os.path.join("dataset", "x_train.csv"))

## Identify Feature Types

In [5]:
# A column with more distinct observed values than this is treated as continuous
categorical_threshold = 10

# Header names collected per feature type
binary_features = []
continuous_features = []
categorical_features = []

# Classify each column of x_train from its distinct non-NaN values
for col_idx in range(x_train.shape[1]):
    observed = x_train[:, col_idx]
    observed = observed[~np.isnan(observed)]  # missing entries are not counted as a value
    distinct = np.unique(observed)

    if len(distinct) == 2 and set(distinct) == {0, 1}:
        # Binary: exactly the two values 0 and 1
        bucket = binary_features
    elif len(distinct) > categorical_threshold:
        # Continuous: more distinct values than the threshold
        bucket = continuous_features
    else:
        # Categorical: everything else (including two-valued columns that are not 0/1)
        bucket = categorical_features

    bucket.append(x_train_headers[col_idx])

# Counts are derived from the lists so they cannot drift apart
binary_count = len(binary_features)
continuous_count = len(continuous_features)
categorical_count = len(categorical_features)

# Report how many features fell into each type
print(f"Binary features: {binary_count}")
print(f"Continuous features: {continuous_count}")
print(f"Categorical features: {categorical_count}")

# Report the feature names per type
print("Binary features:", binary_features)
print("Continuous features:", continuous_features)
print("Categorical features:", categorical_features)

Binary features: 4
Continuous features: 102
Categorical features: 215
Binary features: ['_FRTRESP', '_VEGRESP', '_FRT16', '_VEG23']
Continuous features: ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'SEQNO', '_PSU', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'HHADULT', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'DIABAGE2', 'CHILDREN', 'WEIGHT2', 'HEIGHT3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5', 'MAXDRNKS', 'FRUITJU1', 'FRUIT1', 'FVBEANS', 'FVGREEN', 'FVORANG', 'VEGETAB1', 'EXRACT11', 'EXEROFT1', 'EXERHMM1', 'EXRACT21', 'EXEROFT2', 'EXERHMM2', 'STRENGTH', 'JOINPAIN', 'FLSHTMY2', 'IMFVPLAC', 'HIVTSTD3', 'WHRTST10', 'BLDSUGAR', 'FEETCHK2', 'DOCTDIAB', 'CHKHEMO3', 'FEETCHK', 'CRGVREL1', 'CRGVPRB1', 'LONGWTCH', 'ASTHMAGE', 'ASDRVIST', 'ASRCHKUP', 'ASACTLIM', 'SCNTWRK1', 'SCNTLWK1', 'ADPLEASR', 'ADDOWN', 'ADSLEEP', 'ADENERGY', 'ADEAT1', 'ADFAIL', 'ADTHINK', 'ADMOVE', '_STSTR', '_STRWT', '_RAWRAKE', '_WT2RAKE', '_CLLCPWT', '_DUALCOR', '_LLCPWT', '_AGEG5YR', '_AGE80', 'HTIN4', 'HTM4', 'WTKG3', '_BMI5', 'DROCDY3

In [6]:
# Start with an empty log; intended to record the features removed at each step
removal_log = list()

## A. Remove Features With Too Many Missing Values

In [7]:
# Filter x_train with remove_missing_values; the returned valid_columns are the
# positions of the columns that were kept.
valid_columns, x_train = remove_missing_values(x_train_headers, x_train)

# Keep exactly the same columns in x_test so both matrices stay aligned
x_test = x_test[:, valid_columns]

# Shrink the header list to the surviving columns
x_train_headers = [x_train_headers[col] for col in valid_columns]


Step 1: Remove Missing Values - Reason: Too many missing values:
Removed 178 features: ['CTELENUM', 'PVTRESD1', 'COLGHOUS', 'STATERES', 'CELLFON3', 'LADULT', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'CTELNUM1', 'CELLFON2', 'CADULT', 'PVTRESD2', 'CCLGHOUS', 'CSTATE', 'LANDLINE', 'HHADULT', 'POORHLTH', 'BPMEDS', 'ASTHNOW', 'DIABAGE2', 'NUMHHOL2', 'NUMPHON2', 'CPDEMO1', 'PREGNANT', 'SMOKDAY2', 'STOPSMK2', 'LASTSMK2', 'AVEDRNK2', 'DRNK3GE5', 'MAXDRNKS', 'EXRACT11', 'EXEROFT1', 'EXERHMM1', 'EXRACT21', 'EXEROFT2', 'EXERHMM2', 'LMTJOIN3', 'ARTHDIS2', 'ARTHSOCL', 'JOINPAIN', 'FLSHTMY2', 'IMFVPLAC', 'HIVTSTD3', 'WHRTST10', 'PDIABTST', 'PREDIAB1', 'INSULIN', 'BLDSUGAR', 'FEETCHK2', 'DOCTDIAB', 'CHKHEMO3', 'FEETCHK', 'EYEEXAM', 'DIABEYE', 'DIABEDU', 'CAREGIV1', 'CRGVREL1', 'CRGVLNG1', 'CRGVHRS1', 'CRGVPRB1', 'CRGVPERS', 'CRGVHOUS', 'CRGVMST2', 'CRGVEXPT', 'VIDFCLT2', 'VIREDIF3', 'VIPRFVS2', 'VINOCRE2', 'VIEYEXM2', 'VIINSUR2', 'VICTRCT4', 'VIGLUMA2', 'VIMACDG2', 'CIMEMLOS', 'CDHOUSE', 'CDASSIST', 'CDHEL

## B. Apply One-Hot Encoding To Transform Categorical Features Into Binary Features

In [8]:
# Positions (within the filtered x_train_headers) of the features that were
# classified as categorical; a set makes the name lookup constant-time.
categorical_names = set(categorical_features)
categorical_columns_filtered = [i for i, header in enumerate(x_train_headers) if header in categorical_names]

# One-hot encode x_train; the helper returns the encoded matrix and its headers
x_train_encoded, x_train_encoded_headers = one_hot_encode(x_train, x_train_headers, categorical_columns_filtered)

# Encode x_test with the same headers and categorical column positions
x_test_encoded, x_test_encoded_headers = one_hot_encode(x_test, x_train_headers, categorical_columns_filtered)

# Later steps select x_test columns with indices computed on x_train. That is
# only valid when both encoded matrices have the same columns in the same
# order, so fail loudly here instead of silently misaligning features.
if list(x_test_encoded_headers) != list(x_train_encoded_headers):
    raise ValueError(
        "One-hot encoded x_test columns do not match x_train columns; "
        "column indices computed on x_train cannot be applied to x_test."
    )

# Verify One-Hot Encoding
# Indices of the features that were already binary before encoding
binary_names = set(binary_features)
original_binary_indices = [i for i, header in enumerate(x_train_encoded_headers) if header in binary_names]

# Indices of the columns produced by one-hot encoding (marked by "_encoded")
one_hot_encoded_indices = [i for i, header in enumerate(x_train_encoded_headers) if "_encoded" in header]

# Full list of binary column indices: original binary columns first, then encoded ones
binary_columns_indices = original_binary_indices + one_hot_encoded_indices

# Kept as the last expression so the notebook displays the returned value
verify_one_hot_encoding(x_train_encoded, binary_columns_indices)

All checks passed: The dataset has only binary or continuous features.


True

## C. Apply Mean Imputation

In [9]:
# Binary columns: one-hot encoded columns plus the features that were binary from the start
binary_columns_indices = [i for i, header in enumerate(x_train_encoded_headers) if "_encoded" in header or header in binary_features]

# Every other column is continuous; a set gives constant-time membership tests
# instead of scanning the index list once per column.
binary_index_set = set(binary_columns_indices)
continuous_columns_indices = [i for i in range(x_train_encoded.shape[1]) if i not in binary_index_set]

# Impute on copies so the encoded matrices stay available unchanged.
# NOTE(review): x_test is passed to mean_imputation on its own, so it is
# presumably filled from test-set statistics rather than x_train's - confirm
# this is intended.
x_train_mean_imputed = mean_imputation(x_train_encoded.copy(), binary_columns=binary_columns_indices)
x_test_mean_imputed = mean_imputation(x_test_encoded.copy(), binary_columns=binary_columns_indices)

## D. Apply Variance Threshold

In [10]:
# Variance thresholding on the mean-imputed training data; valid_columns holds
# the positions of the columns that were kept.
valid_columns, x_train_variance_filtered = variance_thresholding(
    x_train_encoded_headers,
    x_train_mean_imputed,
)

# Keep the same columns in x_test
x_test_variance_filtered = x_test_mean_imputed[:, valid_columns]

# Header names of the retained columns
x_train_variance_filtered_headers = [x_train_encoded_headers[col] for col in valid_columns]


Step 2: Variance Thresholding - Reason: Low variance:
Removed 101 features: ['GENHLTH_7.0_encoded', 'GENHLTH_9.0_encoded', 'HLTHPLN1_7.0_encoded', 'HLTHPLN1_9.0_encoded', 'PERSDOC2_7.0_encoded', 'PERSDOC2_9.0_encoded', 'MEDCOST_7.0_encoded', 'MEDCOST_9.0_encoded', 'CHECKUP1_8.0_encoded', 'CHECKUP1_9.0_encoded', 'BPHIGH4_2.0_encoded', 'BPHIGH4_4.0_encoded', 'BPHIGH4_7.0_encoded', 'BPHIGH4_9.0_encoded', 'BLOODCHO_9.0_encoded', 'CHOLCHK_9.0_encoded', 'TOLDHI2_7.0_encoded', 'TOLDHI2_9.0_encoded', 'CVDSTRK3_7.0_encoded', 'CVDSTRK3_9.0_encoded', 'ASTHMA3_7.0_encoded', 'ASTHMA3_9.0_encoded', 'CHCSCNCR_7.0_encoded', 'CHCSCNCR_9.0_encoded', 'CHCOCNCR_7.0_encoded', 'CHCOCNCR_9.0_encoded', 'CHCCOPD1_7.0_encoded', 'CHCCOPD1_9.0_encoded', 'HAVARTH3_7.0_encoded', 'HAVARTH3_9.0_encoded', 'ADDEPEV2_7.0_encoded', 'ADDEPEV2_9.0_encoded', 'CHCKIDNY_7.0_encoded', 'CHCKIDNY_9.0_encoded', 'DIABETE3_2.0_encoded', 'DIABETE3_7.0_encoded', 'DIABETE3_9.0_encoded', 'MARITAL_9.0_encoded', 'EDUCA_1.0_encoded', 'ED

## E. Apply Correlation Analysis on Dataset Obtained After Variance Threshold

In [11]:
# Correlation-based selection on the variance-filtered training data and labels
correlation_valid_columns, x_train_correlation_filtered = correlation_analysis(
    x_train_variance_filtered_headers,
    x_train_variance_filtered,
    y_train,
)

# Keep the same columns in x_test
x_test_correlation_filtered = x_test_variance_filtered[:, correlation_valid_columns]

# Header names of the retained columns
x_train_correlation_filtered_headers = [
    x_train_variance_filtered_headers[col] for col in correlation_valid_columns
]


Step 3: Correlation Analysis - Reason: Low or high correlation:
Removed 224 features: ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR_2015.0_encoded', 'IYEAR_2016.0_encoded', 'DISPCODE_1100.0_encoded', 'DISPCODE_1200.0_encoded', 'GENHLTH_3.0_encoded', 'MENTHLTH', 'HLTHPLN1_1.0_encoded', 'HLTHPLN1_2.0_encoded', 'PERSDOC2_1.0_encoded', 'MEDCOST_1.0_encoded', 'MEDCOST_2.0_encoded', 'CHECKUP1_2.0_encoded', 'CHECKUP1_3.0_encoded', 'CHECKUP1_4.0_encoded', 'CHECKUP1_7.0_encoded', 'BLOODCHO_7.0_encoded', 'CHOLCHK_4.0_encoded', 'CHOLCHK_7.0_encoded', 'ASTHMA3_1.0_encoded', 'DIABETE3_4.0_encoded', 'MARITAL_1.0_encoded', 'MARITAL_2.0_encoded', 'MARITAL_4.0_encoded', 'MARITAL_6.0_encoded', 'EDUCA_2.0_encoded', 'EDUCA_3.0_encoded', 'EDUCA_4.0_encoded', 'EDUCA_5.0_encoded', 'RENTHOM1_1.0_encoded', 'RENTHOM1_2.0_encoded', 'RENTHOM1_3.0_encoded', 'EMPLOY1_2.0_encoded', 'EMPLOY1_3.0_encoded', 'EMPLOY1_4.0_encoded', 'EMPLOY1_5.0_encoded', 'EMPLOY1_6.0_encoded', 'INCOME2_1.0_encoded', 'INCOME2_3.

## F. Apply Statistical Test on Dataset Obtained After Variance Threshold

In [12]:
# Statistical-test selection on the variance-filtered training data and labels.
# Results use stage-specific names, matching the correlation step; the `*_final`
# names are assigned only by the step that combines both selections, so they
# never temporarily hold the statistical-only subset.
statistical_valid_columns, x_train_statistical_filtered = statistical_test_analysis(
    x_train_variance_filtered_headers,
    x_train_variance_filtered,
    y_train,
)
x_train_statistical_filtered_headers = [
    x_train_variance_filtered_headers[i] for i in statistical_valid_columns
]

# Keep the same columns in x_test
x_test_statistical_filtered = x_test_variance_filtered[:, statistical_valid_columns]


Step 4: Statistical Test Analysis - Reason: Not statistically significant:
Removed 320 features: ['IYEAR_2015.0_encoded', 'IYEAR_2016.0_encoded', 'DISPCODE_1100.0_encoded', 'DISPCODE_1200.0_encoded', 'GENHLTH_1.0_encoded', 'GENHLTH_2.0_encoded', 'GENHLTH_3.0_encoded', 'GENHLTH_4.0_encoded', 'GENHLTH_5.0_encoded', 'HLTHPLN1_1.0_encoded', 'HLTHPLN1_2.0_encoded', 'PERSDOC2_1.0_encoded', 'PERSDOC2_2.0_encoded', 'PERSDOC2_3.0_encoded', 'MEDCOST_1.0_encoded', 'MEDCOST_2.0_encoded', 'CHECKUP1_1.0_encoded', 'CHECKUP1_2.0_encoded', 'CHECKUP1_3.0_encoded', 'CHECKUP1_4.0_encoded', 'CHECKUP1_7.0_encoded', 'BPHIGH4_1.0_encoded', 'BPHIGH4_3.0_encoded', 'BLOODCHO_1.0_encoded', 'BLOODCHO_2.0_encoded', 'BLOODCHO_7.0_encoded', 'CHOLCHK_1.0_encoded', 'CHOLCHK_2.0_encoded', 'CHOLCHK_3.0_encoded', 'CHOLCHK_4.0_encoded', 'CHOLCHK_7.0_encoded', 'TOLDHI2_1.0_encoded', 'TOLDHI2_2.0_encoded', 'CVDSTRK3_1.0_encoded', 'CVDSTRK3_2.0_encoded', 'ASTHMA3_1.0_encoded', 'ASTHMA3_2.0_encoded', 'CHCSCNCR_1.0_encoded', '

## Take Features Selected Either By Correlation Analysis Or Statistical Methods

In [13]:
# A column is kept when either the correlation step or the statistical step selected it
selected_by_either = set(correlation_valid_columns).union(statistical_valid_columns)
combined_valid_columns = sorted(selected_by_either)

# Apply the combined selection to the variance-filtered matrices
x_train_final = x_train_variance_filtered[:, combined_valid_columns]
x_test_final = x_test_variance_filtered[:, combined_valid_columns]

# Header names of the final columns
x_train_final_headers = [
    x_train_variance_filtered_headers[col] for col in combined_valid_columns
]

## Summary

In [14]:
# Report the final matrix shapes and the names of the selected features
final_train_shape = x_train_final.shape
final_test_shape = x_test_final.shape

print("Final x_train shape:", final_train_shape)
print("Final x_test shape:", final_test_shape)
print("Final selected features:", x_train_final_headers)

Final x_train shape: (328135, 172)
Final x_test shape: (109379, 172)
Final selected features: ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'SEQNO', '_PSU', 'GENHLTH_1.0_encoded', 'GENHLTH_2.0_encoded', 'GENHLTH_4.0_encoded', 'GENHLTH_5.0_encoded', 'PHYSHLTH', 'MENTHLTH', 'PERSDOC2_2.0_encoded', 'PERSDOC2_3.0_encoded', 'CHECKUP1_1.0_encoded', 'BPHIGH4_1.0_encoded', 'BPHIGH4_3.0_encoded', 'BLOODCHO_1.0_encoded', 'BLOODCHO_2.0_encoded', 'CHOLCHK_1.0_encoded', 'CHOLCHK_2.0_encoded', 'CHOLCHK_3.0_encoded', 'TOLDHI2_1.0_encoded', 'TOLDHI2_2.0_encoded', 'CVDSTRK3_1.0_encoded', 'CVDSTRK3_2.0_encoded', 'ASTHMA3_2.0_encoded', 'CHCSCNCR_1.0_encoded', 'CHCSCNCR_2.0_encoded', 'CHCOCNCR_1.0_encoded', 'CHCOCNCR_2.0_encoded', 'CHCCOPD1_1.0_encoded', 'CHCCOPD1_2.0_encoded', 'HAVARTH3_1.0_encoded', 'HAVARTH3_2.0_encoded', 'ADDEPEV2_1.0_encoded', 'ADDEPEV2_2.0_encoded', 'CHCKIDNY_1.0_encoded', 'CHCKIDNY_2.0_encoded', 'DIABETE3_1.0_encoded', 'DIABETE3_3.0_encoded', 'SEX_1.0_encoded', 'SEX_2.0_encoded',