In [1]:
import os
import sys

print(f"Current Working Directory --> {os.getcwd()}")
# Make the project root (one directory above the working directory) importable so
# that top-level packages such as `configs` resolve.
# NOTE(review): assumes the kernel's working directory is the `research` folder.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Get the parent directory
# Guard the append so that re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from configs import cfgs  # Absolute import

Current Working Directory --> C:\Users\maz\dev\Projects\proj_alzheimer_research\research
C:\Users\maz\dev\Projects\proj_alzheimer_research\configs\cfgs.yaml


In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import zscore
import ipywidgets as widgets
from ipywidgets import interact, Dropdown
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from matplotlib.gridspec import GridSpec

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Use seaborn's dark grid theme for every plot in this notebook
sns.set_theme(style="darkgrid")

import sweetviz as sv
# Record the Sweetviz version used to build the EDA report
print(f"SweetViz Version : {sv.__version__}")

SweetViz Version : 2.3.1


In [3]:
import warnings

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

In [4]:
# print(cfgs)

## EXPLORATORY DATA ANALYSIS

In [15]:
# Functions used for plots (histograms, scatter plots and correlation matrices)

def histograms(df, selected_uid, n_features=20):
    """
    Plots histograms for a random sample of columns, using only the rows whose
    index equals `selected_uid`.

    Args:
        df: The pandas DataFrame to plot; rows are selected by index value.
        selected_uid: Index value identifying the rows to plot.
        n_features: Maximum number of columns to sample (default: 20). The sample
            is capped at the number of columns in `df`, so frames with fewer
            columns do not make the sampling call raise.

    Returns:
        None. The figure is shown with `plt.show()`; if there is nothing to sample
        (no columns, or a non-positive `n_features`) a message is printed instead.
    """
    # Sampling without replacement cannot draw more columns than exist.
    sample_size = min(n_features, len(df.columns))
    if sample_size <= 0:
        print("No columns available to plot.")
        return

    # Keep only the rows that belong to the selected uid
    uid_data = df[df.index == selected_uid]

    # Random subset of columns; to plot every feature use list(df.columns) instead
    columns_to_plot = np.random.choice(df.columns, size=sample_size, replace=False)

    num_cols = 5
    num_rows = int(np.ceil(len(columns_to_plot) / num_cols))

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 3 * num_rows))
    axes = axes.flatten()

    # One histogram per sampled column
    for idx, col in enumerate(columns_to_plot):
        sns.histplot(uid_data[col], ax=axes[idx])
        axes[idx].set_title(f"{col}", fontsize=9)
        axes[idx].set_xlabel('')
        axes[idx].set_ylabel('Frequency', fontsize=8)

    # Remove the unused axes left over in the last grid row
    for idx in range(len(columns_to_plot), len(axes)):
        fig.delaxes(axes[idx])

    plt.tight_layout(pad=1.5)
    plt.subplots_adjust(hspace=0.3, wspace=0.3)
    plt.show()

def scatter_plot(df, selected_index, n_features=20):
    """
    Plots scatter plots (row position vs. value) for a random sample of columns,
    using only the rows whose index equals `selected_index`.

    Args:
        df: The pandas DataFrame to plot; rows are selected by index value.
        selected_index: Index value identifying the rows to plot.
        n_features: Maximum number of columns to sample (default: 20). The sample
            is capped at the number of columns in `df`, so frames with fewer
            columns do not make the sampling call raise.

    Returns:
        None. The figure is shown with `plt.show()`; if there is nothing to sample
        (no columns, or a non-positive `n_features`) a message is printed instead.
    """
    # Sampling without replacement cannot draw more columns than exist.
    sample_size = min(n_features, len(df.columns))
    if sample_size <= 0:
        print("No columns available to plot.")
        return

    # Filter data for the selected index
    uid_data = df[df.index == selected_index]

    # Random subset of columns; to plot every feature use list(df.columns) instead
    columns_to_plot = np.random.choice(df.columns, size=sample_size, replace=False)

    num_cols = 5
    num_rows = int(np.ceil(len(columns_to_plot) / num_cols))

    # Create subplots
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 3 * num_rows))
    axes = axes.flatten()

    # One scatter plot per sampled column; x is the row position within the selection
    for idx, col in enumerate(columns_to_plot):
        sns.scatterplot(data=uid_data, x=range(len(uid_data)), y=uid_data[col], ax=axes[idx])
        axes[idx].set_title(f"{col}", fontsize=9)
        axes[idx].set_xlabel('Index')
        axes[idx].set_ylabel('Value', fontsize=8)

    # Remove the unused axes left over in the last grid row
    for idx in range(len(columns_to_plot), len(axes)):
        fig.delaxes(axes[idx])

    plt.tight_layout(pad=1.5)
    plt.subplots_adjust(hspace=0.3, wspace=0.3)
    plt.show()

def correlation_matrix(df, selected_index):
    """
    Plots a heatmap of the pairwise correlations between the numeric columns,
    using only the rows whose index equals `selected_index`.

    Args:
        df: The pandas DataFrame to analyze; rows are selected by index value.
        selected_index: Index value identifying the rows to correlate.

    Returns:
        None. The heatmap is shown with `plt.show()`; if fewer than two rows match,
        or no numeric columns exist, a message is printed instead.
    """
    # Filter the DataFrame by the selected index
    filtered_df = df[df.index == selected_index]

    # A correlation needs at least two observations
    if filtered_df.shape[0] <= 1:
        print(f"Not enough data for index '{selected_index}' to calculate correlations.")
        return

    # Correlate numeric columns only: pandas >= 2.0 no longer drops non-numeric
    # columns silently in `DataFrame.corr` and raises on text columns instead.
    numeric_df = filtered_df.select_dtypes(include="number")
    if numeric_df.shape[1] == 0:
        print(f"No numeric columns for index '{selected_index}' to calculate correlations.")
        return

    # Named `corr` so the local does not shadow this function's own name
    corr = numeric_df.corr()

    plt.figure(figsize=(22, 22))
    sns.heatmap(
        corr, annot=False, cmap='coolwarm', center=0,
        cbar=True, square=True, linewidths=0.5, cbar_kws={"shrink": 0.75}
    )
    plt.title(f"Correlation Matrix for Index '{selected_index}'")
    plt.show()


def find_columns_with_high_missing_percentage(df, threshold=0.5, exclude_columns=None):
    """
    Finds columns in a DataFrame whose fraction of missing values exceeds a threshold,
    optionally excluding specified columns from the check.

    Args:
        df: The pandas DataFrame to analyze. It is not modified.
        threshold: Missing-value fraction a column must strictly exceed to be
            reported (default: 0.5, which is 50%).
        exclude_columns: Column names to exclude from the missing value check.
            Names that are not present in `df` are ignored. Defaults to None
            (no columns excluded).

    Returns:
        A list of column names, in DataFrame column order, whose missing-value
        fraction is greater than `threshold`. Returns an empty list if no columns
        exceed the threshold after exclusion.
    """
    # `drop` returns a new frame and `isna` only reads, so the caller's DataFrame
    # is never mutated and no defensive full copy is needed.
    if exclude_columns:
        df_to_check = df.drop(columns=exclude_columns, errors='ignore')
    else:
        df_to_check = df

    # The mean of the boolean NA mask is the fraction of missing values per column.
    missing_fraction = df_to_check.isna().mean()
    return missing_fraction[missing_fraction > threshold].index.tolist()

### IMPORT AND UNDERSTAND DATA

In this part of the code all the data is loaded:

- _metadata.csv_: contains the demographic information (age and gender), dataset split (train or test), hash and file size.
- _test_features.csv_: Test set.
- _train_features.csv_: Train set
- _train_labels.csv_: To Predict


In [6]:
# Resolve the dataset directory declared in the project configuration
dataset_dir = cfgs["DATASET_DIR"]
dataset_path = Path(dataset_dir)
print(f"Dataset: {dataset_path}", "\n", sep="\n")

# # Find all text files inside a directory
# files = list(dataset_path.glob("*.csv"))

# Location of each CSV file inside the dataset directory
path_metadata = dataset_path.joinpath("metadata.csv")
path_trainfeatures = dataset_path.joinpath("train_features.csv")
path_trainlabels = dataset_path.joinpath("train_labels.csv")
path_testfeatures = dataset_path.joinpath("test_features.csv")
path_submissionformat = dataset_path.joinpath("submission_format.csv")

# Echo the resolved paths; each "\n" item reproduces a blank-line separator
print(
    f"MetaData File Path --> {path_metadata}",
    "\n",
    f"Train Features File Path --> {path_trainfeatures}",
    f"Train Labels File Path --> {path_trainlabels}",
    "\n",
    f"Test Features File Path --> {path_testfeatures}",
    "\n",
    f"Submission File Path --> {path_submissionformat}",
    sep="\n",
)

Dataset: C:\Users\maz\dev\Projects\proj_alzheimer_research\dataset\modified


MetaData File Path --> C:\Users\maz\dev\Projects\proj_alzheimer_research\dataset\modified\metadata.csv


Train Features File Path --> C:\Users\maz\dev\Projects\proj_alzheimer_research\dataset\modified\train_features.csv
Train Labels File Path --> C:\Users\maz\dev\Projects\proj_alzheimer_research\dataset\modified\train_labels.csv


Test Features File Path --> C:\Users\maz\dev\Projects\proj_alzheimer_research\dataset\modified\test_features.csv


Submission File Path --> C:\Users\maz\dev\Projects\proj_alzheimer_research\dataset\modified\submission_format.csv


In [7]:
# Load the CSV files as DataFrames (the metadata file is currently not used)
# metadata       = pd.read_csv(path_metadata)
# metadata.set_index('uid', inplace=True)

train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_path, encoding="utf8")
    for csv_path in (
        path_trainfeatures,
        path_trainlabels,
        path_testfeatures,
        path_submissionformat,
    )
)

In [8]:
# Per-column summary: verbose=True lists every column, show_counts=True adds the non-null counts
train_features.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 184 columns):
 #    Column             Non-Null Count  Dtype  
---   ------             --------------  -----  
 0    uid                3276 non-null   object 
 1    age_03             2240 non-null   object 
 2    urban_03           2242 non-null   object 
 3    married_03         2242 non-null   object 
 4    n_mar_03           2222 non-null   float64
 5    edu_gru_03         2232 non-null   object 
 6    n_living_child_03  2230 non-null   object 
 7    migration_03       2241 non-null   float64
 8    glob_hlth_03       2104 non-null   object 
 9    adl_dress_03       2105 non-null   float64
 10   adl_walk_03        2235 non-null   float64
 11   adl_bath_03        2235 non-null   float64
 12   adl_eat_03         2234 non-null   float64
 13   adl_bed_03         2235 non-null   float64
 14   adl_toilet_03      2235 non-null   float64
 15   n_adl_03           2234 non-null   float64
 16   iadl

In [9]:
# train_features.head()

In [10]:
# Train and test features should expose the same number of columns
print(f"In the training data we have {train_features.shape[1]} columns")
print(f"In the test data we have {test_features.shape[1]} columns")

In the training data we have 184 columns
In the test data we have 184 columns


In [11]:
# Compare unique patients against row counts for the training features and labels
print(
    f"In the training data we have {train_features['uid'].nunique()} unique patients "
    f"and there are {train_features.shape[0]} rows in the training data"
)
print(
    f"In the label data we have {train_labels['uid'].nunique()} unique patients "
    f"and there are {train_labels.shape[0]} rows in the training label data"
)

In the training data we have 3276 unique patients and there are 3276 rows in the training data
In the label data we have 3276 unique patients and there are 4343 rows in the training label data


We see that there are more rows in the label file than rows in the train features. This is because, for some patients, we need to estimate the composite score for both 2016 (4 years in the future) and 2021 (9 years in the future), so those patients appear twice in the label file. The same holds for the test data:

In [12]:
# Compare unique patients against row counts for the test features and the submission template
print(
    f"In the test data we have {test_features['uid'].nunique()} unique patients "
    f"and there are {test_features.shape[0]} rows in the test data"
)
print(
    f"In the label data we have {submission_format['uid'].nunique()} unique patients "
    f"and there are {submission_format.shape[0]} rows in the test label data"
)

In the test data we have 819 unique patients and there are 819 rows in the test data
In the label data we have 819 unique patients and there are 1105 rows in the test label data


In [13]:
# Number of missing values per column in the training features
has_missing = train_features.isna().sum()
has_missing

uid              0
age_03        1036
urban_03      1034
married_03    1034
n_mar_03      1054
              ... 
a21_12        3234
a22_12        3240
a33b_12       3234
a34_12        1164
j11_12          75
Length: 184, dtype: int64

In [36]:
# Columns whose share of missing values is above 40%
columns_with_high_missing = find_columns_with_high_missing_percentage(
    train_features,
    threshold=0.40,
    exclude_columns=None,
)

if not columns_with_high_missing:
    print("No columns have more than 40% missing values.")
else:
    print("Columns with more than 40% missing values:", columns_with_high_missing)
    print("Total Columns with more than 40% missing values:", len(columns_with_high_missing))

Columns with more than 40% missing values: ['bmi_03', 'decis_famil_03', 'sgender_03', 'rjob_hrswk_03', 'rjlocc_m_03', 'rjob_end_03', 'rjobend_reason_03', 'searnings_03', 'sinc_pension_03', 'rjob_hrswk_12', 'rjlocc_m_12', 'rjob_end_12', 'rjobend_reason_12', 'a16a_12', 'a21_12', 'a22_12', 'a33b_12']
Total Columns with more than 40% missing values: 17


In [29]:
# Columns whose share of missing values is above 50%
columns_with_high_missing = find_columns_with_high_missing_percentage(
    train_features,
    threshold=0.50,
    exclude_columns=None,
)

if not columns_with_high_missing:
    print("No columns have more than 50% missing values.")
else:
    print("Columns with more than 50% missing values:", columns_with_high_missing)
    print("Total Columns with more than 50% missing values:", len(columns_with_high_missing))

Columns with more than 50% missing values: ['bmi_03', 'decis_famil_03', 'rjob_hrswk_03', 'rjlocc_m_03', 'rjob_end_03', 'rjobend_reason_03', 'rjob_hrswk_12', 'rjob_end_12', 'rjobend_reason_12', 'a16a_12', 'a21_12', 'a22_12', 'a33b_12']
Total Columns with 50% missing values: 13


In [30]:
# Columns whose share of missing values is above 60%
columns_with_high_missing = find_columns_with_high_missing_percentage(
    train_features,
    threshold=0.60,
    exclude_columns=None,
)

if not columns_with_high_missing:
    print("No columns have more than 60% missing values.")
else:
    print("Columns with more than 60% missing values:", columns_with_high_missing)
    print("Total Columns with more than 60% missing values:", len(columns_with_high_missing))

Columns with more than 60% missing values: ['rjob_hrswk_03', 'rjlocc_m_03', 'rjob_end_03', 'rjobend_reason_03', 'rjob_hrswk_12', 'rjob_end_12', 'rjobend_reason_12', 'a16a_12', 'a21_12', 'a22_12', 'a33b_12']
Total Columns with 60% missing values: 11


In [35]:
# Columns whose share of missing values is above 70%
columns_with_high_missing = find_columns_with_high_missing_percentage(
    train_features,
    threshold=0.70,
    exclude_columns=None,
)

if not columns_with_high_missing:
    print("No columns have more than 70% missing values.")
else:
    print("Columns with more than 70% missing values:", columns_with_high_missing)
    print("Total Columns with more than 70% missing values:", len(columns_with_high_missing))

Columns with more than 70% missing values: ['rjlocc_m_03', 'rjob_end_03', 'rjobend_reason_03', 'rjob_end_12', 'rjobend_reason_12', 'a16a_12', 'a21_12', 'a22_12', 'a33b_12']
Total Columns with 70% missing values: 9


In [14]:
# Count the number of patients with a composite score for both 2016 and 2021 in the training data
train_labels.groupby("uid")["year"].count().eq(2).value_counts()

year
False    2209
True     1067
Name: count, dtype: int64

In [15]:
# Count the number of patients whose composite score must be predicted for both 2016 and 2021 in the test data
submission_format.groupby("uid")["year"].count().eq(2).value_counts()

year
False    533
True     286
Name: count, dtype: int64

In [16]:
# Number of columns with (True) and without (False) any missing value in the training data
train_features.isna().any().value_counts()

True     182
False      2
Name: count, dtype: int64

In [17]:
# Number of columns with (True) and without (False) any missing value in the test data
test_features.isna().any().value_counts()

True     182
False      2
Name: count, dtype: int64

In [18]:
# Overall share of missing cells in the training features, as a percentage
round(train_features.isna().sum().sum() / train_features.size * 100, 2)

22.45

In [19]:
# Report the overall percentage of missing cells in the training features
print(
    f"In total there are {round(train_features.isna().sum().sum() / train_features.size * 100, 2)}"
    " % missing values in the training data"
)

In total there are 22.45 % missing values in the training data


In [20]:
# Report the overall percentage of missing cells in the test features
print(
    f"In total there are {round(test_features.isna().sum().sum() / test_features.size * 100, 2)}"
    " % missing values in the test data"
)

In total there are 21.92 % missing values in the test data


### PreProcessing

In [21]:
# Attach the features to every label row; a patient with targets for both 2016
# and 2021 gets the same feature row repeated once per target year
train_data = pd.merge(train_labels, train_features, on="uid")

In [22]:
# Inspect the merged column set: the label columns come first, followed by the features
train_data.columns

Index(['uid', 'year', 'composite_score', 'age_03', 'urban_03', 'married_03',
       'n_mar_03', 'edu_gru_03', 'n_living_child_03', 'migration_03',
       ...
       'rrelgimp_12', 'rrfcntx_m_12', 'rsocact_m_12', 'rrelgwk_12', 'a16a_12',
       'a21_12', 'a22_12', 'a33b_12', 'a34_12', 'j11_12'],
      dtype='object', length=186)

In [23]:
# Prediction horizon in years, measured from 2012 (2016 -> 4, 2021 -> 9)
train_data["pred_year"] = train_data["year"].sub(2012)

In [24]:
# Confirm that 'pred_year' was appended as the last column
train_data.columns

Index(['uid', 'year', 'composite_score', 'age_03', 'urban_03', 'married_03',
       'n_mar_03', 'edu_gru_03', 'n_living_child_03', 'migration_03',
       ...
       'rrfcntx_m_12', 'rsocact_m_12', 'rrelgwk_12', 'a16a_12', 'a21_12',
       'a22_12', 'a33b_12', 'a34_12', 'j11_12', 'pred_year'],
      dtype='object', length=187)

In [25]:
# Preview the derived prediction horizon
train_data["pred_year"]

0       9
1       9
2       4
3       9
4       9
       ..
4338    9
4339    4
4340    9
4341    9
4342    9
Name: pred_year, Length: 4343, dtype: int64

In [26]:
# Repeat the test features for every (uid, year) row requested in the submission file.
# A left join keeps exactly the submission rows, in submission order, even if a uid
# were missing from test_features (the default inner join would silently drop such
# rows); validate="many_to_one" raises if test_features holds duplicate uids, which
# would otherwise multiply submission rows.
aligned_test_features = submission_format[["uid", "year"]].merge(
    test_features, on="uid", how="left", validate="many_to_one"
)
# Prediction horizon in years, measured from 2012 (2016 -> 4, 2021 -> 9)
aligned_test_features["pred_year"] = aligned_test_features["year"] - 2012

In [27]:
# Split into model inputs and target; the identifier, raw year and label are not features
X = train_data.drop(labels=["uid", "year", "composite_score"], axis="columns")
y = train_data["composite_score"]

In [28]:
# First five feature rows before imputation
X.head(n=5)

Unnamed: 0,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,adl_walk_03,...,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12,pred_year
0,,,,,,,,,,,...,9.Never,9.Never,0.No,,,,,,Concrete 2,9
1,,,,,,,,,,,...,9.Never,1.Almost every day,0.No,,,,,,Concrete 2,9
2,,,,,,,,,,,...,6.2 or 3 times a month,2.4 or more times a week,0.No,,,,,,"Wood, mosaic, or other covering 1",4
3,,,,,,,,,,,...,6.2 or 3 times a month,2.4 or more times a week,0.No,,,,,,"Wood, mosaic, or other covering 1",9
4,1. 50–59,"1. 100,000+",3. Widowed,1.0,3. 7–9 years,1. 1 or 2,0.0,4. Fair,0.0,0.0,...,4.Once a week,9.Never,1.Yes,,,,,No 2,Concrete 2,9


In [29]:
# Imputers for missing values: column mean for numeric features,
# most frequent value for categorical features
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

In [30]:
# Column names by dtype: float64/int64 are treated as numeric, object as categorical
num_cols = X.select_dtypes(include=["float64", "int64"]).columns
cat_cols = X.select_dtypes(include="object").columns

In [31]:
# Impute training features (X is modified in place)
# Numeric columns: missing values are replaced by the column mean. The imputer
# returns floats, so integer columns written back here (e.g. pred_year) become float.
X[num_cols] = num_imputer.fit_transform(X[num_cols])
# Categorical (object) columns: missing values are replaced by the most frequent value.
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
# NOTE(review): the imputers are fitted on the training features only; presumably the
# fitted imputers should later be applied with transform() (not refitted) to
# aligned_test_features — confirm against the rest of the notebook.

In [32]:
# # Encode categorical variables
# label_encoders = {}
# for col in cat_cols:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col])
#     label_encoders[col] = le

In [33]:
# First five feature rows after imputation
X.head(n=5)

Unnamed: 0,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,adl_walk_03,...,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12,pred_year
0,1. 50–59,"1. 100,000+",1. Married or in civil union,1.126879,1. 1–5 years,2. 3 or 4,0.099065,4. Fair,0.041514,0.017708,...,9.Never,9.Never,0.No,1974.911765,7.254545,Agriculture/ Animal breeding 01,Neither 3,No 2,Concrete 2,9.0
1,1. 50–59,"1. 100,000+",1. Married or in civil union,1.126879,1. 1–5 years,2. 3 or 4,0.099065,4. Fair,0.041514,0.017708,...,9.Never,1.Almost every day,0.No,1974.911765,7.254545,Agriculture/ Animal breeding 01,Neither 3,No 2,Concrete 2,9.0
2,1. 50–59,"1. 100,000+",1. Married or in civil union,1.126879,1. 1–5 years,2. 3 or 4,0.099065,4. Fair,0.041514,0.017708,...,6.2 or 3 times a month,2.4 or more times a week,0.No,1974.911765,7.254545,Agriculture/ Animal breeding 01,Neither 3,No 2,"Wood, mosaic, or other covering 1",4.0
3,1. 50–59,"1. 100,000+",1. Married or in civil union,1.126879,1. 1–5 years,2. 3 or 4,0.099065,4. Fair,0.041514,0.017708,...,6.2 or 3 times a month,2.4 or more times a week,0.No,1974.911765,7.254545,Agriculture/ Animal breeding 01,Neither 3,No 2,"Wood, mosaic, or other covering 1",9.0
4,1. 50–59,"1. 100,000+",3. Widowed,1.0,3. 7–9 years,1. 1 or 2,0.0,4. Fair,0.0,0.0,...,4.Once a week,9.Never,1.Yes,1974.911765,7.254545,Agriculture/ Animal breeding 01,Neither 3,No 2,Concrete 2,9.0


In [34]:
# Peek at the first two target values
y.head(2)

0    175
1    206
Name: composite_score, dtype: int64

In [35]:
# Sanity check: show the container type of the target
type(y)

pandas.core.series.Series

In [36]:
# Build the Sweetviz EDA report for the feature matrix
# NOTE(review): X has already been imputed by the time this cell runs in notebook
# order, so the report describes the imputed features rather than the original
# missing-value pattern — confirm this is intended.
report = sv.analyze(X)

                                             |                                       | [  0%]   00:00 -> (? le…

  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)


In [38]:
# Write the Sweetviz report to an HTML file inside the configured visualization directory
VIS_DIR = cfgs["VISUALIZATION_DIR"]
path_vis = Path(VIS_DIR)
# Create the output directory up front: opening the report file for writing does
# not create missing parent directories, so a fresh checkout would fail here.
path_vis.mkdir(parents=True, exist_ok=True)
path_visReport = path_vis / "alzheimer_eda.html"
print(f"EDA Report: {path_visReport}")
print("\n")
# Sweetviz documents `filepath` as a string, so hand it a str rather than a Path object
report.show_html(str(path_visReport))

EDA Report: C:\Users\maz\dev\Projects\proj_alzheimer_research\visualization\alzheimer_eda.html


Report C:\Users\maz\dev\Projects\proj_alzheimer_research\visualization\alzheimer_eda.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
