In [None]:
import sys
sys.path.append('..')
from scripts.data_analysis import DataAnalysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
original_df = pd.read_csv('../data/data_wave1.csv')

In [None]:
columns_to_extract = [
    'pident', 'aApoB', 'aHDL_C', 'aTotFA', 'aSerum_TG', 'aGp', 'aIle', 'Sexe', 'Age',
    'aauditsc', 'aedu', 'AIPMETO2', 'asmokstat', 'acidep09', 'ahsCRP', 'aIL6',
    'aIRSsum9', 'ams_waist', 'ams_hpt', 'ams_trig2', 'ams_hdl2', 'ams_gluc2',
    'amet_syn2', 'atri_med', 'ahdl_med', 'asbp_med', 'adbp_med', 'agluc_med', 'abaiscal',
    'aids'
]

extracted_df = original_df[columns_to_extract]

extracted_csv_file = '../data/wave1_data_to_discretize.csv'
extracted_df.to_csv(extracted_csv_file, index=False)

In [None]:
def classify_data(df):
    """
    Label every column of a DataFrame as continuous, discrete, binary, or categorical.

    Parameters:
    df (pd.DataFrame): The DataFrame whose columns are classified.

    Returns:
    dict: Maps each column name to one of 'continuous', 'discrete', 'binary',
          or 'categorical'.
    """
    def classify_series(series):
        """
        Label a single column.

        Non-numeric columns are 'categorical'. Numeric columns are labelled by
        their number of distinct non-missing values: exactly 2 -> 'binary',
        fewer than 10 -> 'discrete', otherwise 'continuous'.

        Parameters:
        series (pd.Series): The column to classify.

        Returns:
        str: The label for the column.
        """
        if not pd.api.types.is_numeric_dtype(series):
            return 'categorical'

        # nunique() ignores NaN, so missing values do not count as a level.
        n_distinct = series.nunique()
        if n_distinct == 2:
            return 'binary'
        return 'discrete' if n_distinct < 10 else 'continuous'

    labels_by_column = {}
    for name in df.columns:
        labels_by_column[name] = classify_series(df[name])
    return labels_by_column

In [None]:
data = pd.read_csv('../data/wave1_data_to_discretize.csv')

In [None]:
# Replace the "no data" sentinel codes with NaN so they are treated as missing
# values rather than as real measurements.
columns_with_minus_one = [
    'ahsCRP', 'aIL6', 'ams_waist', 'ams_hpt', 'ams_trig2', 'ams_hdl2', 'ams_gluc2',
    'amet_syn2'
]

for col in columns_with_minus_one:
    data[col] = data[col].replace(-1, np.nan)

# In the sleeping-pattern score (aIRSsum9), -3 and -2 both mean "no data".
# Read from the aIRSsum9 column itself (not the leftover loop variable `col`)
# and replace both codes in a single call so neither replacement is lost.
data['aIRSsum9'] = data['aIRSsum9'].replace([-3, -2], np.nan)

In [None]:
column_types = classify_data(data)
column_types

In [None]:
DataAnalysis.plot_missing_values(data)

In [None]:
from scipy.stats import chi2_contingency

def perform_little_mcar_test(df):
    """
    Run a chi-squared missingness check for every column that contains NaN.

    For each column with missing values, its missing/not-missing indicator is
    cross-tabulated against an indicator of whether any *other* column is
    missing in the same row, and a chi-squared test of independence (without
    continuity correction) is run on that table. Note that this yields one
    statistic per column, not a single global statistic.

    Parameters:
    df (pd.DataFrame): The data to inspect.

    Returns:
    pd.DataFrame: One row per column with missing values, with the fields
                  'Column', 'Chi-Squared', 'df' and 'p-value'.
    """
    def test_column(name):
        # 1 where this column is missing, 0 otherwise.
        is_missing = df[name].isnull().astype(int)
        # 1 where at least one of the remaining columns is missing in the row.
        others_missing = df.drop(columns=[name]).isnull().any(axis=1).astype(int)

        table = pd.crosstab(is_missing, others_missing)
        statistic, p_value, dof, _ = chi2_contingency(table, correction=False)
        return {'Column': name, 'Chi-Squared': statistic, 'df': dof, 'p-value': p_value}

    columns_with_gaps = df.columns[df.isnull().any()]
    return pd.DataFrame([test_column(name) for name in columns_with_gaps])

perform_little_mcar_test(data)

The data are not missing completely at random (MCAR), so we cannot simply discard the rows with missing values; they are imputed instead.

In [None]:
columns_to_process = [col for col in columns_to_extract if col != 'pident']

In [None]:
DataAnalysis.plot_correlation_matrix(data[columns_to_process], columns_to_process, 'Variables to Construct Causal Network')

Next, we determine the imputation strategy for each column with missing data, based on the correlation matrix.

In [None]:
corr_matrix = data[columns_to_process].corr().abs()


In [None]:
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def _round_to_observed_range(values, observed):
    """Round imputed values to integers and clip them to the range of the observed data."""
    rounded = np.round(values)
    observed = observed.dropna()
    if not observed.empty:
        # Regression-based imputers can extrapolate; without clipping, rounding
        # could produce a code (e.g. -1 or 2) that never occurs in the data.
        rounded = np.clip(rounded, observed.min(), observed.max())
    return rounded.astype(int)


def impute_missing_values(df, corr_matrix, column_types):
    """
    Fill the missing values of every column in `df` and return the result as a copy.

    Strategy:
    - 'amet_syn2' (when present) is imputed with IterativeImputer from a fixed
      list of related columns, then rounded to an integer code.
    - Any other column with missing values is imputed from all other columns
      present in `corr_matrix`: with IterativeImputer when its strongest
      correlation exceeds 0.6, otherwise with a 10-neighbour KNNImputer.
    - Columns typed 'binary' in `column_types` are rounded to integer codes.

    Rounded columns are clipped to the range of their observed values so an
    imputed value can never fall outside the codes that exist in the data.

    Parameters:
    df (pd.DataFrame): Data with missing values; it is not modified.
    corr_matrix (pd.DataFrame): Absolute correlation matrix of the columns of `df`.
    column_types (dict): Column name -> type label, as returned by `classify_data`.

    Returns:
    pd.DataFrame: Copy of `df` with the missing values filled.
    """
    imputed_df = df.copy()

    if 'amet_syn2' in df.columns:
        correlated_vars = ['ams_trig2', 'ams_hdl2', 'ams_hpt', 'ams_gluc2', 'atri_med', 'ahdl_med', 'asbp_med', 'adbp_med', 'agluc_med']
        imputation_vars = ['amet_syn2'] + correlated_vars

        iter_imputer = IterativeImputer(max_iter=100, random_state=42)
        imputed_data = iter_imputer.fit_transform(df[imputation_vars])
        # Target column is first in imputation_vars, hence column 0 of the output.
        imputed_df['amet_syn2'] = _round_to_observed_range(imputed_data[:, 0], df['amet_syn2'])

    for col in df.columns[df.isnull().any()]:
        if col == 'amet_syn2':
            continue

        col_corr = corr_matrix[col].drop(col)
        # Target column first so its imputed values are column 0 of the output.
        predictors = [col] + col_corr.index.tolist()

        if col_corr.max() > 0.6:
            # use MICE for highly correlated columns
            imputer = IterativeImputer(random_state=42)
        else:
            # use kNN for everything else
            imputer = KNNImputer(n_neighbors=10)
        imputed_df[col] = imputer.fit_transform(df[predictors])[:, 0]

        if column_types[col] == 'binary':
            imputed_df[col] = _round_to_observed_range(imputed_df[col], df[col])

    return imputed_df

In [None]:
imputed_df = impute_missing_values(data[columns_to_process], corr_matrix, column_types)

In [None]:
DataAnalysis.plot_missing_values(imputed_df)

In [None]:
classify_data(imputed_df)

In [None]:
imputed_df.reset_index(drop=True, inplace=True)
data.reset_index(drop=True, inplace=True)
imputed_df['pident'] = data['pident']
imputed_df.set_index('pident', inplace=True)

In [None]:
imputed_df.to_csv('../data/network/imputed_data_wave1.csv')

## Transform continuous variables to discrete

In [None]:
preprocessed_df = pd.read_csv('../data/network/imputed_data_wave1.csv')

In [None]:
preprocessed_df

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

def apply_kmeans(df, column_name, n_clusters=3, random_state=0):
    """
    Discretize a numeric column with 1-D k-means and plot the cluster sizes.

    The column is overwritten in place with ordinal labels 1..n_clusters, where
    1 is the cluster with the smallest centroid and n_clusters the one with the
    largest (for three clusters: 1 "Low", 2 "Moderate", 3 "High").

    Parameters:
    df (pd.DataFrame): Data containing `column_name`; modified in place.
    column_name (str): Numeric column to discretize.
    n_clusters (int): Number of clusters / ordinal levels.
    random_state (int): Seed passed to KMeans.

    Returns:
    pd.DataFrame: The same `df` object, with `column_name` replaced by labels.
    """
    values = df[column_name].values.reshape(-1, 1)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(values)

    # argsort lists the raw cluster ids from smallest to largest centroid, so
    # the position in that list (starting at 1) is the ordinal label.
    sorted_idx = np.argsort(kmeans.cluster_centers_.ravel())
    ordinal_labels = {old_label: rank for rank, old_label in enumerate(sorted_idx, start=1)}
    df[column_name] = [ordinal_labels[x] for x in kmeans.labels_]

    colors = ['#3B28CC', '#3F8EFC', '#ADD7F6']
    bin_edges = np.arange(1, n_clusters + 2) - 0.5  # one bin centred on each label
    plt.figure(figsize=(8, 6))
    for label in range(1, n_clusters + 1):
        subset = df[df[column_name] == label]
        plt.hist(
            subset[column_name],
            bins=bin_edges,
            # Cycle through the palette so more than three clusters do not
            # raise an IndexError.
            color=colors[(label - 1) % len(colors)],
            label=f'Cluster {label}',
            alpha=0.75,
            edgecolor='black',
        )

    plt.title(f'Clusters of {column_name}')
    plt.xlabel('Label')
    plt.ylabel('Frequency')
    plt.xticks(range(1, n_clusters + 1))
    plt.legend()
    plt.show()

    return df


In [None]:
#aApoB
bins = [-float('inf'), 1.3, float('inf')]
# Low: 1, High: 2
labels = [1, 2]
preprocessed_df['aApoB'] = pd.cut(preprocessed_df['aApoB'], bins=bins, labels=labels, right=False)
preprocessed_df['aApoB'].unique()


In [None]:
#aTotFA
preprocessed_df = apply_kmeans(preprocessed_df, 'aTotFA', n_clusters=3, random_state=0)


In [None]:
preprocessed_df['aHDL_C'] = preprocessed_df.apply(
    lambda row: 1 if (row['Sexe'] == 1 and row['aHDL_C'] > 1.03) or (row['Sexe'] == 2 and row['aHDL_C'] > 1.29) else 2,
    axis=1
)

# Desirable: 1, At risk: 2

In [None]:
bins = [-float('inf'), 1.69, 2.26, 5.65, float('inf')]
# 'Desirable': 1, 'Mild Hypertriglyceridemia': 2, 'High Hypertriglyceridemia': 3, 'Very High Hypertriglyceridemia': 4
labels = [1, 2, 3, 4]
preprocessed_df['aSerum_TG'] = pd.cut(preprocessed_df['aSerum_TG'], bins=bins, labels=labels)
preprocessed_df['aSerum_TG'].unique()


In [None]:
preprocessed_df['aGp'] = preprocessed_df['aGp'].apply(lambda x: 1 if x <= 1.2 else 2)
preprocessed_df['aGp'].unique()
# Normal: 1, At risk: 2

In [None]:
preprocessed_df = apply_kmeans(preprocessed_df, 'aIle', n_clusters=3, random_state=0)



In [None]:
def categorize_met(met):
    """
    Map a weekly MET-minutes value to an activity level.

    Returns 3 (Vigorous) at or above the vigorous threshold, 2 (Moderate) at or
    above the moderate threshold, and 1 (Low) otherwise, including for values
    that cannot be compared such as NaN.
    """
    # Thresholds assume 3 sessions of 1 hour each per week.
    moderate = 4.9 * 60 * 3
    vigorous = 6.8 * 60 * 3

    if met >= vigorous:
        return 3  # Vigorous
    if met >= moderate:
        return 2  # Moderate
    return 1  # Low

preprocessed_df['AIPMETO2'] = preprocessed_df['AIPMETO2'].apply(categorize_met)
print(preprocessed_df['AIPMETO2'].value_counts())

In [None]:
preprocessed_df['aIL6'] = pd.cut(preprocessed_df['aIL6'],
                                bins=[-float('inf'), 7, float('inf')],
                                labels=[1, 2], # Normal: 1, High: 2
                                right=True) 

print(preprocessed_df['aIL6'].value_counts())

In [None]:
preprocessed_df['ahsCRP'] = pd.cut(preprocessed_df['ahsCRP'],
                                  bins=[-float('inf'), 0.3, 1, 10, 50, float('inf')],
                                  labels=[1, 2, 3, 4, 5],
                                  right=False)
print(preprocessed_df['ahsCRP'].value_counts())
# Normal: 1, Minor: 2, Moderate: 3, Marked: 4, Severe: 5

In [None]:
def categorize_atri_med(row):
    """
    Classify a row's 'atri_med' value as 1 (Healthy) or 2 (High).

    The cut-off depends on row['Age']: the adult threshold applies when Age is
    above 19, the child/teen threshold otherwise. Values below the cut-off are
    Healthy; everything else is High.
    """
    adult_threshold = 150 * 0.01129  # mg/dL to mmol/L for adults
    child_threshold = 90 * 0.01129   # mg/dL to mmol/L for children and teens

    threshold = adult_threshold if row['Age'] > 19 else child_threshold
    return 1 if row['atri_med'] < threshold else 2

preprocessed_df['atri_med'] = preprocessed_df.apply(categorize_atri_med, axis=1)
print(preprocessed_df['atri_med'].value_counts())


In [None]:
def categorize_ahdl_med(row):
    """
    Classify a row's 'ahdl_med' value as 1 (Healthy) or 2 (Unhealthy).

    The cut-off depends on row['Sexe']: 1.0 when Sexe == 1 and 1.2 when
    Sexe == 2; values strictly above the cut-off are Healthy. Returns None for
    any other Sexe value.
    """
    sex = row['Sexe']
    if sex == 1:
        cutoff = 1.0
    elif sex == 2:
        cutoff = 1.2
    else:
        return None

    return 1 if row['ahdl_med'] > cutoff else 2

preprocessed_df['ahdl_med'] = preprocessed_df.apply(categorize_ahdl_med, axis=1)
preprocessed_df['ahdl_med'].value_counts()
# Healthy: 1, Unhealthy: 2

In [None]:
def _blood_pressure_category(sbp, dbp):
    """Return the category code for one systolic/diastolic reading pair."""
    # Test the most severe category first: a reading belongs to the highest
    # category that either of its two values reaches.
    if sbp >= 140 or dbp >= 90:
        return 4  # Hypertension II
    if sbp >= 130 or dbp >= 80:
        return 3  # Hypertension I
    if sbp >= 120 and dbp < 80:
        return 2  # Elevated
    if sbp < 120 and dbp < 80:
        return 1  # Normal
    return -1  # Not comparable (e.g. NaN)


def categorize_blood_pressure(df):
    """
    Replace 'asbp_med' and 'adbp_med' in `df` with a shared blood-pressure category.

    Both columns are overwritten in place with the same code per row:
    1 Normal (sbp < 120 and dbp < 80), 2 Elevated (120 <= sbp < 130 and
    dbp < 80), 3 Hypertension I (130 <= sbp < 140 or 80 <= dbp < 90),
    4 Hypertension II (sbp >= 140 or dbp >= 90), -1 when the readings cannot be
    compared (e.g. NaN). The ranges are contiguous, so non-integer readings
    such as 129.5 are categorized rather than falling between two ranges.

    Parameters:
    df (pd.DataFrame): Data with numeric 'asbp_med' and 'adbp_med' columns;
                       modified in place.

    Returns:
    None
    """
    categories = [
        _blood_pressure_category(sbp, dbp)
        for sbp, dbp in zip(df['asbp_med'], df['adbp_med'])
    ]

    # Categories are computed from the raw readings before either column is
    # overwritten; each column keeps its original dtype.
    for column in ('asbp_med', 'adbp_med'):
        df[column] = np.asarray(categories, dtype=df[column].dtype)

categorize_blood_pressure(preprocessed_df)
# Normal: 1, Elevated: 2, Hypertension I: 3, Hypertension II: 4, Uncategorized: -1

In [None]:
def categorize_glucose_level(glucose_mmol):
    """
    Map a glucose value in mmol/L to a category code.

    The value is converted to mg/dL (x18) and classified as:
    1 Hypoglycemia (< 70), 2 Normal (70-100), 3 Prediabetes (> 100 and < 126),
    4 Diabetes (>= 126). Returns -1 when the value cannot be compared
    (e.g. NaN).

    The ranges are contiguous, so values between 125 and 126 mg/dL are
    classified as Prediabetes rather than falling through to -1.
    """
    glucose_mg_dl = glucose_mmol * 18  # convert mmol/L to mg/dL
    if glucose_mg_dl < 70:
        return 1  # Hypoglycemia
    elif glucose_mg_dl <= 100:
        return 2  # Normal
    elif glucose_mg_dl < 126:
        return 3  # Prediabetes
    elif glucose_mg_dl >= 126:
        return 4  # Diabetes
    else:
        return -1  # Only reached when the comparisons fail, e.g. for NaN

preprocessed_df['agluc_med'] = preprocessed_df['agluc_med'].apply(categorize_glucose_level)

preprocessed_df['agluc_med'].value_counts()

In [None]:
preprocessed_df = apply_kmeans(preprocessed_df, 'aauditsc', n_clusters=3, random_state=0)

In [None]:
preprocessed_df = apply_kmeans(preprocessed_df, 'abaiscal', n_clusters=3, random_state=0)

In [None]:
preprocessed_df = apply_kmeans(preprocessed_df, 'aids', n_clusters=3, random_state=0)

In [None]:
def discretize_age(data, column, bins, labels):
    """
    Assign each value of data[column] the label of the bin it falls into.

    Bin positions come from np.digitize against `bins`; positions outside the
    label range are clamped, so values below the first edge get the first label
    and values past the last edge get the last label.

    Returns:
    list: One label per row, in row order.
    """
    last_position = len(labels) - 1
    # digitize is 1-based relative to the first edge, hence the shift by one.
    positions = np.digitize(data[column], bins) - 1
    return [labels[min(max(position, 0), last_position)] for position in positions]

In [None]:
bins = [18, 27, 50, float('inf')]
labels = [1, 2, 3] 
preprocessed_df['Age'] = discretize_age(preprocessed_df, 'Age', bins, labels)
# Young Adult: 1, Adult: 2, Elderly: 3

In [None]:
# save data to build the network
preprocessed_df.to_csv('../data/network/discrete_data_wave1.csv', index=False)

In [None]:
final_data = pd.read_csv('../data/network/discrete_data_wave1.csv')
classify_data(final_data[columns_to_process])