In [None]:
import pandas as pd
import numpy as np
import matplotlib
# matplotlib.use('Agg')
%matplotlib inline
matplotlib.use('module://ipykernel.pylab.backend_inline')
import matplotlib.pyplot as plt
import math

from sklearn.linear_model import LassoCV
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import statsmodels.api as sm
import copy
import os
import shutil

%matplotlib inline
matplotlib.use('module://ipykernel.pylab.backend_inline')
import matplotlib.pyplot as plt


In [None]:
def add_time_difference(df_source, i):
    """Return ``df_source`` plus the elapsed time between visit 0 and visit ``i``.

    The new column is named ``time_difference.0.<i>`` and holds
    ``date_visit.<i> - date_visit.0`` expressed in (fractional) days.
    The input frame itself is not modified.
    """
    seconds_per_day = 24 * 3600
    followup_date = pd.to_datetime(df_source[f"date_visit.{i}"])
    baseline_date = pd.to_datetime(df_source["date_visit.0"])
    elapsed_days = (followup_date - baseline_date).dt.total_seconds() / seconds_per_day
    return df_source.assign(**{f"time_difference.0.{i}": elapsed_days})

In [None]:
# function to add log features
def add_logs(df_, log_features, outcome_components, descr):
    """Add ``log1p``-transformed copies of selected columns to ``df_``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Source frame; it is not modified.
    log_features : iterable of str
        Columns to transform. Names that are not columns of ``df_`` are skipped.
    outcome_components : container of str
        For visits 0-3, when ``pairs_matching_sum_incorrect.<i>`` or
        ``pairs_matching_incorrect_in_round_1.<i>`` is listed here, a
        ``log_``-prefixed copy of that column is added as well.
    descr : str
        Label used in the progress message.

    Returns
    -------
    tuple
        ``(new_frame, log_cols)`` where ``log_cols`` lists the columns created
        from ``log_features`` only (the outcome log columns are not included).
    """
    print(f"adding logarithmic features to {descr}")
    assignment = {}
    log_cols = []

    for log_feature in log_features:
        if log_feature in df_.columns:
            new_fld = f"log_{log_feature}"
            assignment[new_fld] = np.log1p(df_[log_feature])
            log_cols.append(new_fld)
            print(new_fld)

    outcome_stems = ("pairs_matching_sum_incorrect", "pairs_matching_incorrect_in_round_1")
    for i in range(4):
        for stem in outcome_stems:
            source = f"{stem}.{i}"
            target = f"log_{source}"
            # Skip columns already produced by the log_features pass above.
            if source in outcome_components and target not in assignment:
                assignment[target] = np.log1p(df_[source])
                # Report the column that was actually created (the original
                # printed a stale/unbound name for the round-1 columns).
                print(target)

    df_ = df_.assign(**assignment)
    return df_, log_cols

In [None]:
# function to add quadratic (squared) features
def add_squares(df_, q_features, descr, skewed_only_on_log, skewed_features):
    """Append ``quadr_<column>`` columns holding the square of selected columns.

    A column of ``df_`` is selected when any entry of ``q_features`` occurs in
    its name as a substring. When ``skewed_only_on_log`` is true, a column
    listed in ``skewed_features`` is squared only if its name contains
    ``"log_"``.

    Returns ``(new_frame, quadr_cols)``; ``quadr_cols`` lists the created
    column names in creation order.
    """
    print(f"adding quadratic features to {descr}")

    def _eligible(column):
        # Mirrors: not skewed_only_on_log or "log_" in column or column not skewed.
        if not skewed_only_on_log:
            return True
        return ("log_" in column) or (column not in skewed_features)

    squared = {}
    quadr_cols = []
    for column in df_.columns:
        for base_name in q_features:
            if base_name not in column or not _eligible(column):
                continue
            target = f"quadr_{column}"
            squared[target] = np.square(df_[column])
            quadr_cols.append(target)
            print(target)

    return df_.assign(**squared), quadr_cols

In [None]:
# function for removing outliers via iqr approach
def remove_outliers_iqr(df_, iqr_cf, df_name, outliers, log_feat):
    """Drop rows lying outside the IQR fences of the given columns.

    For every column in ``outliers`` the fences are
    ``Q1 - iqr_cf * IQR`` and ``Q3 + iqr_cf * IQR``. When the column is also
    listed in ``log_feat``, the same fences are computed for its
    ``log_<column>`` counterpart. All fences are computed on the unfiltered
    frame first and then applied one after another.

    ``iqr_cf`` of ``None`` disables the filter and returns ``df_`` as is.
    """
    if iqr_cf is None:
        return df_

    def _fence(column):
        q1 = df_[column].quantile(0.25)
        q3 = df_[column].quantile(0.75)
        spread = q3 - q1
        return q1 - iqr_cf * spread, q3 + iqr_cf * spread

    fences = {}
    for name in outliers:
        print(f"{name}")
        fences[name] = _fence(name)

        if name in log_feat:
            log_name = f"log_{name}"
            print(f"logarithimic outlier {log_name}")
            fences[log_name] = _fence(log_name)

    kept = df_
    for column, (low, high) in fences.items():
        kept = kept[(kept[column] >= low) & (kept[column] <= high)]

    print(f"data set {df_name} after removing outliers: {len(kept)}")
    return kept

In [None]:
# function which adds the column with the RV scores to the data set
def adding_rv_scores(df_source, mem_experiment_full, repeat):
    """Add z-scores, a composite "global" score and z-score changes.

    Every z-score (baseline visit 0 and follow-up visit ``repeat``) is
    standardised with the mean and standard deviation of the *baseline*
    column. Columns added, in this order:

    * ``z_<mem>.0``, ``z_<mem>.<repeat>``
    * ``z_snap_game_true_pos_rt_avrg.0``, ``z_snap_game_true_pos_rt_avrg.<repeat>``
    * ``global.0``, ``z_global.0``, ``global.<repeat>``, ``z_global.<repeat>``
    * ``z_change_<mem>.0.<repeat>``, ``z_change_snap_game_true_pos_rt_avrg.0.<repeat>``

    ``global`` = (-z_reaction_time + sign * z_memory) / 2, where ``sign`` is
    -1 when ``mem_experiment_full`` contains ``"pairs_matching"`` (presumably
    because that score counts errors -- confirm) and +1 otherwise.
    """
    reaction_time = "snap_game_true_pos_rt_avrg"
    memory_sign = -1 if "pairs_matching" in mem_experiment_full else 1

    def _standardise(values, reference):
        # z-score of `values` relative to the distribution of `reference`.
        return (values - reference.mean()) / reference.std()

    new_columns = {}
    for task in (mem_experiment_full, reaction_time):
        baseline = df_source[f"{task}.0"]
        new_columns[f"z_{task}.0"] = _standardise(baseline, baseline)
        new_columns[f"z_{task}.{repeat}"] = _standardise(df_source[f"{task}.{repeat}"], baseline)

    # Composite score at baseline, then standardised against itself.
    global_baseline = (
        - new_columns[f"z_{reaction_time}.0"]
        + memory_sign * new_columns[f"z_{mem_experiment_full}.0"]
    ) / 2.0
    new_columns["global.0"] = global_baseline
    new_columns["z_global.0"] = _standardise(global_baseline, global_baseline)

    # Composite score at follow-up, standardised against the baseline composite.
    global_followup = (
        - new_columns[f"z_{reaction_time}.{repeat}"]
        + memory_sign * new_columns[f"z_{mem_experiment_full}.{repeat}"]
    ) / 2.0
    new_columns[f"global.{repeat}"] = global_followup
    new_columns[f"z_global.{repeat}"] = _standardise(global_followup, global_baseline)

    for task in (mem_experiment_full, reaction_time):
        new_columns[f"z_change_{task}.0.{repeat}"] = (
            new_columns[f"z_{task}.{repeat}"] - new_columns[f"z_{task}.0"]
        )

    return df_source.assign(**new_columns)

In [None]:
def add_median_score(df_source, predictors, score_clmn, n_steps):
    """Add a neighbourhood median of ``score_clmn`` for every row.

    For each row, the neighbourhood is the set of rows whose value in *every*
    predictor lies within ``+/- step`` of that row's own value, where
    ``step = (max - min) / n_steps`` of the predictor. The median of
    ``score_clmn`` over that neighbourhood is stored in
    ``median_<score_clmn>``.

    Rows are addressed by position, so the frame may carry any index (the
    original implementation required a 0..n-1 index). The result column is
    numeric (float) instead of an object column seeded with ``None``.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Input frame; it is not modified.
    predictors : iterable of str
        Numeric columns that define the neighbourhood.
    score_clmn : str
        Column whose median is computed.
    n_steps : int or float
        Number of steps each predictor's range is divided into.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_source`` with the ``median_<score_clmn>`` column added.
    """
    # Half-width of the neighbourhood window for each predictor.
    step = {
        clmn: (df_source[clmn].max() - df_source[clmn].min()) / n_steps
        for clmn in predictors
    }
    predictor_values = {clmn: df_source[clmn] for clmn in step}
    scores = df_source[score_clmn]

    medians = []
    for i in range(len(df_source)):
        print(f"participant {i}")

        # Combine the per-predictor windows into one boolean mask instead of
        # deep-copying and re-filtering the whole frame for every row.
        in_window = None
        for clmn, values in predictor_values.items():
            centre = values.iloc[i]
            inside = values.ge(centre - step[clmn]) & values.le(centre + step[clmn])
            in_window = inside if in_window is None else (in_window & inside)

        neighbours = scores if in_window is None else scores[in_window]
        medians.append(neighbours.median())

    return df_source.assign(**{f"median_{score_clmn}": medians})
         

In [None]:
# --- Run configuration ---------------------------------------------------
# Row cap passed as `nrows` when the input CSVs are read; None reads all rows.
n_samples =None
# Number of steps each predictor's range is split into when the neighbourhood
# medians are computed (add_median_score); None skips that stage.
n_steps_for_median = 10

# Memory task code: "nm" selects the numeric-memory score, any other value the
# pairs-matching error count.
memory_experiment = "pm"

# If True, a 0/1 column binary_smoking_score.0 (1 where smoking_score.0 == 2)
# is derived and replaces smoking_score.0 in the predictor list.
binary_smoke = False

# Follow-up visit index compared against baseline visit 0.
second_visit = 2
# Rows with age_years.0 below this value are dropped (0 keeps everyone).
age_limit = 0
# None keeps all rows; otherwise only rows whose gender.0 equals this value.
gender = None
iqr_coefficient = None # if None then no standard removal of outliers

keep_ldl_hdl = True # if lasso selects only one of them keep the second anyway; they are correlated but have different influence
# Passed to add_squares: when True, features listed in remove_original_features
# are squared only in their log_ form.
quadr_on_log_if_skewed = True

# --- Input locations -----------------------------------------------------
input_dir = "/projects/prime/ukbb/preprocessed_data_2024"
imputed_dir = f"/projects/prime/ukbb/preprocessed_data_2024/imputations/ukbb_imputed_{memory_experiment}_sg_0_{second_visit}_rf/ukbb_imputed_{memory_experiment}_sg_0_{second_visit}_rf_instance_1"
path_imputed =  f"{imputed_dir}/ukbb_imputed_{memory_experiment}_sg_0_{second_visit}_rf_instance_1.csv"
# label -> CSV file name inside input_dir; the frames are merged in this order.
input_filenames = {
    "diet": "diet_2024_bradbury_march.csv",
    "alcohol": "alcohol_2024_maart.csv",
    "diagnoses": "diagnoses_2024.csv",
    "lifestyle": "participants_age_date_smoke_MET_2024_jan.csv",
    "medications": "participants_medications_2024_jan.csv",
    "risk_factors": "blood_count_biochemistry_pressure_maart_2024.csv",
    "sociodemo": "edu_job_marriage_2024.csv",
    "cognition": "cognitive_2024_maart.csv",
    "metrics": "body_size_measures_gender_dates_2024_jan.csv"
  }

# --- Output location -----------------------------------------------------
output_dir =f"/projects/prime/ukbb/results_2024/{memory_experiment}_sg_0_{second_visit}_median_05_06_all"

# A gender-restricted run writes to its own directory; note that output_dir
# ends with "/" after this step.
if gender is None:
  output_dir = f"{output_dir}/"
else:
  output_dir = f"{output_dir}_gender_{gender}/"
# Side effect: the output directory is created on disk if it is missing.
if not os.path.exists(output_dir):
  os.makedirs(output_dir)

In [None]:

# Cache files for the two processed data sets (complete-case and imputed).
# The load cells skip their pipeline when the corresponding file already exists.
# NOTE: output_dir already ends with "/", so these paths contain "//".
out_data_file = f"{output_dir}/data_non_na.csv"
out_imputed_data_file = f"{output_dir}/data_ukbb_imputed_{memory_experiment}_sg_0_{second_visit}_rf_instance_1.csv"

In [None]:
# Translate the short experiment code into the column stem of the memory
# outcome: "nm" -> numeric memory, anything else -> pairs matching.
memory_experiment_full_name = (
    "num_memory_max_digits_remembered_correctly"
    if memory_experiment == "nm"
    else "pairs_matching_sum_incorrect"
)

In [None]:
# Disabled alternative: the full predictor list, kept as a bare string literal
# so it is never assigned. The active `predictors` list is defined separately.
'''
predictors = ["gender.0", f"time_difference.0.{second_visit}", "Diabetes_2.0",  "Dyslipidemia.0", "Depression.0", "Hypertension.0", 
  "age_years.0", "education_level.0", "marital_status.0", "pp_smoke_catgeory.0", "MET_score.0", "waist_cm.0",  "bmi_kg_m2.0", 
  "Syst_bp.0", "Diast_bp.0", "ldl_conv.0", "hdl_conv.0", "Triglycerides_conv.0", "HbA1c_conv.0",  "Pl_glucose_conv.0",   "Albumin.0", 
  "Leukocyte_count.0", "C_reactive_protein.0",
   "oily_fish_gpday_bradbury.0", "white_fish_gpday_bradbury.0", 
   "red_meat_bradbury_gpd.0", "poultry_gpday_bradbury.0",
   "processed_meat_gpday_bradbury.0", "veg_gpday_bradbury.0",
   "fruit_gpday_bradbury.0",
    "cereals_gpday_bradbury.0", "bread_gpday_bradbury.0", "cheese_gpday_bradbury.0",
    "milk_gpday_bradbury.0", "tea_gpday_bradbury.0",
    "red_wine_gpd.0", "white_wine_gpd.0", "fortified_gpd.0", "beer_cider_gpd.0", "spirits_gpd.0",
   "aspirin.0",  "anxiety_tr.0", "pain_tr.0", "TAZD_Thiazide.0",  
   "loop_diuretics.0",  "potassium_diuretics.0", "beta_blockers.0", "calcium_antagonists.0", 
   "ARA_II_Antagonists_of_angiotensin_II_receptors.0", "IECA_Angiotensin_converting_enzyme_inhibitors.0",
   "Other_Hypotensive.0","hypochol_statins.0", "hypochol_others.0", "insulin.0", "sulfonylurea.0", "thiazolidinediones.0", "non_sulfonylurea_insulin_secretagogues.0", "metformin_category.0", "vitamins_minerals.0"]
   '''

In [None]:
# Active predictor set: demographics, time between visits, diet and alcohol.
# These columns are loaded from the input files and define the neighbourhoods
# used by add_median_score. time_difference.0.<visit> is not read from disk;
# it is derived from the date_visit columns.
predictors = ["gender.0", "age_years.0", "education_level.0", 
   f"time_difference.0.{second_visit}",
   "oily_fish_gpday_bradbury.0", "white_fish_gpday_bradbury.0", 
   "red_meat_bradbury_gpd.0", "poultry_gpday_bradbury.0",
   "processed_meat_gpday_bradbury.0", "veg_gpday_bradbury.0",
   "fruit_gpday_bradbury.0",
    "cereals_gpday_bradbury.0", "bread_gpday_bradbury.0", "cheese_gpday_bradbury.0",
    "milk_gpday_bradbury.0", "tea_gpday_bradbury.0",
    "red_wine_gpd.0", "white_wine_gpd.0", "fortified_gpd.0", "beer_cider_gpd.0", "spirits_gpd.0"]


In [None]:
# Columns intended for plotting. NOTE(review): this list includes columns
# (e.g. MET_score.0, Syst_bp.0) that are not in the active `predictors` list
# and so are not loaded by the cells shown here -- confirm before plotting.
predictors_to_plot = [f"time_difference.0.{second_visit}", 
  "age_years.0", "MET_score.0", "waist_cm.0",  "bmi_kg_m2.0", 
  "Syst_bp.0", "Diast_bp.0", "ldl_conv.0", "hdl_conv.0", "Triglycerides_conv.0", "HbA1c_conv.0",  "Pl_glucose_conv.0",   "Albumin.0", 
  "Leukocyte_count.0", "C_reactive_protein.0",
   "oily_fish_gpday_bradbury.0", "white_fish_gpday_bradbury.0", 
   "red_meat_bradbury_gpd.0", "poultry_gpday_bradbury.0",
   "processed_meat_gpday_bradbury.0", "veg_gpday_bradbury.0",
   "fruit_gpday_bradbury.0",
    "cereals_gpday_bradbury.0", "bread_gpday_bradbury.0", "cheese_gpday_bradbury.0",
    "milk_gpday_bradbury.0", "tea_gpday_bradbury.0",
    "red_wine_gpd.0", "white_wine_gpd.0", "fortified_gpd.0", "beer_cider_gpd.0", "spirits_gpd.0"
    ]

In [None]:
# Columns subject to the standard (IQR-based) outlier removal; presumably the
# `outliers` argument of remove_outliers_iqr -- confirm.
outliers_standart = ["Syst_bp.0", "Diast_bp.0"]

In [None]:
# Minimal predictor set. gender.0 is left out when the run is restricted to a
# single gender.
simple_predictors = [
    name
    for name in (
        "age_years.0",
        "gender.0",
        "education_level.0",
        f"log_time_difference.0.{second_visit}",
    )
    if gender is None or name != "gender.0"
]

In [None]:
# Raw outcome columns: the memory score and the snap-game reaction time, each
# at baseline (visit 0) and at the follow-up visit. adding_rv_scores builds
# the z-scores and the composite score from these.
outcome_components = [f"{memory_experiment_full_name}.0", f"{memory_experiment_full_name}.{second_visit}", 
            "snap_game_true_pos_rt_avrg.0", f"snap_game_true_pos_rt_avrg.{second_visit}"]

In [None]:
# Columns that receive a log1p-transformed copy (log_<name>) in add_logs;
# entries that are not present in the data frame are skipped there.
log_features = [f"time_difference.0.{second_visit}", "age_years.0", "MET_score.0", 
  "Triglycerides_conv.0", "HbA1c_conv.0",  "Pl_glucose_conv.0",   
  "Leukocyte_count.0", "C_reactive_protein.0",
  "red_wine_gpd.0", "white_wine_gpd.0", "fortified_gpd.0", "beer_cider_gpd.0", "spirits_gpd.0"
    ]

In [None]:
# Column stems that receive a squared copy (quadr_<name>) in add_squares,
# which matches them as substrings of the data-frame column names.
# The comma after "tea_gpday_bradbury.0" is required: without it Python joins
# the two adjacent string literals into one bogus entry and neither tea nor
# red wine is squared.
quadr_features = ["MET_score.0", "waist_cm.0",  "bmi_kg_m2.0", 
  "Syst_bp.0", "Diast_bp.0", "ldl_conv.0", "hdl_conv.0", "Triglycerides_conv.0", "HbA1c_conv.0",  "Pl_glucose_conv.0",   "Albumin.0", 
  "Leukocyte_count.0", "C_reactive_protein.0",
   "oily_fish_gpday_bradbury.0", "white_fish_gpday_bradbury.0", 
   "red_meat_bradbury_gpd.0", "poultry_gpday_bradbury.0",
   "processed_meat_gpday_bradbury.0", "veg_gpday_bradbury.0",
   "fruit_gpday_bradbury.0",
    "cereals_gpday_bradbury.0", "bread_gpday_bradbury.0", "cheese_gpday_bradbury.0",
    "milk_gpday_bradbury.0", "tea_gpday_bradbury.0",
    "red_wine_gpd.0", "white_wine_gpd.0", "fortified_gpd.0", "beer_cider_gpd.0", "spirits_gpd.0"
    ]

In [None]:
# Customised outlier limits: column -> [lower, upper]; rows outside the limits
# are removed by the load cells. The example limits below are disabled (kept
# as a bare string literal); the active dict is empty, so nothing is removed.
'''outliers = {
    "ldl_conv.0": [0, 290],
    "Syst_bp.0": [0, 220],
}'''
outliers = {}

In [None]:

# very skewed:
# Passed to add_squares as `skewed_features`: with quadr_on_log_if_skewed set,
# these columns are squared only in their log_ form.
remove_original_features = ["MET_score.0", "Triglycerides_conv.0", "Leukocyte_count.0", "C_reactive_protein.0"
                            ]

In [None]:
# Columns to read from disk: participant id, predictors and raw outcomes.
all_needed = ["f.eid", *predictors, *outcome_components]
for i in range(1, 4):
    if f"time_difference.0.{i}" not in predictors:
        continue
    # The time difference is not stored on disk -- it will be calculated from
    # the visit dates, so request those instead.
    all_needed.remove(f"time_difference.0.{i}")
    all_needed.append(f"date_visit.{i}")
    if "date_visit.0" not in all_needed:
        all_needed.append("date_visit.0")

In [None]:
# load raw data set and select all not NA
# The processed complete-case frame is cached in out_data_file; the pipeline
# below only runs when that file does not exist yet.
if not os.path.exists(out_data_file):
    df_index = {}
    first = True
    for label, filename in input_filenames.items():
        print(label)
        full_path = f"{input_dir}/{filename}"

        # Read the header only, to learn which needed columns this file holds.
        data_columns = pd.read_table(full_path, nrows=1, sep=',').columns

        needed_fields = data_columns[data_columns.isin(all_needed)]

        if label == "medications":
            # Extra column used only for the exclusion filter further down.
            needed_fields = needed_fields.tolist() + ["alpha_glucosidase_inhibitors.0"]

        df_index[label] = pd.read_table(full_path, sep=',', usecols=needed_fields, dtype=str, nrows=n_samples)

        print(len(df_index[label]))

        # Everything except the participant id and the visit dates is numeric.
        for clmn in needed_fields:
            if clmn == "f.eid" or clmn.startswith("date_visit"):
                continue
            df_index[label][clmn] = df_index[label][clmn].astype(float)

        if first:
            df = df_index[label]
            first = False
        else:
            # No `on=` given: pandas joins on the columns both frames share.
            df = pd.merge(df,df_index[label])

        print(len(df))

    # remove NA from the check up data set
    df = df.dropna(axis="rows")
    print(f"cleaned check up db has {len(df)} rows")

    df= df.loc[df["alpha_glucosidase_inhibitors.0"] != 1]
    print("removed all participants with alpha_glucosidase_inhibitors.0 == 1:")
    df = df.drop("alpha_glucosidase_inhibitors.0", axis = 1)
    print(len(df))

    # filter the data set by the age limit (which can be zero)
    if "age_years.0" in df.columns:
        df= df[df["age_years.0"].ge(age_limit)]
        print("removed all participants younger age_limits no NA:")
        print(len(df))

    # filter the data set by gender if required
    if gender is not None and "gender.0" in df.columns:
        df= df.loc[df["gender.0"].eq(gender)]
        # The gender.0 column is kept on purpose: `predictors` still lists it
        # and add_median_score reads every predictor column. (The original
        # called df.drop(...) here and discarded the result, i.e. a no-op.)
        print(f"only participants with gender {gender} are left in the not na data set: ")
        print(len(df))

    # add the time difference column(s)
    for i in range(1,4):
        if f"time_difference.0.{i}" in predictors:
            df = add_time_difference(df, i)

    if binary_smoke:

        # Guarded like the imputed-data cell: `predictors` is mutated here, so
        # the name may already have been replaced on a previous run.
        if "smoking_score.0" in predictors:
            ind_replace = predictors.index("smoking_score.0")
            predictors[ind_replace] = "binary_smoking_score.0"

        assignment = {}
        assignment["binary_smoking_score.0"] =np.where(df["smoking_score.0"] == 2,1, 0)
        df = df.assign(**assignment)

    # remove customized outliers
    for feat, out_ in outliers.items():
        # Parentheses are required: `&` binds tighter than `>=` / `<=`.
        df= df[(df[feat] >= out_[0]) & (df[feat] <= out_[1])]
        print(f"data set no na after removing customized outliers: {len(df)}")

    df, log_cols =  add_logs(df, log_features, outcome_components, "check up db with no NA")
    df, quad_cols =  add_squares(df, quadr_features, "check up db with no NA", quadr_on_log_if_skewed, remove_original_features)

    if iqr_coefficient is not None:
        # remove_outliers_iqr needs the columns to check and the list of
        # log-transformed features; the original call omitted both and would
        # raise TypeError. NOTE(review): outliers_standart / log_features are
        # the matching module-level lists -- confirm this is the intended pair.
        df = remove_outliers_iqr(df, iqr_coefficient, "no na", outliers_standart, log_features)

    # Pairs-matching outcomes are analysed on the log scale. This rebinds the
    # module-level names, so later cells see the log_ prefixed versions.
    if memory_experiment == "pm":
        memory_experiment = f"log_{memory_experiment}"
        memory_experiment_full_name = f"log_{memory_experiment_full_name}"

    df = adding_rv_scores(df, memory_experiment_full_name, second_visit)
    ## ADD MEDIANS
    outcomes = [f"z_global.{second_visit}", f"z_change_{memory_experiment_full_name}.0.{second_visit}",  f"z_change_snap_game_true_pos_rt_avrg.0.{second_visit}"]
    df.reset_index(drop=True, inplace=True)

    if n_steps_for_median is not None:
        for outcome in outcomes:
            df = add_median_score(df, predictors, outcome, n_steps_for_median)
    df.to_csv(out_data_file, sep=',')
else:
    df= pd.read_table(out_data_file, sep=',')

In [None]:
# load imputed data set
# The processed imputed frame is cached in out_imputed_data_file; the pipeline
# below only runs when that file does not exist yet.
if not os.path.exists(out_imputed_data_file):
    df_imputed = pd.read_table(path_imputed, sep=',', usecols=all_needed, dtype=str, nrows=n_samples)
    # make all columns of the imputed set float, except the id and the dates
    for clmn in df_imputed.columns:
        if clmn == "f.eid" or clmn.startswith("date_visit"):
            continue
        df_imputed[clmn] = df_imputed[clmn].astype(float)

    if "age_years.0" in df_imputed.columns:
        df_imputed= df_imputed[df_imputed["age_years.0"].ge(age_limit)]
        print("removed all participants younger age_limits imputed:")
        print(len(df_imputed))

    # filter the data set by gender if required
    if gender is not None and "gender.0" in df_imputed.columns:
        df_imputed= df_imputed.loc[df_imputed["gender.0"].eq(gender)]
        # The gender.0 column is kept on purpose: `predictors` still lists it
        # and add_median_score reads every predictor column. (The original
        # called drop(...) here and discarded the result, i.e. a no-op.)
        print(f"only participants with gender {gender} are left in the omputed data set: ")
        print(len(df_imputed))

    # add the time difference column(s)
    for i in range(1,4):
        if f"time_difference.0.{i}" in predictors:
            df_imputed = add_time_difference(df_imputed, i)

    if binary_smoke:

        # `predictors` may already carry the binary name if the complete-case
        # cell ran first, hence the membership check.
        if "smoking_score.0" in predictors:
            ind_replace = predictors.index("smoking_score.0")
            predictors[ind_replace] = "binary_smoking_score.0"

        assignment = {}
        assignment["binary_smoking_score.0"] =np.where(df_imputed["smoking_score.0"] == 2,1, 0)
        df_imputed = df_imputed.assign(**assignment)

    # remove customized outliers
    for feat, out_ in outliers.items():
        df_imputed= df_imputed[(df_imputed[feat] >= out_[0]) & (df_imputed[feat] <= out_[1])]
        print(f"data set imputed after removing customized outliers: {len(df_imputed)}")

    df_imputed, _  =  add_logs(df_imputed, log_features, outcome_components,"df with imputed")
    df_imputed, _  =  add_squares(df_imputed, quadr_features, "df with imputed", quadr_on_log_if_skewed, remove_original_features)

    # removing outliers via iqr approach
    if iqr_coefficient is not None:
        # remove_outliers_iqr needs the columns to check and the list of
        # log-transformed features; the original call omitted both and would
        # raise TypeError. NOTE(review): outliers_standart / log_features are
        # the matching module-level lists -- confirm this is the intended pair.
        df_imputed = remove_outliers_iqr(df_imputed, iqr_coefficient, "imputed", outliers_standart, log_features)

    # Pairs-matching outcomes are analysed on the log scale. If the
    # complete-case cell already rebound these names, the test is False and
    # the log_ prefixed names are reused as they are.
    if memory_experiment == "pm":
        memory_experiment = f"log_{memory_experiment}"
        memory_experiment_full_name = f"log_{memory_experiment_full_name}"

    df_imputed = adding_rv_scores(df_imputed, memory_experiment_full_name, second_visit)
    ## ADD MEDIANS
    outcomes = [f"z_global.{second_visit}", f"z_change_{memory_experiment_full_name}.0.{second_visit}",  f"z_change_snap_game_true_pos_rt_avrg.0.{second_visit}"]
    df_imputed.reset_index(drop=True, inplace=True)
    if n_steps_for_median is not None:
        for outcome in outcomes:
            df_imputed = add_median_score(df_imputed, predictors, outcome, n_steps_for_median)
    df_imputed.to_csv(out_imputed_data_file, sep=',')
else:
    df_imputed = pd.read_table(out_imputed_data_file, sep=',')

In [None]:
# describe population
def describe_population(df_, descr): 
    print(descr)
    comorbidity = ["Diabetes_2.0",  "Dyslipidemia.0", "Depression.0", "Hypertension.0"]
    n_comor = {}
    percentage_comor = {}
    n_comor_female = {}
    percentage_comor_female = {}
    for comor in comorbidity:
        if comor in df_.columns: 
            n_comor[comor] = df_[comor].sum()
            percentage_comor[comor] = (float(n_comor[comor])*100)/float(len(df_))
            print(f"The % of participants with {comor} is {percentage_comor[comor]}")
            if "gender.0" in df_.columns:
                n_comor_female[comor] = (df_[comor] + df_["gender.0"]).eq(2.0).sum()
                percentage_comor_female[comor] = (float(n_comor_female[comor])*100)/float(len(df_))
                print(f"The % of female participants with {comor} is {percentage_comor_female[comor]}")