In [14]:
import pandas as pd
import numpy as np
import matplotlib
# matplotlib.use('Agg')
%matplotlib inline
matplotlib.use('module://ipykernel.pylab.backend_inline')
import matplotlib.pyplot as plt
import math

from sklearn.linear_model import LassoCV
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import statsmodels.api as sm
import copy
import os
import shutil

%matplotlib inline
matplotlib.use('module://ipykernel.pylab.backend_inline')
import matplotlib.pyplot as plt


In [15]:
def add_time_difference(df_source, i):
    """Return ``df_source`` plus a column with the days elapsed between visit 0 and visit ``i``.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Must contain ``date_visit.0`` and ``date_visit.{i}``; both are parsed
        with ``pd.to_datetime``.
    i : int or str
        Index of the later visit.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the extra column
        ``time_difference.0.{i}`` holding fractional days.
    """
    seconds_per_day = 24 * 3600
    followup_date = pd.to_datetime(df_source[f"date_visit.{i}"])
    baseline_date = pd.to_datetime(df_source["date_visit.0"])
    elapsed_days = (followup_date - baseline_date).dt.total_seconds() / seconds_per_day
    return df_source.assign(**{f"time_difference.0.{i}": elapsed_days})

In [16]:
def add_logs(df_, log_features):
    """Add a ``log_<name>`` column (``np.log1p``) for every requested feature.

    ``f.eid`` and any name containing ``date_visit`` are skipped, as are names
    that are not columns of ``df_``. Each processed feature and its new column
    name are printed.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Source frame; it is not modified.
    log_features : iterable of str
        Candidate column names.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The extended frame and the names of the columns that were added, in
        processing order.
    """
    log_cols = []
    assignment = {}

    for log_feature in log_features:
        is_excluded = log_feature == "f.eid" or "date_visit" in log_feature
        if is_excluded or log_feature not in df_.columns:
            continue
        print(log_feature)
        new_fld = f"log_{log_feature}"
        assignment[new_fld] = np.log1p(df_[log_feature])
        log_cols.append(new_fld)
        print(new_fld)

    return df_.assign(**assignment), log_cols

In [17]:
def add_squares(df_, q_features):
    """Add a ``quadr_<name>`` column (element-wise square) for every requested feature.

    ``f.eid`` and any name containing ``date_visit`` are skipped, as are names
    that are not columns of ``df_``. The name of each new column is printed.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Source frame; it is not modified.
    q_features : iterable of str
        Candidate column names.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The extended frame and the names of the columns that were added, in
        processing order.
    """
    quadr_cols = []
    assignment = {}

    for q_feat in q_features:
        is_excluded = q_feat == "f.eid" or "date_visit" in q_feat
        if is_excluded or q_feat not in df_.columns:
            continue
        new_fld = f"quadr_{q_feat}"
        assignment[new_fld] = np.square(df_[q_feat])
        quadr_cols.append(new_fld)
        print(new_fld)

    return df_.assign(**assignment), quadr_cols

In [18]:
# function which adds the column with the RV scores to the data set
def adding_rv_scores(df_source, mem_experiment_full, repeat):
    assignment = {}
    weight_mem = 1
    if "pairs_matching" in mem_experiment_full:
        weight_mem = -1
        
    sum_mem_0 = df_source[f"{mem_experiment_full}.0"].sum()
    mem_mean_0 = df_source[f"{mem_experiment_full}.0"].mean()
    mem_std_0 = df_source[f"{mem_experiment_full}.0"].std()
    assignment[f'z_{mem_experiment_full}.0'] = (df_source[f"{mem_experiment_full}.0"] - mem_mean_0)/mem_std_0
    
    sum_mem_repeat = df_source[f"{mem_experiment_full}.{repeat}"].sum()
    mem_mean_repeat = df_source[f"{mem_experiment_full}.{repeat}"].mean()
    mem_std_repeat = df_source[f"{mem_experiment_full}.{repeat}"].std()
    assignment[f'z_{mem_experiment_full}.{repeat}'] = (df_source[f"{mem_experiment_full}.{repeat}"] - mem_mean_0)/mem_std_0 
    
    sum_sg_rt_mean_0 = df_source["snap_game_true_pos_rt_avrg.0"].sum()
    sg_rt_mean_0 = df_source["snap_game_true_pos_rt_avrg.0"].mean()
    sg_rt_std_0 = df_source["snap_game_true_pos_rt_avrg.0"].std()
    assignment['z_snap_game_true_pos_rt_avrg.0'] = (df_source["snap_game_true_pos_rt_avrg.0"] - sg_rt_mean_0)/sg_rt_std_0

    sum_sg_rt_mean_repeat = df_source[f"snap_game_true_pos_rt_avrg.{repeat}"].sum()
    sg_rt_mean_repeat = df_source[f"snap_game_true_pos_rt_avrg.{repeat}"].mean() 
    sg_rt_std_repeat = df_source[f"snap_game_true_pos_rt_avrg.{repeat}"].std()
    assignment[f'z_snap_game_true_pos_rt_avrg.{repeat}'] = (df_source[f"snap_game_true_pos_rt_avrg.{repeat}"] - sg_rt_mean_0)/sg_rt_std_0 
    
     # generate Global_v00 = (zDSTf_v00 + zDSTb_v00 - zTMTa_v00 - zTMTb_v00) / 4
    assignment["global.0"] = (- assignment['z_snap_game_true_pos_rt_avrg.0'] + weight_mem * assignment[f'z_{mem_experiment_full}.0'])/2.0
    sum_global_0 = assignment["global.0"].sum()
    mean_global_0 = assignment["global.0"].mean()
    std_global_0 = assignment["global.0"].std()
    assignment['z_global.0'] = (assignment["global.0"] - mean_global_0)/std_global_0

    assignment[f"global.{repeat}"] = (- assignment[f'z_snap_game_true_pos_rt_avrg.{repeat}'] + weight_mem * assignment[f'z_{mem_experiment_full}.{repeat}'])/2.0 
    assignment[f"z_global.{repeat}"] = (assignment[f"global.{repeat}"] - mean_global_0)/std_global_0
    
    assignment[f"z_global_change.0.{repeat}"] = assignment[f"z_global.{repeat}"] - assignment[f"z_global.{0}"]
    assignment[f"z_change_{mem_experiment_full}.0.{repeat}"] = assignment[f'z_{mem_experiment_full}.{repeat}'] - assignment[f'z_{mem_experiment_full}.0']
    assignment[f"z_change_snap_game_true_pos_rt_avrg.0.{repeat}"] = assignment[f'z_snap_game_true_pos_rt_avrg.{repeat}'] - assignment['z_snap_game_true_pos_rt_avrg.0']
    
    df_source = df_source.assign(**assignment) 
    return df_source  

In [19]:
def add_median_score(df_source, predictors, score_clmn, n_steps):
    """Add a neighbourhood ("moving") median of ``score_clmn`` for every row.

    For each row, the neighbourhood is the set of rows whose value lies within
    ``+/- (max - min) / n_steps`` of that row's value on *every* predictor
    (bounds inclusive). The median of ``score_clmn`` over the neighbourhood is
    stored in ``median_{score_clmn}``; it is NaN when the neighbourhood is
    empty (for example when the row itself has a missing predictor).

    Rows are addressed by position, so the frame may carry any index
    (including one with gaps left by ``dropna``).

    Parameters
    ----------
    df_source : pandas.DataFrame
        Source frame; it is not modified.
    predictors : iterable of str
        Numeric columns that define the neighbourhood.
    score_clmn : str
        Column whose median is computed.
    n_steps : int or float
        Number of steps the range of each predictor is divided into; the
        half-width of the window is one step.

    Returns
    -------
    pandas.DataFrame
        A new frame with the float column ``median_{score_clmn}`` appended.
    """
    # Half-width of the window on each predictor.
    step = {
        clmn: (df_source[clmn].max() - df_source[clmn].min()) / n_steps
        for clmn in predictors
    }

    medians = []
    for i in range(len(df_source)):
        print(f"moving median participant {i}")
        # Boolean filtering returns new frames, so starting from the source
        # itself is safe and avoids copying the whole frame for every row.
        neighbours = df_source
        for clmn in predictors:
            centre = df_source[clmn].iloc[i]
            left = centre - step[clmn]
            right = centre + step[clmn]
            neighbours = neighbours[neighbours[clmn].ge(left) & neighbours[clmn].le(right)]
        medians.append(neighbours[score_clmn].median())

    # A plain array is assigned positionally, independent of the index labels.
    return df_source.assign(**{f"median_{score_clmn}": np.asarray(medians, dtype=float)})
         

In [20]:
# --- Run configuration -------------------------------------------------------
# Row limit passed as `nrows` when reading the CSV files; None reads every row.
n_samples = None
# Number of steps used for the moving-median outcomes; None skips that step
# and selects the non-median output directory.
n_steps_for_median = None

# Memory test selector: "nm" selects the numeric-memory score, any other value
# (including None) falls back to the pairs-matching score.
memory_experiment = None

# Index of the follow-up visit that is compared against visit 0.
second_visit = 1

# True: build z-scored composite outcomes with adding_rv_scores;
# False: build plain change and speed columns per cognitive test.
add_rv_scores = False

# Directory with the preprocessed input tables and the file name of each table.
input_dir = "/projects/prime/ukbb/preprocessed_data_2024"
input_filenames = {
    "diet": "diet_2024_bradbury_march.csv",
    "alcohol": "alcohol_2024_maart.csv",
    "diagnoses": "diagnoses_2024.csv",
    "lifestyle": "participants_age_date_smoke_MET_2024_jan.csv",
    "medications": "participants_medications_2024_jan.csv",
    "risk_factors": "blood_count_biochemistry_pressure_maart_2024.csv",
    "sociodemo": "edu_job_marriage_2024.csv",
    "cognition": "cognitive_2024_maart.csv",
    "metrics": "body_size_measures_gender_dates_2024_jan.csv"
  }

# Template for pointing at an imputed data set (kept for reference):
# imputed_dir = f"/projects/prime/ukbb/preprocessed_data_2024/imputations/ukbb_imputed_{memory_experiment}_sg_0_{second_visit}_rf/ukbb_imputed_{memory_experiment}_sg_0_{second_visit}_rf_instance_1"
# path_imputed =  f"{imputed_dir}/ukbb_imputed_{memory_experiment}_sg_0_{second_visit}_rf_instance_1.csv"

# With imputed_dir left at None the imputed-data cell is skipped entirely.
imputed_dir = None
path_imputed = None

# Results of a moving-median run are kept apart from those of a plain run.
if n_steps_for_median is not None:
    output_dir = f"/projects/prime/ukbb/results_2024/all_cogn_0_{second_visit}_median_28_07"
else:
    output_dir = f"/projects/prime/ukbb/results_2024/all_cogn_0_{second_visit}_28_07"

# exist_ok=True creates the directory only when needed, without the
# check-then-create race of a separate os.path.exists test.
os.makedirs(output_dir, exist_ok=True)

In [21]:

# Destination CSV for the non-imputed frame (rows with missing values are kept
# unless the moving-median step is enabled).
out_data_file = f"{output_dir}/data_with_na.csv"
# Destination CSV for the imputed frame; must be set together with imputed_dir.
#out_imputed_data_file = f"{output_dir}/data_ukbb_imputed_{memory_experiment}_sg_0_{second_visit}_rf_instance_1.csv"
out_imputed_data_file = None

In [22]:
# Column stem of the selected memory test: "nm" picks numeric memory, every
# other setting picks pairs matching.
memory_experiment_full_name = (
    "num_memory_max_digits_remembered_correctly"
    if memory_experiment == "nm"
    else "pairs_matching_sum_incorrect"
)

In [23]:

# Predictor columns, all taken at visit 0. The time_difference entry is not a
# column of any input file: it is derived from the visit dates after loading.
# NOTE(review): "pp_smoke_catgeory.0" is spelled this way here -- presumably it
# matches the header of the input CSV; confirm before "correcting" it.
predictors = ["gender.0", f"time_difference.0.{second_visit}", 
              "Diabetes_2.0",  "Dyslipidemia.0", "Depression.0", "Hypertension.0", 
  "age_years.0", "education_level.0", "marital_status.0", "pp_smoke_catgeory.0", "MET_score.0", "waist_cm.0",  "bmi_kg_m2.0", 
  "Syst_bp.0", "Diast_bp.0", "ldl_conv.0", "hdl_conv.0", "Triglycerides_conv.0", "HbA1c_conv.0",  "Pl_glucose_conv.0",   "Albumin.0", 
  "Leukocyte_count.0", "C_reactive_protein.0",
   "oily_fish_gpday_bradbury.0", "white_fish_gpday_bradbury.0", 
   "red_meat_bradbury_gpd.0", "poultry_gpday_bradbury.0",
   "processed_meat_gpday_bradbury.0", "veg_gpday_bradbury.0",
   "fruit_gpday_bradbury.0",
    "cereals_gpday_bradbury.0", "bread_gpday_bradbury.0", "cheese_gpday_bradbury.0",
    "milk_gpday_bradbury.0", "tea_gpday_bradbury.0",
    "red_wine_gpd.0", "white_wine_gpd.0", "fortified_gpd.0", "beer_cider_gpd.0", "spirits_gpd.0",
   "aspirin.0",  "anxiety_tr.0", "pain_tr.0", "TAZD_Thiazide.0",  
   "loop_diuretics.0",  "potassium_diuretics.0", "beta_blockers.0", "calcium_antagonists.0", 
   "ARA_II_Antagonists_of_angiotensin_II_receptors.0", "IECA_Angiotensin_converting_enzyme_inhibitors.0",
   "Other_Hypotensive.0","hypochol_statins.0", "hypochol_others.0", "insulin.0", "sulfonylurea.0", "thiazolidinediones.0", "non_sulfonylurea_insulin_secretagogues.0", "metformin_category.0", "vitamins_minerals.0"]
  

In [24]:
# Raw cognitive test columns needed at baseline (visit 0) and at the follow-up
# visit, listed test by test.
outcome_components = [
    f"{test_stem}.{visit}"
    for test_stem in (
        "num_memory_max_digits_remembered_correctly",
        "pairs_matching_sum_incorrect",
        "snap_game_true_pos_rt_avrg",
    )
    for visit in (0, second_visit)
]


In [25]:
# Columns to read from the input files. A requested time_difference predictor
# is not stored in any file, so it is swapped for the visit dates it is
# calculated from.
all_needed = ["f.eid", *predictors, *outcome_components]
for visit_idx in (1, 2, 3):
    derived_clmn = f"time_difference.0.{visit_idx}"
    if derived_clmn not in predictors:
        continue
    all_needed.remove(derived_clmn)
    all_needed.append(f"date_visit.{visit_idx}")
    if "date_visit.0" not in all_needed:
        all_needed.append("date_visit.0")

In [26]:
# Load the raw (non-imputed) tables one by one, keep only the needed columns,
# and merge them into a single frame `df`.
df_index = {}
first = True
for label, filename in input_filenames.items():
    print(label)
    full_path = f"{input_dir}/{filename}"
        
    # Read just the header row to find out which needed columns this file has.
    data_columns = pd.read_table(full_path, nrows=1, sep=',').columns
        
    needed_fields = data_columns[data_columns.isin(all_needed)]
        
    # The medication table additionally supplies the column used further down
    # to exclude participants (alpha_glucosidase_inhibitors.0 == 1).
    if label == "medications":
        needed_fields = needed_fields.tolist() + ["alpha_glucosidase_inhibitors.0"] 
            
    # Everything is read as text first; numeric conversion happens below.
    df_index[label] = pd.read_table(full_path, sep=',', usecols=needed_fields, dtype=str, nrows=n_samples)
        
        
    # Diagnostics: total rows, then rows without any missing value.
    print(len(df_index[label]))
    print(len(df_index[label].dropna(axis="rows")))
    if label == "cognition":
        # Per cognitive test: total rows, then rows observed at both visits.
        for cognitive_test in ["snap_game_true_pos_rt_avrg", "pairs_matching_sum_incorrect", "num_memory_max_digits_remembered_correctly"]:
            print(cognitive_test)
            clmns = [f"{cognitive_test}.0", f"{cognitive_test}.{second_visit}"]
            print(len(df_index[label][clmns ]))
            print(len(df_index[label][clmns].dropna(axis="rows")))
        
    # Convert every column except the participant id and the visit dates to float.
    for clmn in needed_fields:
        if clmn == "f.eid" or clmn.startswith("date_visit"):
            continue
        df_index[label][clmn] = df_index[label][clmn].astype(float)
        
    
    if first:
        df = df_index[label]
        first = False
    else:
        # NOTE(review): no `on=`/`how=` is given, so pandas joins (inner) on every
        # column name the two frames share. That is only "f.eid" as long as no
        # other needed column appears in more than one file -- confirm.
        df = pd.merge(df,df_index[label])
            
    # Row count of the merged frame so far.
    print(len(df))



   

# Exclude participants flagged with alpha_glucosidase_inhibitors.0 == 1; the
# flag column itself is not needed afterwards.
takes_agi = df["alpha_glucosidase_inhibitors.0"].eq(1)
df = df.loc[~takes_agi]
print("removed all participants with alpha_glucosidase_inhibitors.0 == 1:")  
df = df.drop(columns=["alpha_glucosidase_inhibitors.0"])
print(len(df))

# Derive the requested time_difference predictors from the visit dates.
for i in (1, 2, 3):
    if f"time_difference.0.{i}" in predictors:
        df = add_time_difference(df, i)

# Both column snapshots are taken before any derived column is added, so the
# squares are computed for the original columns only (not for the log columns).
log_feat = copy.deepcopy(df.columns)
qu_feat = copy.deepcopy(df.columns)
df, log_cols = add_logs(df, log_feat)
df, quad_cols = add_squares(df, qu_feat)
    
    
# For the "pm" setting, switch both identifiers to the log_ columns that
# add_logs created.
if memory_experiment == "pm":
    memory_experiment = f"log_{memory_experiment}"
    memory_experiment_full_name = f"log_{memory_experiment_full_name}"

if add_rv_scores:
    # z-scored composite outcome plus z-scored change of each component.
    df = adding_rv_scores(df, memory_experiment_full_name, second_visit)
    outcomes = [
        f"z_global.{second_visit}",
        f"z_change_{memory_experiment_full_name}.0.{second_visit}",
        f"z_change_snap_game_true_pos_rt_avrg.0.{second_visit}",
    ]
else:  # just ordinary progression and speed
    # `outcomes` is filled here as well: the moving-median step iterates over
    # it and would otherwise fail with a NameError in this branch.
    outcomes = []
    for cognitive_test in ["snap_game_true_pos_rt_avrg", "log_pairs_matching_sum_incorrect", "num_memory_max_digits_remembered_correctly"]:
        progression_clmn = f"{cognitive_test}.change.0.{second_visit}"
        speed_clmn = f"{cognitive_test}.speed.0.{second_visit}"

        assignment = {}
        # Change between the two visits ...
        assignment[progression_clmn] = df[f"{cognitive_test}.{second_visit}"] - df[f"{cognitive_test}.0"]
        # ... and the same change expressed per year (time difference is in days).
        assignment[speed_clmn] = (assignment[progression_clmn] * 365.25) / df[f"time_difference.0.{second_visit}"]
        df = df.assign(**assignment)

        outcomes += [progression_clmn, speed_clmn]

# Rows were filtered earlier, so renumber them 0..n-1 once, after all columns exist.
df.reset_index(drop=True, inplace=True)
   
   
    
    
## ADD MEDIANS

if n_steps_for_median is not None:
    # Remove rows with missing values. dropna leaves gaps in the index labels,
    # so renumber the rows to make label i and position i coincide again for
    # the row-by-row median computation.
    df = df.dropna(axis="rows").reset_index(drop=True)
    print(f"cleaned check up db has {len(df)} rows")
    for outcome in outcomes:
        df = add_median_score(df, predictors, outcome, n_steps_for_median)

df.to_csv(out_data_file, sep=',')


diet
502250
245456
502250
alcohol
502250
379704
502250
diagnoses
502250
502250
502250
lifestyle
502250
16298
502250
medications
502271
502271
502250
risk_factors
502250
382777
502250
sociodemo
502250
405002
502250
cognition
502250
0
snap_game_true_pos_rt_avrg
502250
20182
pairs_matching_sum_incorrect
502250
19379
num_memory_max_digits_remembered_correctly
502250
0
502250
metrics
502250
20293
502250
removed all participants with alpha_glucosidase_inhibitors.0 == 1:
502204
gender.0
log_gender.0
red_meat_bradbury_gpd.0
log_red_meat_bradbury_gpd.0
fruit_gpday_bradbury.0
log_fruit_gpday_bradbury.0
veg_gpday_bradbury.0
log_veg_gpday_bradbury.0
milk_gpday_bradbury.0
log_milk_gpday_bradbury.0
bread_gpday_bradbury.0
log_bread_gpday_bradbury.0
oily_fish_gpday_bradbury.0
log_oily_fish_gpday_bradbury.0
white_fish_gpday_bradbury.0
log_white_fish_gpday_bradbury.0
processed_meat_gpday_bradbury.0
log_processed_meat_gpday_bradbury.0
poultry_gpday_bradbury.0
log_poultry_gpday_bradbury.0
cheese_gpday_bra

In [27]:
# Load the imputed data set and run it through the same feature pipeline as
# the raw data. Skipped entirely while imputed_dir is None.
if imputed_dir is not None:
    # DataFrame.to_csv(None) returns the CSV as a string instead of writing a
    # file, so a missing output path would silently discard the result of the
    # whole cell. Fail before doing any work instead.
    if out_imputed_data_file is None:
        raise ValueError("out_imputed_data_file must be set when imputed_dir is given")

    df_imputed = pd.read_table(path_imputed, nrows=n_samples, sep=',')

    # Exclude flagged participants when the flag column is present.
    if "alpha_glucosidase_inhibitors.0" in df_imputed.columns:
        df_imputed = df_imputed.loc[df_imputed["alpha_glucosidase_inhibitors.0"] != 1]
        print("removed all participants with alpha_glucosidase_inhibitors.0 == 1:")  
        df_imputed = df_imputed.drop("alpha_glucosidase_inhibitors.0", axis=1)
        print(len(df_imputed))

    # Derive the requested time_difference predictors from the visit dates.
    for i in range(1, 4):
        if f"time_difference.0.{i}" in predictors:
            df_imputed = add_time_difference(df_imputed, i)

    # Snapshot the columns before adding derived ones so that squares are
    # computed for the original columns only.
    log_feat = copy.deepcopy(df_imputed.columns)
    qu_feat = copy.deepcopy(df_imputed.columns)
    df_imputed, log_cols = add_logs(df_imputed, log_feat)
    df_imputed, quad_cols = add_squares(df_imputed, qu_feat)

    # Equality with "pm" already implies that no "log_" prefix has been added
    # yet, so the prefix is applied at most once.
    if memory_experiment == "pm":
        memory_experiment = f"log_{memory_experiment}"
        memory_experiment_full_name = f"log_{memory_experiment_full_name}"

    df_imputed = adding_rv_scores(df_imputed, memory_experiment_full_name, second_visit)

    outcomes = [
        f"z_global.{second_visit}",
        f"z_change_{memory_experiment_full_name}.0.{second_visit}",
        f"z_change_snap_game_true_pos_rt_avrg.0.{second_visit}",
    ]
    # Renumber rows 0..n-1 after the participant filter above.
    df_imputed.reset_index(drop=True, inplace=True)

    if n_steps_for_median is not None:
        for outcome in outcomes:
            df_imputed = add_median_score(df_imputed, predictors, outcome, n_steps_for_median)
    df_imputed.to_csv(out_imputed_data_file, sep=',')



In [28]:
def describe_population(df_, descr):
    """Print the prevalence of four comorbidities in ``df_``.

    ``descr`` is printed first as a heading. For each of ``Diabetes_2.0``,
    ``Dyslipidemia.0``, ``Depression.0`` and ``Hypertension.0`` that is a
    column of ``df_``, the column sum is reported as a percentage of all rows.
    When ``gender.0`` is present, the rows where comorbidity + ``gender.0``
    equals 2.0 are reported as well, again as a percentage of *all* rows
    (presumably both are 0/1 flags with ``gender.0 == 1`` meaning female --
    confirm the coding).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Population to describe.
    descr : str
        Heading printed before the figures.
    """
    print(descr)
    n_participants = float(len(df_))
    has_gender = "gender.0" in df_.columns

    for comor in ("Diabetes_2.0", "Dyslipidemia.0", "Depression.0", "Hypertension.0"):
        if comor not in df_.columns:
            continue

        n_with_comor = df_[comor].sum()
        share = (float(n_with_comor) * 100) / n_participants
        print(f"The % of participants with {comor} is {share}")

        if not has_gender:
            continue

        n_female_with_comor = (df_[comor] + df_["gender.0"]).eq(2.0).sum()
        share_female = (float(n_female_with_comor) * 100) / n_participants
        print(f"The % of female participants with {comor} is {share_female}")

In [29]:
# Comorbidity prevalence in the raw frame, and in the imputed frame when one
# was loaded (df_imputed only exists if imputed_dir is set).
describe_population(df, "not imputed")
if imputed_dir is not None:
    describe_population(df_imputed, "imputed")

not imputed
The % of participants with Diabetes_2.0 is 4.974074280571242
The % of female participants with Diabetes_2.0 is 1.8596028705466305
The % of participants with Dyslipidemia.0 is 13.83800208680138
The % of female participants with Dyslipidemia.0 is 5.855588565602822
The % of participants with Depression.0 is 6.058494157752627
The % of female participants with Depression.0 is 3.9444130273753295
The % of participants with Hypertension.0 is 27.64912266728262
The % of female participants with Hypertension.0 is 13.1844827998184


In [30]:
# Per cognitive test: number of rows with a non-missing change score, then the
# number of rows where both visit scores and the change score are all present.
for test_stem in (
    "snap_game_true_pos_rt_avrg",
    "log_pairs_matching_sum_incorrect",
    "num_memory_max_digits_remembered_correctly",
):
    change_clmn = f"{test_stem}.change.0.{second_visit}"
    visit_clmns = [f"{test_stem}.0", f"{test_stem}.{second_visit}"]
    print(len(df[[change_clmn]].dropna(axis="rows")))
    print(len(df[visit_clmns + [change_clmn]].dropna(axis="rows")))


20180
20180
19378
19378
0
0


In [31]:
# Rows with both snap-game visit scores and the speed score present ...
snap_game_clmns = [
    "snap_game_true_pos_rt_avrg.0",
    f"snap_game_true_pos_rt_avrg.{second_visit}",
    f"snap_game_true_pos_rt_avrg.speed.0.{second_visit}",
]
print(len(df[snap_game_clmns].dropna(axis="rows")))

# ... and rows with a non-missing speed score for the two memory tests.
for test_stem in ("log_pairs_matching_sum_incorrect", "num_memory_max_digits_remembered_correctly"):
    print(len(df[[f"{test_stem}.speed.0.{second_visit}"]].dropna(axis="rows")))

KeyError: "['snap_game_true_pos_rt_avrg.speed.0.'] not in index"