In [None]:
import pandas as pd
import numpy as np
import matplotlib
# matplotlib.use('Agg')
%matplotlib inline
matplotlib.use('module://ipykernel.pylab.backend_inline')
import matplotlib.pyplot as plt
import math
field_id = "f.eid"

# UK Biobank field id -> readable column stem used for the renamed columns.
map_ukbb_freindly_name = dict([
    (20023, "snap_game_true_pos_rt_avrg"),
    (401, "snap_game_index_card_a"),
    (402, "snap_game_index_card_b"),
    (403, "snap_game_number_button_presses"),
    (404, "snap_game_ms_first_press"),
    (399, "pairs_matching_incorrect_in_round"),
    (400, "pairs_matching_completion_time_round"),
    (20132, "online_pairs_matching_incorrect_in_round"),
    # (20133, "online_pairs_matching_completion_time_round"),
    (20134, "date_online_pairs_comletion"),
    (4282, "num_memory_max_digits_remembered_correctly"),
    (4285, "num_memory_completion_time"),
    (53, "date_visit"),
    (21022, "age_visit"),
    (6138, "qualifications"),
])

# Answer coding of the "qualifications" field:
#    1  College or University degree
#    2  A levels/AS levels or equivalent
#    3  O levels/GCSEs or equivalent
#    4  CSEs or equivalent
#    5  NVQ or HND or HNC or equivalent
#    6  Other professional qualifications eg: nursing, teaching
#   -7  None of the above
#   -3  Prefer not to answer
#
# Grouping version from https://academic.oup.com/ije/article/49/1/246/5470096:
# (College or University degree, vocational qualifications (other professional
# qualifications/NVQ or HND or HNC), optional national exams at ages 17 to 18
# years (A levels/AS levels), national exams at age 16 years
# (O levels/GCSEs/CSEs), none of the above, unknown (prefer not to answer))
#
# Answer code -> education level. Codes -7 and -3 have no entry, so they map
# to a missing level.
education_level_map = {1: 3, 2: 2, 3: 1, 4: 1, 5: 3, 6: 3}


In [None]:
''' 
Dietary article
Participants who completed the repeat assessment centre visit or at least one 24-h dietary assessment were more likely to have a university degree or vocational qualification and slightly less likely to smoke, compared with the full cohort.

'''

n_samples = None  # set to an int to read only the first rows while debugging
data_filename = "/projects/prime/ukbb/23_01_2024_selected.tab"
output_dir = "/projects/prime/ukbb/preprocessed_data_2024/"
out_file = output_dir + "cognitive_2024_maart.csv"

withdrawn_participants_file = '/projects/prime/ukbb/withdraw6244_207_20231205.txt'
# data_filename = '~/PRIME/python_raw_cognitive_tmp_distorted.csv'

# Read the withdrawn ids as strings. The main table is loaded with dtype=str,
# and an int-vs-str comparison in isin() silently matches nothing.
withdrawn = pd.read_csv(withdrawn_participants_file, header=None, dtype=str)
withdrawn_col = list(withdrawn.iloc[:, 0])

### find out the list of all columns (only the header is needed)
data_columns = pd.read_table(data_filename, nrows=1, sep='\t').columns

# make a list of columns needed
needed_fields = ["f.eid"]
for ukbb_field in map_ukbb_freindly_name:
    print(ukbb_field)
    # The trailing dot is essential: without it field 53 would also select
    # every column of fields 530, 5364, ... that happens to be in the file.
    column_prefix = f"f.{ukbb_field}."
    current_list = [col for col in data_columns if col.startswith(column_prefix)]
    needed_fields = needed_fields + current_list
    print(current_list)

In [None]:
print("start reading the table .... ")

# Everything is read as text; numeric columns are cast later, field by field.
df = pd.read_table(data_filename, sep='\t', usecols=needed_fields, dtype=str, nrows=n_samples)

print("finished reading the table .... ")
print("# participants before checking entrance age")
print(len(df))
# Keep only participants with a recorded age at the first visit.
df = df[~df["f.21022.0.0"].isnull()]
print("# participants after checking entrance age but yet with withdrawn ")
print(len(df))

# The eid column holds strings (dtype=str above), so the withdrawn ids must be
# compared as strings too; comparing against ints would drop nobody.
withdrawn_ids = {str(eid).strip() for eid in withdrawn_col}
df = df[~df[field_id].isin(withdrawn_ids)]

print("# participants without withdrawn ")
print(len(df))

In [None]:
# Build the raw-column -> friendly-name mapping and cast numeric columns to
# float along the way. Date columns stay as text.
date_stems = ("date_visit", "date_online_pairs_comletion")
map_fields = {}
for visit in range(4):  # over instances (visits)
    for field_code, stem in map_ukbb_freindly_name.items():
        # entrance age and the online test only exist for the first instance
        if visit > 0 and stem.startswith(("age_visit", "online")):
            continue

        raw_prefix = f"f.{field_code}.{visit}"
        if stem.startswith("qualifications"):
            # multiple answers were possible
            plan = [(f"{raw_prefix}.{k}", f"{stem}.{visit}.{k}", True) for k in range(6)]
        elif stem.startswith("pairs_matching"):
            # three rounds, numbered 1..3 in the raw data
            plan = [(f"{raw_prefix}.{k}", f"{stem}_{k}.{visit}", True) for k in (1, 2, 3)]
        elif stem.startswith("online_pairs_matching"):
            # three rounds, numbered 0..2 in the raw data; renamed to 1..3
            plan = [(f"{raw_prefix}.{k}", f"{stem}_{k + 1}.{visit}", True) for k in (0, 1, 2)]
        else:
            plan = [(f"{raw_prefix}.0", f"{stem}.{visit}", not stem.startswith(date_stems))]

        for raw_name, new_name, as_float in plan:
            if as_float:
                df[raw_name] = df[raw_name].astype(float)
            map_fields[raw_name] = new_name


print(map_fields)
df.rename(columns=map_fields, inplace=True)
print(df.columns)



In [None]:
# df = df[df["age_visit.0"].ge(48)]

# A round with a non-positive completion time gets its error count blanked.
# NOTE(review): presumably such a time marks a round that was not completed;
# confirm against the coding of the completion-time field.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
for i in range(4):
    for j in range(1, 4):
        df.loc[df[f"pairs_matching_completion_time_round_{j}.{i}"].le(0), f"pairs_matching_incorrect_in_round_{j}.{i}"] = np.nan

# Disabled: the same rule for the online test (its completion-time field is
# not loaded). Kept as comments so the cell no longer echoes a bare string.
# for j in range(1,4):
#         df.loc[df[f"online_pairs_matching_completion_time_round_{j}.{i}"].le(0), f"online_pairs_matching_incorrect_in_round_{j}.{i}"] = np.nan

In [None]:
# Total number of pairs-matching errors over rounds 1 and 2, one column per
# visit. A missing value in either round leaves the sum missing.
assignment = {}
for visit in range(4):
    first_round = df[f"pairs_matching_incorrect_in_round_1.{visit}"]
    second_round = df[f"pairs_matching_incorrect_in_round_2.{visit}"]
    assignment[f"pairs_matching_sum_incorrect.{visit}"] = first_round.add(second_round)

df = df.assign(**assignment)

In [None]:
# make education levels
# Each visit allows up to six qualification answers (array positions 0-5).
# The level of a visit is the highest mapped level over all of its answers.
# Answers without an entry in education_level_map, and missing answers, are
# ignored; the level stays NaN when no answer maps to a level.
#
# All visits are collected first and assigned once. Re-applying the growing
# dict inside the loop reset the earlier visits to their first-answer-only
# value, so only the last visit kept the maximum.
assignment = {}
for i in range(4):
    mapped_levels = pd.concat(
        [df[f"qualifications.{i}.{j}"].map(education_level_map) for j in range(6)],
        axis=1,
    )
    # max() skips NaN, so a single valid answer is enough to define the level
    assignment[f"education_level.{i}"] = mapped_levels.max(axis=1)
df = df.assign(**assignment)
                                                                                                                                
        
        
    
  

In [None]:
# Days elapsed between the first visit and each later visit.
seconds_per_day = 24 * 3600
first_visit_date = pd.to_datetime(df["date_visit.0"])
assignment = {}
for later_visit in (1, 2, 3):
    elapsed = pd.to_datetime(df[f"date_visit.{later_visit}"]) - first_visit_date
    assignment[f"time_difference.0.{later_visit}"] = elapsed.dt.total_seconds() / seconds_per_day
df = df.assign(**assignment)

In [None]:
# Negative "max digits remembered" values are blanked.
# NOTE(review): presumably negative values are special codes rather than
# scores; confirm against the field coding.
# Visit 1 is cleaned as well: its column feeds numeric_memory_change.0.1, so
# leaving negative codes in it would distort that difference.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
for visit in range(4):
    digits_col = f"num_memory_max_digits_remembered_correctly.{visit}"
    df.loc[df[digits_col].lt(0), digits_col] = np.nan

In [None]:

# For every later visit: change since visit 0, and the same change expressed
# per year (time_difference is in days).
days_per_year = 365.25
# (source column stem, change column stem, change-rate column stem)
whole_test_measures = (
    ("pairs_matching_sum_incorrect", "pairs_matching_sum_incorrect_change", "pairs_matching_sum_incorrect_change_rate"),
    ("snap_game_true_pos_rt_avrg", "snap_game_true_pos_rt_avrg_change", "snap_game_true_pos_rt_avrg_change_rate"),
    ("num_memory_max_digits_remembered_correctly", "numeric_memory_change", "numeric_memory_change_rate"),
)

assignment = {}
for later_visit in range(1, 4):
    elapsed_days = df[f"time_difference.0.{later_visit}"]

    # pairs matching, one pair of columns per round
    for game_round in range(1, 4):
        stem = f"pairs_matching_incorrect_in_round_{game_round}"
        delta = df[f"{stem}.{later_visit}"] - df[f"{stem}.0"]
        assignment[f"pairs_matching_change_round_{game_round}.0.{later_visit}"] = delta
        assignment[f"pairs_matching_change_rate_round_{game_round}.0.{later_visit}"] = (delta * days_per_year) / elapsed_days

    # summed pairs matching errors, snap game, numeric memory
    for source_stem, change_stem, rate_stem in whole_test_measures:
        delta = df[f"{source_stem}.{later_visit}"] - df[f"{source_stem}.0"]
        assignment[f"{change_stem}.0.{later_visit}"] = delta
        assignment[f"{rate_stem}.0.{later_visit}"] = (delta * days_per_year) / elapsed_days

df = df.assign(**assignment)



In [None]:

# This cell repeated the previous cell line for line and recomputed identical
# columns. The computation lives in the previous cell; this cell only checks
# that every derived change / change-rate column is present.
expected_change_columns = [
    f"{stem}.0.{i}"
    for i in range(1, 4)
    for stem in (
        [f"pairs_matching_change_round_{j}" for j in range(1, 4)]
        + [f"pairs_matching_change_rate_round_{j}" for j in range(1, 4)]
        + [
            "pairs_matching_sum_incorrect_change",
            "pairs_matching_sum_incorrect_change_rate",
            "snap_game_true_pos_rt_avrg_change",
            "snap_game_true_pos_rt_avrg_change_rate",
            "numeric_memory_change",
            "numeric_memory_change_rate",
        ]
    )
]
missing_change_columns = [col for col in expected_change_columns if col not in df.columns]
assert not missing_change_columns, f"missing derived columns: {missing_change_columns}"



In [None]:
# Quick look at the visit 0 -> 2 changes of the three tests.
for derived_col in (
    "pairs_matching_sum_incorrect_change.0.2",
    "numeric_memory_change.0.2",
    "snap_game_true_pos_rt_avrg_change.0.2",
):
    print(df[derived_col])

In [None]:
# online: we need for learning effect for the second visit
# learning_effect_prior.2 counts the pairs-matching rounds a participant had
# already played before visit 2: the rounds of visit 1, plus the online rounds
# when the online test was completed on or before the visit-2 date.
# It stays NaN for participants without a round-1 result at visit 2.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
assignment = {"learning_effect_prior.2": np.nan, "online_before.2": None}

df = df.assign(**assignment)
df.loc[df["pairs_matching_incorrect_in_round_1.2"].notnull(), "learning_effect_prior.2"] = 0
# A missing date on either side yields a missing difference, which .ge(0) turns into False.
df["online_before.2"] = (pd.to_datetime(df["date_visit.2"]) - pd.to_datetime(df["date_online_pairs_comletion.0"])).dt.total_seconds().ge(0)
print(df["online_before.2"])
for j in range(1,4):
    df.loc[df[f"pairs_matching_incorrect_in_round_{j}.1"].notnull(), "learning_effect_prior.2"] = df["learning_effect_prior.2"] + 1
    df.loc[df["online_before.2"] & df[f"online_pairs_matching_incorrect_in_round_{j}.0"].notnull(), "learning_effect_prior.2"] = df["learning_effect_prior.2"] + 1

print(df["learning_effect_prior.2"])

In [None]:
# Persist the full derived table (row index included) as comma-separated text.
df.to_csv(path_or_buf=out_file, sep=",")

In [None]:

df_checkup_nm = df[~df["num_memory_max_digits_remembered_correctly.0"].isnull() &
                    ~df["num_memory_max_digits_remembered_correctly.2"].isnull()]
df_checkup_nm_sg = df_checkup_nm[~df_checkup_nm["snap_game_true_pos_rt_avrg.0"].isnull() &
                                    ~df_checkup_nm["snap_game_true_pos_rt_avrg.2"].isnull()]
df_checkup_nm_sg_edu =df_checkup_nm_sg[~df_checkup_nm_sg["education_level.0"].isnull()]
print(f"participants with nm and sg both visits with edu defined: {len(df_checkup_nm_sg_edu)}")

In [None]:
# all three cognitive tests
df_checkup_nm_sg_pm_1_edu = df_checkup_nm_sg_edu[~df_checkup_nm_sg_edu["pairs_matching_incorrect_in_round_1.0"].isnull() &
                                    ~df_checkup_nm_sg_edu["pairs_matching_incorrect_in_round_1.2"].isnull()]
print(f"participants with nm, pm_1,  and sg three visits with edu defined: {len(df_checkup_nm_sg_pm_1_edu)}")


In [None]:
df_checkup_nm_sg_pm_edu = df_checkup_nm_sg_edu[~df_checkup_nm_sg_edu["pairs_matching_sum_incorrect.0"].isnull() &
                                    ~df_checkup_nm_sg_edu["pairs_matching_sum_incorrect.2"].isnull()]
print(f"participants with nm, pm,  and sg both visits with edu defined: {len(df_checkup_nm_sg_pm_edu)}")

In [None]:
df_checkup_pm_1 = df[~df["pairs_matching_incorrect_in_round_1.0"].isnull() &
                    ~df["pairs_matching_incorrect_in_round_1.2"].isnull()]
df_checkup_pm_1_sg = df_checkup_pm_1[~df_checkup_pm_1["snap_game_true_pos_rt_avrg.0"].isnull() &
                                    ~df_checkup_pm_1["snap_game_true_pos_rt_avrg.2"].isnull()]

df_checkup_pm_1_sg_edu =df_checkup_pm_1_sg[~df_checkup_pm_1_sg["education_level.0"].isnull()]
print(f"participants with pm_1 and sg both visits with edu defined: {len(df_checkup_pm_1_sg_edu)}")

In [None]:
# visit 0 and 1
df_01_checkup_pm_1 = df[~df["pairs_matching_incorrect_in_round_1.0"].isnull() &
                    ~df["pairs_matching_incorrect_in_round_1.1"].isnull()]
df_01_checkup_pm_1_sg = df_01_checkup_pm_1[~df_01_checkup_pm_1["snap_game_true_pos_rt_avrg.0"].isnull() &
                                    ~df_01_checkup_pm_1["snap_game_true_pos_rt_avrg.1"].isnull()]

df_01_checkup_pm_1_sg_edu =df_01_checkup_pm_1_sg[~df_01_checkup_pm_1_sg["education_level.0"].isnull()]
print(f"participants with pm_1 and sg both visits 0 and 1 with edu defined: {len(df_01_checkup_pm_1_sg_edu)}")

In [None]:
df_checkup_pm = df[~df["pairs_matching_sum_incorrect.0"].isnull() &
                    ~df["pairs_matching_sum_incorrect.2"].isnull()]
df_checkup_pm_sg = df_checkup_pm[~df_checkup_pm["snap_game_true_pos_rt_avrg.0"].isnull() &
                                    ~df_checkup_pm["snap_game_true_pos_rt_avrg.2"].isnull()]
df_checkup_pm_sg_edu =df_checkup_pm_sg[~df_checkup_pm_sg["education_level.0"].isnull()]
print(f"participants with pm and sg both visits with edu defined: {len(df_checkup_pm_sg_edu)}")

In [None]:
df_01_checkup_pm = df[~df["pairs_matching_sum_incorrect.0"].isnull() &
                    ~df["pairs_matching_sum_incorrect.1"].isnull()]


In [None]:
df_01_checkup_pm_sg = df_01_checkup_pm[~df_01_checkup_pm["snap_game_true_pos_rt_avrg.0"].isnull() &
                                    ~df_01_checkup_pm["snap_game_true_pos_rt_avrg.1"].isnull()]


In [None]:
# Visits 0 and 1 cohort restricted to a defined education level at visit 0.
df_01_checkup_pm_sg_edu = df_01_checkup_pm_sg[~df_01_checkup_pm_sg["education_level.0"].isnull()]

# Report the size of the cohort built above. The count was previously taken
# from df_checkup_pm_sg_edu, which is the visits 0 and 2 cohort.
print(f"participants with pm and sg both visits 0 and 1 with edu defined: {len(df_01_checkup_pm_sg_edu)}")

In [None]:
df_checkup_nm_sg_pm_1_edu["time_difference.0.2"].plot(kind='hist', title='Density time_difference.0.2 for all nm, pm_1, sg, with edu.0 defined')
plt.show()

df_checkup_nm_sg_pm_edu["time_difference.0.2"].plot(kind='hist', title='Density time_difference.0.2 for all nm, pm, sg with edu.0 defined')
plt.show()

In [None]:

df_checkup_nm_sg_edu["time_difference.0.2"].plot(kind='hist', title='Density time_difference.0.2 for all nm, sg with edu.0 defined')
plt.show()

In [None]:
df_checkup_pm_1_sg_edu["time_difference.0.2"].plot(kind='hist', title='Density time_difference.0.2 for all pm_1, sg, edu defined')
plt.show()

In [None]:
df_checkup_pm_sg_edu["time_difference.0.2"].plot(kind='hist', title='Density time_difference.0.2 for all pm, sg, edu defined ')
plt.show()

In [None]:
df_01_checkup_pm_1_sg_edu["time_difference.0.1"].plot(kind='hist', title='Density time_difference.0.1 for all pm_1, sg, edu defined ')
plt.show()
df_01_checkup_pm_sg_edu["time_difference.0.1"].plot(kind='hist', title='Density time_difference.0.1 for all pm, sg, edu defined ')
plt.show()

In [None]:
def show_trends(dataset, predictor, colname_, step, name_data_set, cat = False):
    """Plot the binned median and mean of ``colname_`` against ``predictor``
    and print spread statistics of ``colname_`` over the whole dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the ``predictor`` and ``colname_`` columns.
    predictor : str
        Column used on the x axis and for binning.
    colname_ : str
        Column that is summarised per bin and for the spread statistics.
    step : number
        Increment between successive bin values. When ``cat`` is False it is
        also the half-width of every window, so neighbouring windows overlap.
    name_data_set : str
        Label printed in the report header.
    cat : bool, default False
        True: rows are matched by equality with the bin value, starting at
        the minimum of ``predictor`` and continuing while the value is below
        the maximum + 1; the result is drawn as a scatter plot.
        False: bin centres start at minimum + ``step`` and continue while
        below the maximum; each bin takes the rows within
        ``[centre - step, centre + step]`` and the result is drawn as lines.

    Returns
    -------
    None
        Output is a matplotlib figure plus printed text. When no bin contains
        any row, the plot is skipped instead of failing on an empty array;
        for an empty ``dataset`` the spread statistics are skipped as well.
    """
    min_predictor = np.min(dataset[predictor])
    max_predictor = np.max(dataset[predictor])
    if cat:
        lower_bound = min_predictor
        upper_bound = max_predictor + 1
    else:
        lower_bound = min_predictor + step
        upper_bound = max_predictor

    row_list_median = []
    row_list_mean = []
    val = lower_bound
    # If the predictor has no valid value the bounds are NaN and the loop body
    # never runs (NaN < NaN is False).
    while val < upper_bound:
        if cat:
            df_current = dataset[dataset[predictor].eq(val)]
        else:
            df_current = dataset[dataset[predictor].ge(val - step) &
                                 dataset[predictor].le(val + step)]

        # Empty bins are left out of the curves.
        if len(df_current) > 0:
            # NOTE(review): np.median and np.mean may treat missing values in
            # colname_ differently for pandas input; confirm the cohorts
            # passed in have no NaN in colname_.
            row_list_median.append([val, np.median(df_current[colname_])])
            row_list_mean.append([val, np.mean(df_current[colname_])])
        val = val + step

    print("******")
    print("Here is the report on ")
    print(name_data_set)

    if row_list_median:
        matrix_median = np.array(row_list_median)
        matrix_mean = np.array(row_list_mean)
        draw = plt.scatter if cat else plt.plot
        draw(matrix_median[:, 0], matrix_median[:, 1], label = "median")
        draw(matrix_mean[:, 0], matrix_mean[:, 1], label = "mean")
        plt.xlabel(predictor)
        plt.ylabel(colname_)
        plt.legend()
        plt.show()
    else:
        # np.array([]) is one-dimensional, so the column slicing above would
        # raise IndexError; report the situation instead.
        print(f"no populated bins for predictor {predictor}: nothing to plot")

    print(f"sample size {len(dataset)}")
    if len(dataset) == 0:
        # Range and percentiles are undefined without data.
        return

    std_dev = np.std(dataset[colname_])
    variance = np.var(dataset[colname_])
    data_range = np.ptp(dataset[colname_])  # Peak to peak (max-min)
    iqr = np.subtract(*np.percentile(dataset[colname_], [75, 25]))  # 75th percentile - 25th percentile
    # cv = std_dev / np.mean(dataset[colname_])
    print(f"Standard Deviation: {std_dev}")
    print(f"Variance: {variance}")
    print(f"Range: {data_range}")
    print(f"IQR: {iqr}")
    # print(f"Coefficient of Variation: {cv}")
    

In [None]:
print(df.columns)

In [None]:
# all thre cognitive tests, pm round 1
print('****')
print("Change all thre cognitive tests, pm round 1")

show_trends(df_checkup_nm_sg_pm_1_edu, "age_visit.0", "numeric_memory_change.0.2", 1, "nm_sg_pm_1_edu")


In [None]:
show_trends(df_checkup_nm_sg_pm_1_edu, "age_visit.0", "numeric_memory_change_rate.0.2", 1, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "time_difference.0.2", "numeric_memory_change.0.2", 30, "nm_sg_pm_1_edu")


In [None]:
show_trends(df_checkup_nm_sg_pm_1_edu, "time_difference.0.2", "numeric_memory_change_rate.0.2", 30, "nm_sg_pm_1_edu")


In [None]:
# show_trends(df_checkup_nm_sg_pm_1_edu, "time_difference.0.2", "test", 30, "nm_sg_pm_1_edu")


In [None]:
show_trends(df_checkup_nm_sg_pm_1_edu, "education_level.0", "numeric_memory_change.0.2", 1, "nm_sg_pm_1_edu", True)


In [None]:

show_trends(df_checkup_nm_sg_pm_1_edu, "age_visit.0", "snap_game_true_pos_rt_avrg_change.0.2", 1, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "time_difference.0.2", "snap_game_true_pos_rt_avrg_change.0.2", 30, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "education_level.0", "snap_game_true_pos_rt_avrg_change.0.2", 1, "nm_sg_pm_1_edu", True)

show_trends(df_checkup_nm_sg_pm_1_edu, "age_visit.0", "pairs_matching_change_round_1.0.2", 1, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "time_difference.0.2", "pairs_matching_change_round_1.0.2", 30, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "education_level.0", "pairs_matching_change_round_1.0.2", 1, "nm_sg_pm_1_edu", True)

In [None]:
print('****')
print(" Speed all thre cognitive tests, pm round 1")

show_trends(df_checkup_nm_sg_pm_1_edu, "age_visit.0", "numeric_memory_change_rate.0.2", 1, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "time_difference.0.2", "numeric_memory_change_rate.0.2", 30, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "education_level.0", "numeric_memory_change_rate.0.2", 1, "nm_sg_pm_1_edu", True)

show_trends(df_checkup_nm_sg_pm_1_edu, "age_visit.0", "snap_game_true_pos_rt_avrg_change_rate.0.2", 1, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "time_difference.0.2", "snap_game_true_pos_rt_avrg_change_rate.0.2", 30, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "education_level.0", "snap_game_true_pos_rt_avrg_change_rate.0.2", 1, "nm_sg_pm_1_edu", True)


In [None]:

show_trends(df_checkup_nm_sg_pm_1_edu, "age_visit.0", "pairs_matching_change_rate_round_1.0.2", 1, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "time_difference.0.2", "pairs_matching_change_rate_round_1.0.2", 30, "nm_sg_pm_1_edu")
show_trends(df_checkup_nm_sg_pm_1_edu, "education_level.0", "pairs_matching_change_rate_round_1.0.2", 1, "nm_sg_pm_1_edu", True)

In [None]:
# all three cognitive tests, pairs matching summed over 2 rounds

print('****', "Change all thre cognitive tests, pm 2 rounds", sep="\n")

for outcome_col in (
    "numeric_memory_change.0.2",
    "snap_game_true_pos_rt_avrg_change.0.2",
    "pairs_matching_sum_incorrect_change.0.2",
):
    show_trends(df_checkup_nm_sg_pm_edu, "age_visit.0", outcome_col, 1, "nm_sg_pm_edu")
    show_trends(df_checkup_nm_sg_pm_edu, "time_difference.0.2", outcome_col, 30, "nm_sg_pm_edu")
    show_trends(df_checkup_nm_sg_pm_edu, "education_level.0", outcome_col, 1, "nm_sg_pm_edu", True)


In [None]:
# all three cognitive tests, pairs matching summed over 2 rounds

print('****')
print("Speed all thre cognitive tests, pm 2 rounds")

# All three views use the change *rate*. The education view used the plain
# change column, which does not belong in this "Speed" section.
show_trends(df_checkup_nm_sg_pm_edu, "age_visit.0", "numeric_memory_change_rate.0.2", 1, "nm_sg_pm_edu")
show_trends(df_checkup_nm_sg_pm_edu, "time_difference.0.2", "numeric_memory_change_rate.0.2", 30, "nm_sg_pm_edu")
show_trends(df_checkup_nm_sg_pm_edu, "education_level.0", "numeric_memory_change_rate.0.2", 1, "nm_sg_pm_edu", True)


In [None]:

show_trends(df_checkup_nm_sg_pm_edu, "age_visit.0", "snap_game_true_pos_rt_avrg_change_rate.0.2", 1, "nm_sg_pm_edu")
show_trends(df_checkup_nm_sg_pm_edu, "time_difference.0.2", "snap_game_true_pos_rt_avrg_change_rate.0.2", 30, "nm_sg_pm_edu")
show_trends(df_checkup_nm_sg_pm_edu, "education_level.0", "snap_game_true_pos_rt_avrg_change_rate.0.2", 1, "nm_sg_pm_edu", True)


In [None]:

show_trends(df_checkup_nm_sg_pm_edu, "age_visit.0", "pairs_matching_sum_incorrect_change_speed", 1, "nm_sg_pm_edu")
show_trends(df_checkup_nm_sg_pm_edu, "time_difference.0.2", "pairs_matching_sum_incorrect_change_speed", 30, "nm_sg_pm_edu")
show_trends(df_checkup_nm_sg_pm_edu, "education_level.0", "pairs_matching_sum_incorrect_change_speed", 1, "nm_sg_pm_edu", True)

In [None]:


# Numeric memory change and change rate in the nm + sg cohort.
for outcome_col in (
    "numeric_memory_change.0.2",
    "numeric_memory_change_rate.0.2",
):
    show_trends(df_checkup_nm_sg_edu, "age_visit.0", outcome_col, 1, "nm_sg_with_edu")
    show_trends(df_checkup_nm_sg_edu, "time_difference.0.2", outcome_col, 30, "nm_sg_with_edu")
    show_trends(df_checkup_nm_sg_edu, "education_level.0", outcome_col, 1, "nm_sg_with_edu", True)



In [None]:
print('****')
print("Change & Speed pm_1 sg edu")

show_trends(df_checkup_pm_1_sg_edu, "age_visit.0", "pairs_matching_change_round_1.0.2", 1, "pm_1_sg")
show_trends(df_checkup_pm_1_sg_edu, "time_difference.0.2", "pairs_matching_change_round_1.0.2", 30, "pm_1_sg")
show_trends(df_checkup_pm_1_sg_edu, "education_level.0", "pairs_matching_change_round_1.0.2", 1, "pm_1_sg", True)
show_trends(df_checkup_pm_1_sg_edu, "age_visit.0","time_difference.0.2", 1, "pm_1_sg_edu")

show_trends(df_checkup_pm_1_sg_edu, "age_visit.0", "pairs_matching_change_rate_round_1.0.2", 1, "pm_1_sg")
show_trends(df_checkup_pm_1_sg_edu, "time_difference.0.2", "pairs_matching_change_rate_round_1.0.2", 30, "pm_1_sg")
show_trends(df_checkup_pm_1_sg_edu, "education_level.0", "pairs_matching_change_rate_round_1.0.2", 1, "pm_1_sg", True)

In [None]:
print('****')
print("Change & Speed pm_1 sg edu")

show_trends(df_checkup_pm_1_sg_edu, "age_visit.0", "snap_game_true_pos_rt_avrg_change.0.2", 1, "pm_1_sg_edu")
show_trends(df_checkup_pm_1_sg_edu, "time_difference.0.2", "snap_game_true_pos_rt_avrg_change.0.2", 30, "pm_1_sg_edu")
show_trends(df_checkup_pm_1_sg_edu, "education_level.0", "snap_game_true_pos_rt_avrg_change.0.2", 1, "pm_1_sg_edu", True)

show_trends(df_checkup_pm_1_sg_edu, "age_visit.0", "snap_game_true_pos_rt_avrg_change_rate.0.2", 1, "pm_1_sg_edu")
show_trends(df_checkup_pm_1_sg_edu, "time_difference.0.2", "snap_game_true_pos_rt_avrg_change_rate.0.2", 30, "pm_1_sg_edu")
show_trends(df_checkup_pm_1_sg_edu, "education_level.0", "snap_game_true_pos_rt_avrg_change_rate.0.2", 1, "pm_1_sg_edu", True)


In [None]:

print('****', "Change & Speed pm sg", sep="\n")
# Summed pairs-matching errors: change first, then change rate.
for outcome_col in (
    "pairs_matching_sum_incorrect_change.0.2",
    "pairs_matching_sum_incorrect_change_rate.0.2",
):
    show_trends(df_checkup_pm_sg, "age_visit.0", outcome_col, 1, "pm_sg")
    show_trends(df_checkup_pm_sg, "time_difference.0.2", outcome_col, 30, "pm_sg")
    show_trends(df_checkup_pm_sg, "education_level.0", outcome_col, 1, "pm_sg", True)

In [None]:
a = 3650
b = 3850
df_checkup_nm_sg_a_b = df_checkup_nm_sg[df_checkup_nm_sg["time_difference.0.2"].ge(3500) &
                                                  df_checkup_nm_sg["time_difference.0.2"].le(3750)]
df_checkup_nm_sg_a_b["time_difference.0.2"].plot(kind='hist', title='Density time_difference.0.2')


In [None]:
print(f"a={a}")
print(f"b={b}")
show_trends(df_checkup_nm_sg_a_b, "age_visit.0","numeric_memory_change.0.2", 1, "nm_sg_a_b")

In [None]:
print('****')
print("Change & Speed pm_1 sg visits 0 1")
show_trends(df_01_checkup_pm_1_sg,"age_visit.0", "pairs_matching_change_round_1.0.1", 1, "pm_sg")
show_trends(df_01_checkup_pm_1_sg, "time_difference.0.1", "pairs_matching_change_round_1.0.1", 30, "pm_sg")
show_trends(df_01_checkup_pm_1_sg, "education_level.0", "pairs_matching_change_round_1.0.1", 1, "pm_sg", True)

show_trends(df_01_checkup_pm_1_sg,"age_visit.0", "pairs_matching_change_rate_round_1.0.1", 1, "pm_sg")
show_trends(df_01_checkup_pm_1_sg, "time_difference.0.1", "pairs_matching_change_rate_round_1.0.1", 30, "pm_sg")
show_trends(df_01_checkup_pm_1_sg, "education_level.0", "pairs_matching_change_rate_round_1.0.1", 1, "pm_sg", True)

In [None]:
print('****')
print("Change & Speed pm sg visits 0 1")
show_trends(df_01_checkup_pm_sg,"age_visit.0", "pairs_matching_sum_incorrect_change.0.1", 1, "pm_sg")
show_trends(df_01_checkup_pm_sg, "time_difference.0.1", "pairs_matching_sum_incorrect_change.0.1", 30, "pm_sg")
show_trends(df_01_checkup_pm_sg, "education_level.0", "pairs_matching_sum_incorrect_change.0.1", 1, "pm_sg", True)

show_trends(df_01_checkup_pm_sg,"age_visit.0", "pairs_matching_sum_incorrect_change_rate.0.1", 1, "pm_sg")
show_trends(df_01_checkup_pm_sg, "time_difference.0.1", "pairs_matching_sum_incorrect_change_rate.0.1", 30, "pm_sg")
show_trends(df_01_checkup_pm_sg, "education_level.0", "pairs_matching_sum_incorrect_change_rate.0.1", 1, "pm_sg", True)

In [None]:
from sklearn.linear_model import LassoCV
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import statsmodels.api as sm

In [None]:
def model_builder(df_source, predictors, outcome_var, run_lasso = True):
    """Fit an OLS model of ``outcome_var`` on standardised predictors,
    optionally pre-selecting the predictors with a cross-validated lasso.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Frame containing every column in ``predictors`` and ``outcome_var``.
    predictors : list of str
        Candidate predictor columns.
    outcome_var : str
        Outcome column.
    run_lasso : bool, default True
        True: fit ``StandardScaler`` + ``LassoCV`` first and keep only the
        predictors with a non-zero lasso coefficient. If the lasso zeroes
        every coefficient, all predictors are kept.
        False: use all predictors directly.

    Returns
    -------
    None
        The lasso diagnostics and the OLS summary are printed.
    """
    print(f"outcome: {outcome_var}")
    print(df_source.columns)
    X = df_source.loc[:, predictors]
    y = df_source[outcome_var]

    if run_lasso:
        scaler = StandardScaler()
        # NOTE(review): an earlier comment called this 10-fold cross-validation,
        # but the code asks for 100 folds; confirm which one is intended.
        lasso_cv = LassoCV(cv=100, random_state=42, max_iter=10000)
        pipeline = make_pipeline(scaler, lasso_cv)
        pipeline.fit(X, y)
        print("Predictors:")
        print(predictors)

        # Optimal alpha value
        print(f"Optimal alpha: {lasso_cv.alpha_}")

        # Coefficients
        print(f"Coefficients: {lasso_cv.coef_}")

        # Intercept
        print(f"Intercept: {lasso_cv.intercept_}")
        print("Linear mode builder")

        # Keep the predictors the lasso did not shrink to exactly zero.
        corrected_predictors = [
            name for name, coef in zip(predictors, lasso_cv.coef_) if coef != 0
        ]

        if len(corrected_predictors) == 0:
            print("All teh coefficients are zeros, I ignore lasso outcome")
            corrected_predictors = predictors
    else:
        corrected_predictors = predictors

    # Work on a copy so that standardising does not write into a slice of
    # df_source (avoids SettingWithCopyWarning and accidental mutation).
    X_corr = df_source.loc[:, corrected_predictors].copy()
    # z-score every predictor (sample standard deviation, as before)
    X_corr = (X_corr - X_corr.mean()) / X_corr.std()

    X_corr = sm.add_constant(X_corr)
    model = sm.OLS(y, X_corr).fit()
    print(model.summary())

In [None]:

def _with_log_predictors(cohort):
    """Return a copy of ``cohort`` with the log-transformed predictor columns.

    ``log_age.0`` is the natural log of ``age_visit.0``, and
    ``log_time_difference.0.2`` the natural log of ``time_difference.0.2``.
    The latter is listed in ``predictors`` below but was never created, so
    every model_builder call failed with a KeyError.
    """
    return cohort.assign(**{
        "log_age.0": np.log(cohort["age_visit.0"]),
        "log_time_difference.0.2": np.log(cohort["time_difference.0.2"]),
    })


# assign() returns new frames, so nothing is written into slices of df
# (the previous column assignments triggered SettingWithCopyWarning).
df_checkup_nm_sg_pm_1_edu = _with_log_predictors(df_checkup_nm_sg_pm_1_edu)
df_checkup_nm_sg_pm_edu = _with_log_predictors(df_checkup_nm_sg_pm_edu)

df_checkup_pm_1_sg_edu = _with_log_predictors(df_checkup_pm_1_sg_edu)
df_checkup_pm_sg_edu = _with_log_predictors(df_checkup_pm_sg_edu)

df_checkup_nm_sg_edu = _with_log_predictors(df_checkup_nm_sg_edu)

predictors=["log_time_difference.0.2","education_level.0", "age_visit.0", "log_age.0"]


In [None]:

model_builder(df_checkup_nm_sg_pm_1_edu, predictors, "numeric_memory_change.0.2")
model_builder(df_checkup_nm_sg_pm_1_edu, predictors, "pairs_matching_change_round_1.0.2")
model_builder(df_checkup_nm_sg_pm_1_edu, predictors, "snap_game_true_pos_rt_avrg_change.0.2")

model_builder(df_checkup_nm_sg_pm_1_edu, predictors, "numeric_memory_change_rate.0.2")
model_builder(df_checkup_nm_sg_pm_1_edu, predictors, "pairs_matching_change_rate_round_1.0.2")
model_builder(df_checkup_nm_sg_pm_1_edu, predictors, "snap_game_true_pos_rt_avrg_change_rate.0.2")


In [None]:

model_builder(df_checkup_nm_sg_pm_edu, predictors, "numeric_memory_change.0.2")
model_builder(df_checkup_nm_sg_pm_edu, predictors, "pairs_matching_sum_incorrect_change.0.2")
model_builder(df_checkup_nm_sg_pm_edu, predictors, "snap_game_true_pos_rt_avrg_change.0.2")

model_builder(df_checkup_nm_sg_pm_edu, predictors, "numeric_memory_change_rate.0.2")
model_builder(df_checkup_nm_sg_pm_edu, predictors, "pairs_matching_sum_incorrect_change_rate.0.2")
model_builder(df_checkup_nm_sg_pm_edu, predictors, "snap_game_true_pos_rt_avrg_change_rate.0.2")


In [None]:

# Pairs matching round 1 + snap game cohort: changes, then change rates.
model_builder(df_checkup_pm_1_sg_edu, predictors, "pairs_matching_change_round_1.0.2")
model_builder(df_checkup_pm_1_sg_edu, predictors, "snap_game_true_pos_rt_avrg_change.0.2")


# The rate column is "pairs_matching_change_rate_round_1.0.2"; the
# "..._change_speed_round_1.0.2" name used before is never created.
model_builder(df_checkup_pm_1_sg_edu, predictors, "pairs_matching_change_rate_round_1.0.2")
model_builder(df_checkup_pm_1_sg_edu, predictors, "snap_game_true_pos_rt_avrg_change_rate.0.2")

In [None]:

model_builder(df_checkup_pm_sg_edu, predictors, "pairs_matching_sum_incorrect_change.0.2")
model_builder(df_checkup_pm_sg_edu, predictors, "snap_game_true_pos_rt_avrg_change.0.2")

model_builder(df_checkup_pm_sg_edu, predictors, "pairs_matching_sum_incorrect_change_rate.0.2")
model_builder(df_checkup_pm_sg_edu, predictors, "snap_game_true_pos_rt_avrg_change_rate.0.2")

In [None]:
model_builder(df_checkup_nm_sg_edu, predictors, "numeric_memory_change.0.2")
model_builder(df_checkup_nm_sg_edu, predictors, "snap_game_true_pos_rt_avrg_change.0.2")

model_builder(df_checkup_nm_sg_edu, predictors, "numeric_memory_change_rate.0.2")
model_builder(df_checkup_nm_sg_edu, predictors, "snap_game_true_pos_rt_avrg_change_rate.0.2")

In [None]:
# Follow-up time (visit 0 -> 2) as a function of baseline age, per cohort.
for cohort, cohort_label in (
    (df_checkup_nm_sg_pm_1_edu, "nm_sg_pm_1_edu"),
    (df_checkup_nm_sg_pm_edu, "nm_sg_pm_edu"),
    (df_checkup_pm_1_sg_edu, "pm_1_sg_edu"),
    (df_checkup_pm_sg_edu, "pm_sg_edu"),
    (df_checkup_nm_sg_edu, "nm_sg_edu"),
):
    show_trends(cohort, "age_visit.0", "time_difference.0.2", 1, cohort_label)