In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
## Tabular Data
data_dir = "/home/nikhil/projects/brain_changes/data/ukbb/"

tab_csv = f"{data_dir}tabular/ukb46307.csv"

tab_follow_up_csv = f"{data_dir}tabular/tab_follow_up.csv"

follow_up_subjects = "./ukbb_brain-age_ses-3_subjects.txt"

covid_subjects = f"{data_dir}tabular/covid19_result_england.txt"

## Covid data

In [3]:
ukbb_covid_df = pd.read_csv(covid_subjects,delimiter="\t")
ukbb_covid_df["specdate"] = pd.to_datetime(ukbb_covid_df["specdate"], yearfirst=True)
ukbb_covid_df.head()

Unnamed: 0,eid,specdate,spectype,laboratory,origin,result,acute,hosaq,reqorg
0,1000032,2020-12-22,4,151,0,0,0,-1,2
1,1000044,2021-02-22,2,40,1,0,1,-1,1
2,1000060,2020-11-21,4,66,0,0,0,-1,6
3,1000150,2020-10-17,4,14,1,0,1,-1,2
4,1000150,2021-01-15,4,5,1,0,1,-1,2


In [4]:

subjects_tested = ukbb_covid_df["eid"].unique()
subjects_positive = ukbb_covid_df[ukbb_covid_df["result"]==1]["eid"].unique()
n_positive_tests = len(ukbb_covid_df[ukbb_covid_df["result"]==1])

print(f"number of positive tests: {n_positive_tests} out of {len(ukbb_covid_df)} covid tests")
print(f"number of positive subjects: {len(subjects_positive)} out of {len(subjects_tested)} tested subjects")

number of positive tests: 23614 out of 237809 covid tests
number of positive subjects: 18005 out of 104194 tested subjects


## Tabular data

In [5]:
# ukbb_tab_df = pd.read_csv(tab_csv, index_col=["eid"], usecols=["eid","41001-2.0"])

## Follow-up data

In [9]:
ukbb_tab_followup_df = pd.read_csv(tab_follow_up_csv)
print(f"number of subjects with follow-up scans: {len(ukbb_tab_followup_df)}")
ukbb_tab_followup_df["imaging date-ses2"] = pd.to_datetime(ukbb_tab_followup_df["imaging date-ses2"], yearfirst=True)
ukbb_tab_followup_df["imaging date-ses3"] = pd.to_datetime(ukbb_tab_followup_df["imaging date-ses3"], yearfirst=True)
ukbb_tab_followup_df.head()

number of subjects with follow-up scans: 3202


Unnamed: 0,eid,sex,birth_year,imaging date-ses2,imaging date-ses3,T1-ses2,T1-ses3,ethnicity,age_at_ses2,age_at_ses3,age_at_recruitment
0,1000635,1.0,1950.0,2017-12-21,2020-03-09,20252_2_0,20252_3_0,1001.0,67.0,69.0,58.0
1,1004084,1.0,1947.0,2017-05-23,2019-10-23,20252_2_0,20252_3_0,1001.0,70.0,72.0,60.0
2,1008391,1.0,1955.0,2017-10-13,2021-02-17,20252_2_0,20252_3_0,1001.0,62.0,65.0,54.0
3,1010063,0.0,1964.0,2017-07-05,2019-10-29,20252_2_0,20252_3_0,1001.0,53.0,55.0,45.0
4,1010129,0.0,1947.0,2017-08-16,2019-11-17,20252_2_0,20252_3_0,1001.0,69.0,71.0,60.0


## Common subject between covid study and ukbb with follow-up scan

In [10]:
covid_plus_followup_scan_subjects = set(subjects_tested) & set(ukbb_tab_followup_df["eid"])
print(f"number of covid study subjects with follow-up scan: {len(covid_plus_followup_scan_subjects)}")

number of covid study subjects with follow-up scan: 661


In [28]:
ukbb_covid_follow_up_df = pd.merge(ukbb_covid_df,ukbb_tab_followup_df, on="eid", how="inner")

print(ukbb_covid_follow_up_df.shape)

subjects_with_scans_after_covid_df = ukbb_covid_follow_up_df[ukbb_covid_follow_up_df["imaging date-ses3"] > ukbb_covid_follow_up_df["specdate"]]
subjects_with_scans_after_covid = subjects_with_scans_after_covid["eid"].unique()
print(f"number of subjects with scan after covid: {len(subjects_with_scans_after_covid)}")

(1131, 19)
number of subjects with scan after covid: 73


In [30]:
subjects_with_scans_after_covid_df

Unnamed: 0,eid,specdate,spectype,laboratory,origin,result,acute,hosaq,reqorg,sex,birth_year,imaging date-ses2,imaging date-ses3,T1-ses2,T1-ses3,ethnicity,age_at_ses2,age_at_ses3,age_at_recruitment
0,1008391,2021-01-07,31,128,0,0,0,-1,2,1.0,1955.0,2017-10-13,2021-02-17,20252_2_0,20252_3_0,1001.0,62.0,65.0,54.0
12,1047541,2020-10-29,4,89,0,1,0,-1,6,1.0,1956.0,2016-10-07,2021-02-04,20252_2_0,20252_3_0,1001.0,60.0,64.0,53.0
54,1259679,2021-01-13,4,89,0,1,0,-1,6,1.0,1961.0,2014-07-18,2021-02-17,20252_2_0,20252_3_0,1001.0,52.0,59.0,45.0
58,1272589,2020-01-11,4,89,0,1,0,-1,6,0.0,1966.0,2015-03-08,2021-02-07,20252_2_0,20252_3_0,1001.0,48.0,54.0,40.0
119,1616426,2020-12-13,4,89,0,1,0,-1,6,0.0,1959.0,2016-10-08,2021-02-21,20252_2_0,20252_3_0,1001.0,56.0,61.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,5610282,2020-10-15,2,68,1,1,1,-1,1,1.0,1940.0,2016-08-31,2021-03-04,20252_2_0,20252_3_0,1001.0,75.0,80.0,69.0
1033,5610282,2020-04-11,2,68,1,1,1,-1,2,1.0,1940.0,2016-08-31,2021-03-04,20252_2_0,20252_3_0,1001.0,75.0,80.0,69.0
1034,5610282,2020-10-29,2,68,1,1,1,-1,2,1.0,1940.0,2016-08-31,2021-03-04,20252_2_0,20252_3_0,1001.0,75.0,80.0,69.0
1035,5610282,2020-10-29,2,68,1,1,1,-1,1,1.0,1940.0,2016-08-31,2021-03-04,20252_2_0,20252_3_0,1001.0,75.0,80.0,69.0


In [31]:
a = ukbb_covid_follow_up_df["imaging date-ses3"] > "2020-04-01"
np.sum(a)

140

In [33]:
subjects_with_scans_after_covid_df.drop_duplicates(["eid"])["result"].value_counts()

1    59
0    14
Name: result, dtype: int64