In [None]:
import pandas as pd
import numpy as np
import os
import glob
import sys

In [None]:
# Path of the merged phenotype table (tab-separated) that the next cell loads.
data_f = "/u/scratch/b/blhill/UKBB_phenotypes_merged.tsv"

In [None]:
# Load the merged phenotype table: tab-separated, first line holds the column names.
df = pd.read_csv(
    data_f,
    sep="\t",
    header=0,
)
# Report (rows, columns) as a quick sanity check on the load.
print(df.shape)

In [None]:
# Keep only the columns taken at the initial assessment (name contains "-0",
# matched anywhere in the name), and append the participant ID column at the end.
df = df[
    list(df.columns.values[df.columns.str.contains("-0")])
    + ["Encoded anonymised participant ID_eid"]
]

In [None]:
# Mapping from raw column name (as it appears in df) to the short feature name
# used in the output. Keys starting with "derived_" refer to columns that this
# notebook computes itself and map to themselves. Entries that are commented
# out are deliberately excluded from the mapping.
columns_of_interest = {
    "Age completed full time education_845-0.0": "age_education",
    "Age when attended assessment centre_21003-0.0": "age",
    "Average total household income before tax_738-0.0": "average_income",
    "Body fat percentage_23099-0.0": "body_fat",
    "Body mass index (BMI)_21001-0.0": "bmi",
    #"Current tobacco smoking_1239-0.0": "current_tobacco_smoking",
    "Diabetes diagnosed by doctor_2443-0.0": "diabetes",
    "Diastolic blood pressure, automated reading_4079-0.0": "diastolic_blood_pressure",
    "Encoded anonymised participant ID_eid": "EID",
    "Ethnic background_21000-0.0": "ethnicity",
    "Frequency of drinking alcohol_20414-0.0": "frequency_alcohol",
    "Alcohol intake frequency._1558-0.0": "alcohol_frequency",
    "Genetic sex_22001-0.0": "genetic_sex",
    "Genotype measurement batch_22000-0.0": "genotype_batch",
    "Genotype measurement plate_22007-0.0": "genotype_plate",
    "Genotype measurement well_22008-0.0": "genotype_well",
    #"derived_job_class": "derived_job_class",
    "derived_kinship": "derived_kinship",
    "derived_medication_cholesterol_blood_pressure_diabetes": "derived_medication_cholesterol_blood_pressure_diabetes",
    "Outliers for heterozygosity or missing rate_22027-0.0": "outliers_heterozygosity_missing_rate",
    "Pulse rate, automated reading_102-0.0": "pulse_rate",
    "derived_socioeconomic_status": "derived_socioeconomic_status",
    "Sex chromosome aneuploidy_22019-0.0": "sex_chromosome_aneuploidy",
    "Sex_31-0.0": "sex",
    "Smoking status_20116-0.0": "smoking_status",
    "Standing height_50-0.0": "height",
    "Systolic blood pressure, automated reading_4080-0.0": "systolic_blood_pressure",
    "Townsend deprivation index at recruitment_189-0.0": "townsend",
    "UK Biobank assessment centre_54-0.0": "assessment_centre",
    "derived_chd": "derived_chd",
    "Weight_21002-0.0": "weight"
}

# Give every genetic principal-component column a short "PC..." name; the
# suffix is the piece of the column name between its first and second ".".
for pc_col in df.columns.values[df.columns.str.startswith("Genetic principal components")]:
    columns_of_interest[pc_col] = "PC" + pc_col.split(".")[1]
    
# Register one "derived_..." feature (mapped to itself) for every distinct
# answer found in the first "Vascular/heart problems" column.
# Missing answers are dropped first: unique() would otherwise report NaN as a
# value and register a meaningless "derived_nan" entry.
for outcome in (
    df[df.columns.values[df.columns.str.startswith("Vascular/heart problems diagnosed by doctor_6150-0")]]
    .iloc[:, 0]
    .dropna()
    .unique()
):
    derived_name = "derived_{}".format(outcome)
    columns_of_interest[derived_name] = derived_name
    
# Show the complete raw-name / feature-name mapping for inspection.
for raw_name, feature_name in columns_of_interest.items():
    print(raw_name, feature_name)

# Genetic kinship

In [None]:
def get_kinship(row):
    """Encode one genetic-kinship answer as 0 / 1 / None.

    Parameters
    ----------
    row : scalar
        A single cell value of a kinship column (despite the name, this is
        one value, not a DataFrame row).

    Returns
    -------
    int or None
        0 for "No kinship found"; None when the value is missing (NaN/None) or
        the participant was excluded from kinship inference; 1 for any other
        answer.
    """
    # A missing value carries no kinship information. Without this guard it
    # would fall through to the final branch and be reported as kinship (1).
    if pd.isna(row):
        return None
    if row in ("Participant excluded from kinship inference process",):
        return None
    if row in ("No kinship found",):
        return 0
    return 1
    
# All kinship columns, selected by their shared name prefix.
cols_to_check = df.columns.values[
    df.columns.str.startswith("Genetic kinship to other participants_22021-0")
]

# Replace the raw answers by their numeric encoding, column by column.
# Not idempotent: the raw strings are overwritten, so running this cell a
# second time would re-encode the numeric codes instead of the answers.
for kinship_col in cols_to_check:
    df[kinship_col] = df[kinship_col].apply(get_kinship)

# True when at least one kinship column is truthy; any() skips missing values
# by default, so rows holding only missing values come out as False.
df["derived_kinship"] = df[cols_to_check].any(axis=1)

# Heart problems

In [None]:
# All "Vascular/heart problems diagnosed by doctor" columns, selected by their
# shared name prefix. This must be evaluated before the loop below adds its
# indicator columns, whose names start with the same prefix.
cols_to_check = df.columns.values[df.columns.str.startswith("Vascular/heart problems diagnosed by doctor_6150-0")]

def get_heart_probs(row, val=None):
    """Encode one heart-problem answer as an indicator for the diagnosis `val`.

    Parameters
    ----------
    row : scalar
        A single cell value of a heart-problem column.
    val : scalar, optional
        The answer to flag.

    Returns
    -------
    int or None
        1 when `row` matches `val` (this check comes first, so it also wins
        when `val` itself is "Prefer not to answer"); None for
        "Prefer not to answer"; 0 for anything else.
    """
    # Membership tests (identity first, then equality) are used on purpose so
    # that matching behaves exactly like a lookup in a one-element container.
    if row in (val,):
        return 1
    if row in ("Prefer not to answer",):
        return None
    return 0

# For every distinct answer found in the first heart-problem column, build one
# 0/1/None indicator per answer column and collapse them into a single
# "derived_..." flag.
# Missing answers are dropped from the outcome list: unique() reports NaN as a
# value, which would otherwise create meaningless "..._nan" indicator columns
# and a "derived_nan" flag.
for outcome in df[cols_to_check].iloc[:, 0].dropna().unique():
    print(outcome)
    all_cols = []
    for c in cols_to_check:
        indicator_col = "{}_{}".format(c, outcome)
        df[indicator_col] = df[c].apply(get_heart_probs, val=outcome)
        # Number of participants flagged for this outcome in this answer column.
        print(c, df[indicator_col].sum())
        all_cols.append(indicator_col)
    # True when any answer column reported this outcome; any() skips missing
    # values by default.
    df["derived_{}".format(outcome)] = df[all_cols].any(axis="columns")

# Blood pressure medication

In [None]:
# dichotomous variable: 1 if the answer is "Blood pressure medication", else 0
def get_blood_pressure_medication(row):
    """Encode one medication answer as 0 / 1 / None.

    Parameters
    ----------
    row : scalar
        A single cell value of a medication column.

    Returns
    -------
    int or None
        1 for "Blood pressure medication"; None for "Prefer not to answer";
        0 for every other value (including missing values).
    """
    if row in ("Prefer not to answer",):
        return None
    return 1 if row in ("Blood pressure medication",) else 0
    
# All medication columns, selected by their shared name prefix.
cols_to_check = df.columns.values[
    df.columns.str.startswith("Medication for cholesterol, blood pressure or diabetes_6177-0")
]

# Replace the raw answers by their numeric encoding, column by column.
# Not idempotent: the raw strings are overwritten, so running this cell a
# second time would re-encode the numeric codes instead of the answers.
for medication_col in cols_to_check:
    df[medication_col] = df[medication_col].apply(get_blood_pressure_medication)

# True when at least one medication column is flagged; any() skips missing
# values by default, so rows holding only missing values come out as False.
df["derived_medication_cholesterol_blood_pressure_diabetes"] = df[cols_to_check].any(axis=1)

## Degree level variable

In [None]:
# from BMJ paper: dichotomous variable, 1 if degree level == college or professional, else 0
def get_socioeconomic_status(row):
    """Encode one qualification answer as 0 / 1 / None.

    Parameters
    ----------
    row : scalar
        A single cell value of a qualifications column.

    Returns
    -------
    int or None
        1 for a college/university degree or another professional
        qualification; None for "Prefer not to answer"; 0 for every other
        value (including missing values).
    """
    if row in ("Prefer not to answer",):
        return None
    degree_answers = (
        "College or University degree",
        "Other professional qualifications eg: nursing, teaching",
    )
    return 1 if row in degree_answers else 0
    
# All qualification columns, selected by their shared name prefix.
cols_to_check = df.columns.values[
    df.columns.str.startswith("Qualifications")
]

# Replace the raw answers by their numeric encoding, column by column.
# Not idempotent: the raw strings are overwritten, so running this cell a
# second time would re-encode the numeric codes instead of the answers.
for qualification_col in cols_to_check:
    df[qualification_col] = df[qualification_col].apply(get_socioeconomic_status)

# True when at least one qualification column is flagged; any() skips missing
# values by default, so rows holding only missing values come out as False.
df["derived_socioeconomic_status"] = df[cols_to_check].any(axis=1)

In [None]:
# derived_chd is True when the heart-attack flag or the angina flag is True.
df["derived_chd"] = df[
    ["derived_Heart attack", "derived_Angina"]
].any(axis=1)

In [None]:
# NOTE: the column filter below is disabled on purpose so that all
# (continuous) traits are kept in the output.

# Disabled filter (would keep only the columns listed in columns_of_interest):
# df_filtered = df[columns_of_interest.keys()]
# df_filtered is bound to the very same DataFrame object as df (no copy is
# made), so in-place changes through either name are visible through both.
df_filtered = df

In [None]:
# Apply the short feature names. Columns without an entry in
# columns_of_interest keep their raw name. inplace=True mutates the frame
# object itself, so every name bound to that object sees the renamed columns.
df_filtered.rename(columns=columns_of_interest, inplace=True)

In [None]:
# Preview the first rows to check the renamed columns.
df_filtered.head()

In [None]:
# Load the PRS values that are merged into the feature table in the next cell.
# Both files are tab-separated without a header line; the column names are
# assigned explicitly (the assignment fails if a file does not have exactly
# two columns).
split1_prs_df = pd.read_csv(
    "/u/home/n/nlapier2/project-ukbiobank/data/mr_ukb_split/prs/jama_bmi_prs_split1.txt",
    sep="\t",
    header=None,
)
split1_prs_df.columns = ["EID", "PRS_split1"]

split2_prs_df = pd.read_csv(
    "/u/home/n/nlapier2/project-ukbiobank/data/mr_ukb_split/prs/jama_bmi_prs_split2.txt",
    sep="\t",
    header=None,
)
split2_prs_df.columns = ["EID", "PRS_split2"]

In [None]:
# Attach both PRS columns by participant ID. Left merges keep every row of the
# feature table; participants without a PRS get a missing value.
df_filtered = (
    df_filtered
    .merge(split1_prs_df, how="left", on="EID")
    .merge(split2_prs_df, how="left", on="EID")
)

In [None]:
# Sanity check after the merges: print (rows, columns), then preview the first
# rows including the new PRS columns.
print(df_filtered.shape)
df_filtered.head()

In [None]:
# Write the final feature table as a tab-separated file with a header row and
# without the index column.
df_filtered.to_csv(
    "/u/scratch/b/blhill/UKBB_features.tsv",
    sep="\t",
    header=True,
    index=False,
)