In [290]:
import sys
# Directory appended to sys.path below (presumably so project-local modules under it can be imported — none are imported in this cell; confirm it is still needed).
path = '/gpfs/commons/groups/gursoy_lab/mstoll/'
# Base directory used to build the input paths and most of the output paths in the cells below.
DATA_DIR = '/gpfs/commons/datasets/controlled/ukbb-gursoylab'
sys.path.append(path)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Load the inputs: the OMOP concept dictionary and concept-ancestor hierarchy
# (both tab-separated) and the filtered condition records.
concepts = pd.read_csv(f'{DATA_DIR}/omop/omop_concept.csv', sep='\t')
hierarchy = pd.read_csv(f'{DATA_DIR}/omop/omop_concept_ancestor.csv', sep='\t')
conditions = pd.read_csv(f'{DATA_DIR}/omop_processed/ukbb_omop_filtered.csv')

# Keep only concepts whose standard_concept flag is 'S' or 'C'.
is_standard_concept = concepts['standard_concept'].isin(['S', 'C'])
concepts = concepts[is_standard_concept]

In [147]:
# Restrict the hierarchy to rows that touch at least one concept observed in the
# condition records, either as the ancestor or as the descendant.
concepts_inc = conditions['concept_id'].unique()
ancestor_observed = hierarchy['ancestor_concept_id'].isin(concepts_inc)
descendant_observed = hierarchy['descendant_concept_id'].isin(concepts_inc)
hierarchy_inc = hierarchy[ancestor_observed | descendant_observed]

# Two-column id/name lookup used by the merges in later cells.
concept_ids = concepts[['concept_id', 'concept_name']]

In [148]:
# Identify the parent code.
# Step 1: ancestors that appear in a row with the largest max_levels_of_separation.
max_sep = hierarchy_inc['max_levels_of_separation'].max()
at_max_separation = hierarchy_inc['max_levels_of_separation'] == max_sep
parent_codes = hierarchy_inc.loc[at_max_separation, 'ancestor_concept_id'].unique()

# Step 2: take the ancestor of the first hierarchy row whose descendant is one of
# those candidates (row order of hierarchy_inc decides which one is picked).
descendant_is_candidate = hierarchy_inc['descendant_concept_id'].isin(parent_codes)
parent_code = hierarchy_inc.loc[descendant_is_candidate, 'ancestor_concept_id'].iloc[0]

In [156]:
# Collect descendants of the parent code, selected by their minimum distance from it.
roll_up_depth = 4
below_parent = hierarchy_inc['ancestor_concept_id'] == parent_code
min_levels = hierarchy_inc['min_levels_of_separation']

# Every descendant at most roll_up_depth levels below the parent.
within_depth = below_parent & (min_levels <= roll_up_depth)
roll_up_codes = hierarchy_inc.loc[within_depth, 'descendant_concept_id'].unique()

# Descendants at level 3 or 4 (note: despite the "exact_4" in the name, level 3 is included).
at_level_3_or_4 = below_parent & min_levels.isin([3, 4])
roll_up_codes_exact_4 = hierarchy_inc.loc[at_level_3_or_4, 'descendant_concept_id'].unique()

In [157]:
#Identify codes in the level-3/4 set that are themselves descendants of another code in that set
#and remove them (these are more specific codes that should be rolled up into their ancestor).
ancestor_in_set = hierarchy_inc['ancestor_concept_id'].isin(roll_up_codes_exact_4)
descendant_in_set = hierarchy_inc['descendant_concept_id'].isin(roll_up_codes_exact_4)
# The comparison is evaluated on its own: `&` binds tighter than `>=`, so writing
# `a & b & col >= 1` compares the AND of all three operands against 1 instead of
# filtering on the separation level. Level >= 1 excludes the level-0 rows.
strictly_below = hierarchy_inc['min_levels_of_separation'] >= 1
self_roll_up = hierarchy_inc[ancestor_in_set & descendant_in_set & strictly_below]

# Attach names for the ancestor (suffix _x) and the descendant (suffix _y).
self_roll_up = self_roll_up.merge(
    concept_ids, left_on = 'ancestor_concept_id', right_on = 'concept_id').merge(
        concept_ids, left_on = 'descendant_concept_id', right_on = 'concept_id')

# Names that only ever appear on the descendant side are the ones to drop;
# translate them back to concept ids and remove those ids from the roll-up targets.
self_roll_up_names = list(set(self_roll_up['concept_name_y'].unique()) - set(self_roll_up['concept_name_x'].unique()))
self_roll_up_codes = concept_ids['concept_id'][concept_ids['concept_name'].isin(self_roll_up_names)].unique()
roll_up_codes = list(set(roll_up_codes) - set(self_roll_up_codes))
len(roll_up_codes)

In [162]:
# Write out the names of the phenotypes being taken forward (one 'Concept Name' column).
is_roll_up_target = concepts['concept_id'].isin(roll_up_codes)
concept_names = pd.DataFrame(
    {'Concept Name': concepts.loc[is_roll_up_target, 'concept_name'].unique()}
)
concept_names.to_csv('/gpfs/commons/groups/gursoy_lab/aelhussein/phewas/PheWas/cohort/concept_names.csv', index=False)

In [169]:
# Remove the parent-candidate codes from the set of roll-up targets.
parent_code_set = set(parent_codes)
remaining_codes = set(roll_up_codes) - parent_code_set
roll_up_codes = list(remaining_codes)

2554

In [176]:
# Build both directions of the roll-up mapping from the hierarchy rows whose
# ancestor is a roll-up target:
#   mapping_dict_anc_des: target code -> list of codes rolled up into it
#   mapping_dict_des_ans: code        -> list of target codes it rolls up to
is_target_ancestor = hierarchy_inc['ancestor_concept_id'].isin(roll_up_codes)
hierarchy_mapping = hierarchy_inc[is_target_ancestor]

descendants_by_ancestor = hierarchy_mapping.groupby('ancestor_concept_id')['descendant_concept_id'].unique()
ancestors_by_descendant = hierarchy_mapping.groupby('descendant_concept_id')['ancestor_concept_id'].unique()

mapping_dict_anc_des = {anc: list(des) for anc, des in descendants_by_ancestor.items()}
mapping_dict_des_ans = {des: list(anc) for des, anc in ancestors_by_descendant.items()}

In [188]:
#Blow up the conditions table with all codes that it can be rolled up to (this is different to the
#previous approach but makes sense as a condition id can represent different diseases).
# Vectorised replacement for a row-by-row iterrows() loop: each record's concept_id is mapped
# to its list of roll-up targets, records without a target are dropped, and the lists are
# exploded into one row per (record, target) in the original record order.
columns = ['eid', 'condition_occurrence_count', 'concept_id']
conditions_rolled_up = conditions[['eid', 'condition_occurrence_count']].copy()
# Series.map with a dict yields NaN for concept ids that have no roll-up target.
conditions_rolled_up['concept_id'] = conditions['concept_id'].map(mapping_dict_des_ans)
conditions_rolled_up = (
    conditions_rolled_up
    .dropna(subset=['concept_id'])
    .explode('concept_id')
    # explode leaves an object column; restore the id dtype so the merge below matches.
    .astype({'concept_id': conditions['concept_id'].dtype})
    .reset_index(drop=True)
)[columns]
# Attach the name of each roll-up target and persist the long-format table.
conditions_rolled_up = conditions_rolled_up.merge(concept_ids, on = 'concept_id')
conditions_rolled_up.to_csv(f'{DATA_DIR}/aelhussein/phewas/data/conditions_rolled_up.csv', index = False)

In [258]:
# Reload the rolled-up long-format table written by the cell above.
conditions_rolled_up = pd.read_csv(
    f'{DATA_DIR}/aelhussein/phewas/data/conditions_rolled_up.csv'
)

In [259]:
# Wide table: one row per eid, one column per rolled-up concept_id, cells hold the
# summed condition_occurrence_count (0 where the participant has no such record).
condtions_pivot_table = pd.pivot_table(
    conditions_rolled_up,
    index='eid',
    columns='concept_id',
    values='condition_occurrence_count',
    aggfunc='sum',
    fill_value=0,
)

In [260]:
# Persist the wide table so later cells can reload it without recomputing.
condtions_pivot_table.to_pickle(
    f'{DATA_DIR}/aelhussein/phewas/data/conditions_rolled_up_pivot.pkl'
)

## Second filtering of the dataset

In [229]:
# Do a further refinement: flag phenotypes with fewer than min_condition_records rows
# in the rolled-up table so they can be dropped from the roll-up targets.
# NOTE(review): the threshold in code is 750; confirm that this (and not 500) is the intended cutoff.
min_condition_records = 750
conditions_value_counts = conditions_rolled_up['concept_name'].value_counts()
conds_filter = conditions_value_counts[conditions_value_counts < min_condition_records].index
# Translate the flagged names back to concept ids.
conds_id_filter = concept_ids['concept_id'][concept_ids['concept_name'].isin(conds_filter)].unique()

In [260]:
# Rebuild the roll-up targets (all rolled-up concept ids minus the rare ones) and
# regenerate both directions of the mapping from them.
rare_code_set = set(conds_id_filter)
roll_up_codes = list(set(conditions_rolled_up['concept_id']) - rare_code_set)

is_target_ancestor = hierarchy_inc['ancestor_concept_id'].isin(roll_up_codes)
hierarchy_mapping = hierarchy_inc[is_target_ancestor]

descendants_by_ancestor = hierarchy_mapping.groupby('ancestor_concept_id')['descendant_concept_id'].unique()
ancestors_by_descendant = hierarchy_mapping.groupby('descendant_concept_id')['ancestor_concept_id'].unique()

mapping_dict_anc_des = {anc: list(des) for anc, des in descendants_by_ancestor.items()}
mapping_dict_des_ans = {des: list(anc) for des, anc in ancestors_by_descendant.items()}

In [262]:
#Blow up the conditions table with all codes that it can be rolled up to (this is different to the
#previous approach but makes sense as a condition id can represent different diseases).
# Vectorised replacement for a row-by-row iterrows() loop: each record's concept_id is mapped
# to its list of roll-up targets, records without a target are dropped, and the lists are
# exploded into one row per (record, target) in the original record order.
columns = ['eid', 'condition_occurrence_count', 'concept_id']
conditions_rolled_up = conditions[['eid', 'condition_occurrence_count']].copy()
# Series.map with a dict yields NaN for concept ids that have no roll-up target.
conditions_rolled_up['concept_id'] = conditions['concept_id'].map(mapping_dict_des_ans)
conditions_rolled_up = (
    conditions_rolled_up
    .dropna(subset=['concept_id'])
    .explode('concept_id')
    # explode leaves an object column; restore the id dtype so the merge below matches.
    .astype({'concept_id': conditions['concept_id'].dtype})
    .reset_index(drop=True)
)[columns]
# Attach target names, drop the 'Disorder of body system' phenotype, and persist.
conditions_rolled_up = conditions_rolled_up.merge(concept_ids, on = 'concept_id')
conditions_rolled_up = conditions_rolled_up[conditions_rolled_up['concept_name'] != 'Disorder of body system']
conditions_rolled_up.to_csv(f'{DATA_DIR}/aelhussein/phewas/data/conditions_rolled_up_2.csv', index = False)

In [None]:
# Wide table for the second filtering pass: one row per eid, one column per
# rolled-up concept_id, cells hold the summed condition_occurrence_count (0 if absent).
condtions_pivot_table = pd.pivot_table(
    conditions_rolled_up,
    index='eid',
    columns='concept_id',
    values='condition_occurrence_count',
    aggfunc='sum',
    fill_value=0,
)

In [None]:
# Persist the second-pass wide table.
condtions_pivot_table.to_pickle(
    f'{DATA_DIR}/aelhussein/phewas/data/conditions_rolled_up_pivot_2.pkl'
)

# Quick tree model check

In [280]:
# Reload the second-pass wide table and display it.
# NOTE: read_pickle executes pickled code; only load files from a trusted location.
condtions_pivot_table = pd.read_pickle(
    f'{DATA_DIR}/aelhussein/phewas/data/conditions_rolled_up_pivot_2.pkl'
)
condtions_pivot_table

concept_id,23986,24609,24818,24966,26378,26662,26711,27674,29056,29735,...,44783158,44784105,44784106,44784217,45757635,45757810,45766466,45766714,45772881,46273620
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000035,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,0,0
1000044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000060,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6023666,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6023673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6023680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6023697,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [70]:
# Read a single data row from the tab-separated rs12203592 file, skipping its first 9 lines
# (presumably header/metadata lines of a VCF-style file — confirm against the file).
labels = pd.read_csv(
    f'{DATA_DIR}/aelhussein/phewas/data/rs12203592',
    sep='\t',
    skiprows=9,
    nrows=1,
)
# Drop the first 9 columns and transpose so each remaining column becomes a row.
labels = labels.iloc[:, 9:].transpose()

In [71]:
# Index entries are reduced to the integer before the first underscore so they can
# be aligned with the eid index of the pivot table.
id_prefix = labels.index.str.split('_').str[0]
labels.index = id_prefix.astype(int)
labels.columns = ['label']

# Binary encoding of the genotype strings: '0/0' -> 0, '0/1' and '1/1' -> 1,
# './.' -> NaN (removed later by dropna).
mapping_dict = {
    '0/0': 0,
    '0/1': 1,
    '1/1': 1,
    './.': np.nan,
}
labels['label'] = labels['label'].replace(mapping_dict)

In [281]:
# Join phenotypes and labels on their indexes, drop rows with any missing value,
# then binarise every column: 1 where the value is > 0, else 0.
data = condtions_pivot_table.merge(labels, left_index=True, right_index=True).dropna()
data_bin = data.gt(0).astype(int)

In [282]:
# Split by label, keep only participants with more than 5% of the phenotype columns set,
# then balance the two classes by downsampling.
# Feature columns are selected by name rather than by position, so the filter does not
# depend on 'label' being the last column.
feature_cols = data_bin.columns.drop('label')
has_enough_phenotypes = data_bin[feature_cols].mean(axis = 1) > 0.05
data_0 = data_bin[(data_bin['label'] == 0) & has_enough_phenotypes]
data_1 = data_bin[(data_bin['label'] == 1) & has_enough_phenotypes]

# Downsample whichever class is larger. Sampling class 1 down to len(data_0) raises a
# ValueError when class 1 is the smaller class; taking the minimum avoids that and is
# identical to the previous behaviour whenever class 1 is at least as large as class 0.
n_per_class = min(len(data_0), len(data_1))
if len(data_0) > n_per_class:
    data_0 = data_0.sample(n=n_per_class, random_state=42)
data_1_subsample = data_1.sample(n=n_per_class, random_state=42)
balanced_data = pd.concat([data_0, data_1_subsample])

In [289]:
# Separate the target from the features and hold out 20% of rows for validation.
y = balanced_data['label']
X = balanced_data.drop(columns='label')
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Random-forest hyperparameter search: 100 sampled configurations, 5-fold CV,
# all available cores.
param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
# Best configuration found by the search.
best_rf = random_search.best_estimator_

In [294]:
# Score the tuned random forest on the validation split.
# AUC uses the probability of the second class (column 1); accuracy uses the hard predictions.
y_pred_proba = best_rf.predict_proba(X_val)[:, 1]
y_pred = best_rf.predict(X_val)
auc = roc_auc_score(y_val, y_pred_proba)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation AUC: {auc:.4f}, Validation Accuracy: {accuracy:.4f}")

Validation AUC: 0.5748, Validation Accuracy: 0.5523


In [295]:
# Random-forest feature importances, highest first, annotated with concept names.
feature_importances_rf = best_rf.feature_importances_
feature_names = X_train.columns
concept_name_lookup = concepts[['concept_id', 'concept_name']]
df_feature_importances_rf = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances_rf})
    .sort_values(by='Importance', ascending=False)
    .merge(concept_name_lookup, left_on='Feature', right_on='concept_id')
)
# Show the first ten rows.
df_feature_importances_rf.head(10)

Unnamed: 0,Feature,Importance,concept_id,concept_name
0,138825,0.007995,138825,Actinic keratosis
1,4155297,0.005128,4155297,Malignant neoplasm of skin
2,4189459,0.005076,4189459,Disorders of skin induced by physical agents
3,4153882,0.003689,4153882,Malignant neoplasm of soft tissue
4,316866,0.003477,316866,Hypertensive disorder
5,75909,0.003454,75909,Disorder of bone
6,443783,0.003435,443783,Chronic disease
7,75865,0.00343,75865,Disorder of the urinary system
8,4043371,0.003402,4043371,Inflammatory disorder of digestive tract
9,4027553,0.003387,4027553,Disorder of lower respiratory system


In [None]:
# XGBoost hyperparameter search: 100 sampled configurations, 5-fold CV,
# all available cores.
xgb_param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 4, 5, 6, 7],
    min_child_weight=[1, 2, 3, 4],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3, 0.4],
)

xgb_classifier = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=xgb_param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
xgb_random_search.fit(X_train, y_train)
# Best configuration found by the search.
best_xgb = xgb_random_search.best_estimator_

In [300]:
# Score the tuned XGBoost model on the validation split.
# AUC uses the probability of the second class (column 1); accuracy uses the hard predictions.
y_pred_proba = best_xgb.predict_proba(X_val)[:, 1]
y_pred = best_xgb.predict(X_val)
auc = roc_auc_score(y_val, y_pred_proba)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation AUC: {auc:.4f}, Validation Accuracy: {accuracy:.4f}")

Validation AUC: 0.5864, Validation Accuracy: 0.5510


In [302]:
# XGBoost importances requested with importance_type='weight', highest first,
# annotated with concept names.
feature_importances_xgb = best_xgb.get_booster().get_score(importance_type='weight')
df_feature_importances_xgb = pd.DataFrame(
    list(feature_importances_xgb.items()),
    columns=['Feature', 'Importance'],
).sort_values(by='Importance', ascending=False)
# Cast the feature keys to int so they can be matched against concept_id.
df_feature_importances_xgb['Feature'] = df_feature_importances_xgb['Feature'].astype(int)
concept_name_lookup = concepts[['concept_id', 'concept_name']]
df_feature_importances_xgb = df_feature_importances_xgb.merge(
    concept_name_lookup, left_on='Feature', right_on='concept_id'
)
# Show the first ten rows.
df_feature_importances_xgb.head(10)

Unnamed: 0,Feature,Importance,concept_id,concept_name
0,138825,116.0,138825,Actinic keratosis
1,4155297,106.0,4155297,Malignant neoplasm of skin
2,4171257,71.0,4171257,Melanocytic tumor of skin
3,138102,45.0,138102,Benign neoplasm of skin
4,4147145,42.0,4147145,Tendinitis
5,256451,39.0,256451,Bronchitis
6,4181583,33.0,4181583,Upper respiratory infection
7,4343223,31.0,4343223,Epidermal nevus
8,4028367,31.0,4028367,Acute disease of cardiovascular system
9,432868,30.0,432868,Hemoglobinopathy
