In [1]:
import numpy as np
import pandas as pd

In [2]:
lavertu = pd.read_csv('../raw_data/side_effect_severity_ranking_lavertu.csv')
gottlieb = pd.read_csv('../raw_data/side_effect_severity_ranking_gottlieb.csv')

In [3]:
# cleaning the side effect severity scoring dataframes
lavertu = lavertu.drop(columns=['cui', 'pt_code', 'saedr_score_std'])
lavertu.columns = ['side_effect', 'score_lavertu']
gottlieb = gottlieb.drop(columns='Rank Stdev (% out 2929)')
gottlieb.columns = ['side_effect', 'score_gottlieb']
gottlieb['side_effect'] = gottlieb['side_effect'].str.lower().str.replace('-', ' ')

In [4]:
twosides_labels = pd.read_csv('../raw_data/mt_reclassified_twosides_labels.csv')

In [5]:
merged_df = twosides_labels.merge(lavertu, on='side_effect', how='left').merge(gottlieb, on='side_effect', how='left')

In [6]:
def score_category(score, gottlieb_or_lavertu):
    if gottlieb_or_lavertu == 'lavertu':
        if score < 0.4:
            score = 1
        elif score < 0.8:
            score = 2
        elif score >= 0.8:
            score = 3
        else:
            score = 0
    elif gottlieb_or_lavertu == 'gottlieb':
        if score < 0.35:
            score = 1
        elif score < 0.7:
            score = 2
        elif score >= 0.7:
            score = 3
        else:
            score = 0
    return score

In [7]:
merged_df['score_lavertu_cat'] = merged_df['score_lavertu'].apply(score_category, args=('lavertu',))

In [8]:
merged_df['score_gottlieb_cat'] = merged_df['score_gottlieb'].apply(score_category, args=('gottlieb',))

In [9]:
merged_df['score_consolidated'] = 0

In [10]:
merged_df

Unnamed: 0,Y,side_effect,sub_system,sub_system_v2,score_lavertu,score_gottlieb,score_lavertu_cat,score_gottlieb_cat,score_consolidated
0,91,biliary tract disorder,bile_duct_gallbladder,bile_duct_gallbladder_pancreas,0.550246,0.48,2,2,0
1,1166,gallbladder polyp,kidneys_bladder_urethra,bile_duct_gallbladder_pancreas,0.615268,0.46,2,2,0
2,974,cholecystitis chronic,bile_duct_gallbladder,bile_duct_gallbladder_pancreas,0.662647,0.44,2,2,0
3,163,cholelithiasis,bile_duct_gallbladder,bile_duct_gallbladder_pancreas,0.668264,0.51,2,2,0
4,162,cholecystitis,bile_duct_gallbladder,bile_duct_gallbladder_pancreas,0.676486,0.56,2,2,0
...,...,...,...,...,...,...,...,...,...
1312,745,pyuria,kidneys_bladder_urethra,urinary_system_general,0.659667,0.44,2,2,0
1313,356,glucosuria,kidneys_bladder_urethra,urinary_system_general,,,0,0,0
1314,386,blood in urine,kidneys_bladder_urethra,urinary_system_general,,,0,0,0
1315,389,hemoglobinuria,kidneys_bladder_urethra,urinary_system_general,,,0,0,0


In [11]:
for index, row in merged_df.iterrows():
    if row['score_lavertu_cat'] == row['score_gottlieb_cat']:
        merged_df.loc[index, 'score_consolidated'] = (row['score_lavertu_cat'] + row['score_gottlieb_cat'])/2
    elif row['score_lavertu_cat'] == 0 or row['score_gottlieb_cat'] == 0:
        merged_df.loc[index, 'score_consolidated'] = (row['score_lavertu_cat'] + row['score_gottlieb_cat'])

In [12]:
merged_df['score_consolidated'].value_counts()

0    668
2    537
1     80
3     32
Name: score_consolidated, dtype: int64

In [48]:
merged_df.to_csv('v2.csv', index=False)

# Twosides

In [69]:
reclass = pd.read_csv('../raw_data/mt_reclassified_twosides_labels_final.csv')

In [70]:
reclass['sub_system_severity'] = reclass['sub_system_v2'] + '_' + reclass['score_consolidated'].apply(str)

In [71]:
reclass = reclass[['Y', 'side_effect', 'sub_system_severity']].dropna().copy()

In [72]:
reclass

Unnamed: 0,Y,side_effect,sub_system_severity
0,91,biliary tract disorder,bile_duct_gallbladder_pancreas_2
1,1166,gallbladder polyp,bile_duct_gallbladder_pancreas_2
2,974,cholecystitis chronic,bile_duct_gallbladder_pancreas_2
3,163,cholelithiasis,bile_duct_gallbladder_pancreas_2
4,162,cholecystitis,bile_duct_gallbladder_pancreas_2
...,...,...,...
1312,1229,abnormal laboratory findings,zzz_delete_0
1313,45,animal bite,zzz_delete_0
1314,1300,infestation,zzz_delete_0
1315,1200,splinter,zzz_delete_0


In [73]:
from sklearn.preprocessing import LabelEncoder

In [74]:
le = LabelEncoder()
reclass['Y_cat'] = le.fit_transform(reclass['sub_system_severity'])

In [75]:
reclass

Unnamed: 0,Y,side_effect,sub_system_severity,Y_cat
0,91,biliary tract disorder,bile_duct_gallbladder_pancreas_2,0
1,1166,gallbladder polyp,bile_duct_gallbladder_pancreas_2,0
2,974,cholecystitis chronic,bile_duct_gallbladder_pancreas_2,0
3,163,cholelithiasis,bile_duct_gallbladder_pancreas_2,0
4,162,cholecystitis,bile_duct_gallbladder_pancreas_2,0
...,...,...,...,...
1312,1229,abnormal laboratory findings,zzz_delete_0,87
1313,45,animal bite,zzz_delete_0,87
1314,1300,infestation,zzz_delete_0,87
1315,1200,splinter,zzz_delete_0,87


In [76]:
reclass.to_csv('mt_reclassification_encoded.csv', index=False)

In [31]:
ts = pd.read_csv('../raw_data/twosides.csv')

In [32]:
ts

Unnamed: 0,Drug1_ID,Drug1,Drug2_ID,Drug2,Y
0,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,767
1,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,25
2,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,85
3,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,735
4,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,959
...,...,...,...,...,...
4649436,CID000000453,C(C(C(C(C(CO)O)O)O)O)O,CID000004595,CC1=NC=CN1CC2CCC3=C(C2=O)C4=CC=CC=C4N3C,912
4649437,CID000005391,CN1C2=C(C=C(C=C2)Cl)C(=NC(C1=O)O)C3=CC=CC=C3,CID000068740,C1=CN(C=N1)CC(O)(P(=O)(O)O)P(=O)(O)O,1281
4649438,CID000002909,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,CID000005978,CCC1(CC2CC(C3=C(CCN(C2)C1)C4=CC=CC=C4N3)(C5=C(...,289
4649439,CID000003937,C1CC(N(C1)C(=O)C(CCCCN)NC(CCC2=CC=CC=C2)C(=O)O...,CID000077999,CN(CCOC1=CC=C(C=C1)CC2C(=O)NC(=O)S2)C3=CC=CC=N3,2


In [34]:
ts = ts.merge(reclass[['Y', 'Y_cat', 'sub_system_severity']], on='Y')

In [98]:
twosides_df[twosides_df['Y_cat'] == 61]

Unnamed: 0,Y,sub_system_v2,score_consolidated,concat,Y_cat
801,33,muscles_tendons,2,muscles_tendons_2,61


In [35]:
ts

Unnamed: 0,Drug1_ID,Drug1,Drug2_ID,Drug2,Y,Y_cat,sub_system_severity
0,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,767,32,eyes_2
1,CID000002541,CCOC1=NC2=CC=CC(=C2N1CC3=CC=C(C=C3)C4=CC=CC=C4...,CID000003440,C1=COC(=C1)CNC2=CC(=C(C=C2C(=O)O)S(=O)(=O)N)Cl,767,32,eyes_2
2,CID000000772,CC(=O)NC1C(C(C(OC1O)COS(=O)(=O)O)OC2C(C(C(C(O2...,CID000002656,CN1C(=NC(=O)C(=N1)[O-])SCC2=C(N3C(C(C3=O)NC(=O...,767,32,eyes_2
3,CID000001972,CC1C=CC=CC=CC=CC=CC=CC=CC(CC2C(C(CC(O2)(CC(CC(...,CID000002764,C1CC1N2C=C(C(=O)C3=CC(=C(C=C32)N4CCNCC4)F)C(=O)O,767,32,eyes_2
4,CID000002462,CCCC1OC2CC3C4CCC5=CC(=O)C=CC5(C4C(CC3(C2(O1)C(...,CID000003016,CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3,767,32,eyes_2
...,...,...,...,...,...,...,...
4649436,CID000004900,CC12CC(=O)C3C(C1CCC2(C(=O)CO)O)CCC4=CC(=O)C=CC34C,CID000005372,CC1CC(C2C(CC(C(O2)(C(=O)C(=O)N3CCCCC3C(=O)OC(C...,747,16,brain_spinal_cord_2
4649437,CID000003937,C1CC(N(C1)C(=O)C(CCCCN)NC(CCC2=CC=CC=C2)C(=O)O...,CID000004900,CC12CC(=O)C3C(C1CCC2(C(=O)CO)O)CCC4=CC(=O)C=CC34C,219,77,skin_subcutaneous fat_1
4649438,CID000003075,CC(=O)OC1C(SC2=CC=CC=C2N(C1=O)CC[NH+](C)C)C3=C...,CID000003780,C1C(C2C(O1)C(CO2)O[N+](=O)[O-])O[N+](=O)[O-],219,77,skin_subcutaneous fat_1
4649439,CID000004585,CC1=CC2=C(NC3=CC=CC=C3N=C2S1)N4CCN(CC4)C,CID000060787,CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(C(CC3=CC=CC=C3)...,501,75,reproductive_system_general_2


In [36]:
ts['sub_system_severity'].value_counts()

heart_2                                           275258
blood_2                                           229020
lungs_2                                           202675
skin_subcutaneous fat_2                           197843
brain_spinal_cord_2                               190274
                                                   ...  
delete_1                                             826
thyroid_pituitary_gland_adrenal_parathyroids_3       779
motor_3                                              696
joints_connective_tissues_3                          502
reproductive_system_female_1                          36
Name: sub_system_severity, Length: 90, dtype: int64

# Misc

In [None]:
def deciding_cutoff_boundaries(cutoff_range):
    '''for both lavertu and gottlieb, to decide on how to split the side effects into three categories (mild, moderate, severe)
    so that the number of matching severity classes between the two dfs are maximised
    '''
    cutoff_score_accuracy = {'lavertu_cutoff': [],
                             'gottlieb_cutoff': [],
                             'number_different': []
                              }

    for lavertu_cutoff in np.linspace(*cutoff_range, 50):
        # assigning side effects with scores < lavertu_cutoff with 0, and those with scores > lavertu_cutoff as 1
        lavertu_classes = merged_df_no_na['score_lavertu'].apply(lambda x: 0 if x < lavertu_cutoff else 1)
        for gottlieb_cutoff in np.linspace(*cutoff_range, 50):
            # assigning side effects with scores < gottlieb_cutoff with 0, and those with scores > gottlieb_cutoff as 1
            gottlieb_classes = merged_df_no_na['score_gottlieb'].apply(lambda x: 0 if x < gottlieb_cutoff else 1)

            # if the recategorised side effects for both lavertu and gottlieb fall in the same category (0 or 1),
            # then subtracting them from one another will give a value of 0. If the recategorised side effects are
            # of different categories, subtracting them from one another will give a value of 1 after applying modulus
            # summing the numbers up afterwards will yield the total number of differently classified pairs.
            number_different = (lavertu_classes - gottlieb_classes).abs().sum()

            cutoff_score_accuracy['lavertu_cutoff'].append(lavertu_cutoff)
            cutoff_score_accuracy['gottlieb_cutoff'].append(gottlieb_cutoff)
            cutoff_score_accuracy['number_different'].append(number_different) 
        
    return pd.DataFrame(cutoff_score_accuracy).sort_values('number_different')