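# Objective

Flag five comorbidities (hyperlipidaemia, hypertension, PAD, CKD and diabetes) in the primary-care Read-code records of the selected patients, and count how often each combination of them occurs.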

# Packages

In [89]:
import pandas as pd 
import numpy as np 
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from itertools import compress
from matplotlib.ticker import PercentFormatter
from matplotlib_venn import venn3, venn2, venn2_circles
%matplotlib inline

# Datasets

In [3]:
# Load the pickled `record` frame.
# NOTE(review): unpickling can execute arbitrary code - only read pickles that
# come from a trusted source.
record = pd.read_pickle(
    '../primary_care/records2.pkl'
)

In [4]:
# Tab-separated primary-care clinical events, decoded as ISO-8859-1.
pri_cli = pd.read_csv(
    '../primary_care/gp_clinical.txt',
    sep='\t',
    encoding='ISO-8859-1',
)

In [14]:
# Lookup table of Read v2 / v3 codes, one row per code, labelled by diagnosis.
diag = pd.read_excel(
    '../primary_care/diagnosis_cvd.xlsx'
)

# Specifying dataset

In [10]:
# Keep only the records whose `discrepancy` flag is False and whose `event_int`
# is positive.
# .copy() makes `rec` an independent frame: columns are added to it later in the
# notebook, and writing into a filtered slice of `record` would raise pandas'
# SettingWithCopyWarning.
rec = record[(record['discrepancy'] == False) & (record['event_int'] > 0)].copy()

In [11]:
# eids of the retained records, used below to subset the primary-care table.
patients = [eid for eid in rec['eid'].values]

In [12]:
# Primary-care rows belonging to the retained patients.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an 'index' column.
pri = pri_cli.loc[pri_cli['eid'].isin(patients)].reset_index()

# Diagnosis

In [15]:
# Display the diagnosis code lookup table loaded from diagnosis_cvd.xlsx.
diag

Unnamed: 0,CHAPTER,READV2_CODE,READV2_DESC,TERMV2_DESC,TERMV2_ORDER,TERMV2_TYPE,READV3_CODE,TERMV3_CODE,TERMV3_TYPE,TERMV3_DESC,IS_ASSURED,diagnosis
0,6,6879.,Hyperlipidaemia screen,Hyperlipidaemia screen,0,P,XE1Td,YaemR,P,Hyperlipidaemia screening,1,hyperlipidaemia
1,C,C3202,"Hyperlipidaemia, group A","Hyperlipidaemia, group A",0,P,C3202,Y41wk,P,"Hyperlipidaemia, group A",1,hyperlipidaemia
2,C,C3220,Familial combined hyperlipidaemia,Familial combined hyperlipidaemia,0,P,X40Vm,Y41se,P,Familial combined hyperlipidaemia,1,hyperlipidaemia
3,C,C324.,Hyperlipidaemia NOS,Hyperlipidaemia NOS,0,P,C324.,Y41wh,P,Hyperlipidaemia NOS,1,hyperlipidaemia
4,C,C322.,Mixed hyperlipidaemia,Mixed hyperlipidaemia,0,P,XE11U,Y41xO,P,Mixed hyperlipidaemia,1,hyperlipidaemia
...,...,...,...,...,...,...,...,...,...,...,...,...
467,C,C321.,Pure hyperglyceridaemia,Pure hyperglyceridaemia,0,P,XE11T,Y41x7,P,Pure hyperglyceridaemia,1,hyperlipidaemia
468,C,C3210,Hypertriglyceridaemia,Hypertriglyceridaemia,0,P,Xa9At,Y41x6,P,Hypertriglyceridaemia,1,hyperlipidaemia
469,C,C322.,Mixed hyperlipidaemia,Mixed hyperlipidaemia,0,P,XE11U,Y41xO,P,Mixed hyperlipidaemia,1,hyperlipidaemia
470,C,C328.,Dyslipidaemia,Dyslipidaemia,0,P,XaL5p,YarER,P,Dyslipidaemia,1,hyperlipidaemia


In [16]:
# Distinct diagnosis labels present in the lookup table.
diag.loc[:, 'diagnosis'].unique()

array(['hyperlipidaemia', 'hypertension', 'PAD', 'CKD', 'diabetes'],
      dtype=object)

In [17]:
# Map each diagnosis label (plus the catch-all key 'all') to the Read v2 and
# Read v3 codes listed for it in `diag`.
def _codes(frame):
    """Return the non-missing Read v2 / v3 codes of `frame` as plain lists.

    Missing codes are dropped because `Series.isin` treats NaN as equal to NaN:
    a NaN left in one of these lists would match every primary-care row whose
    read_2 / read_3 field is empty.
    """
    return {
        'read2': frame['READV2_CODE'].dropna().tolist(),
        'read3': frame['READV3_CODE'].dropna().tolist(),
    }


diagnosis_codes = {
    label: _codes(diag[diag['diagnosis'] == label])
    for label in ('hyperlipidaemia', 'hypertension', 'PAD', 'CKD', 'diabetes')
}
# Every code in the table, regardless of diagnosis.
diagnosis_codes['all'] = _codes(diag)

In [27]:
# Primary-care rows carrying any code of interest, in either Read version.
# .copy() detaches the result from `pri`, so the later column drops and
# de-duplication act on an independent frame rather than on a filtered slice.
diag_pri = pri[
    pri['read_2'].isin(diagnosis_codes['all']['read2'])
    | pri['read_3'].isin(diagnosis_codes['all']['read3'])
].copy()

In [28]:
# Discard the 'index' column left behind by reset_index() when `pri` was built.
# Re-binding the result (instead of inplace=True) never writes into a frame
# that may be a view of another one, and keeps the step chainable.
diag_pri = diag_pri.drop('index', axis=1)

In [33]:
# Discard the event date - presumably so that the same code recorded on several
# dates collapses to one row in the de-duplication step (TODO confirm intent).
# Re-binding the result (instead of inplace=True) never writes into a frame
# that may be a view of another one.
diag_pri = diag_pri.drop('event_dt', axis=1)

In [35]:
# Remove fully identical rows, keeping the first occurrence of each.
# Re-binding the result (instead of inplace=True) never writes into a frame
# that may be a view of another one.
diag_pri = diag_pri.drop_duplicates(keep='first')


In [40]:
# One row per patient: every other column is collapsed into a Python list.
new_diag_pri = (
    diag_pri
    .groupby('eid')
    .agg(list)
    .reset_index()
)

In [41]:
# Display the per-patient frame (one row per eid, list-valued columns).
new_diag_pri

Unnamed: 0,eid,data_provider,read_2,read_3,value1,value2,value3
0,1000530,[3],[nan],[XE0Uc],[nan],[nan],[nan]
1,1001624,[1],[C324.],[nan],[nan],[nan],[nan]
2,1003479,[3],[nan],[XE0Uc],[nan],[nan],[nan]
3,1004792,"[3, 3]","[nan, nan]","[Xa9As, X40J5]","[nan, nan]","[nan, nan]","[nan, nan]"
4,1009010,"[3, 3, 3, 3, 3, 3]","[nan, nan, nan, nan, nan, nan]","[C10.., X40J5, XaLHI, XaO3u, X40J6, X30In]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
...,...,...,...,...,...,...,...
2653,6015613,"[3, 3, 3]","[nan, nan, nan]","[X40J5, Xa9As, X40J6]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]"
2654,6015854,[3],[nan],[XE0Uc],[nan],[nan],[nan]
2655,6019332,[3],[nan],[XE0Ud],[nan],[nan],[nan]
2656,6020725,"[3, 3, 3]","[nan, nan, nan]","[XE0Uc, XE11S, XE11U]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]"


In [45]:
# Python type of the first read_2 entry in the first row.
type(new_diag_pri['read_2'].iloc[0][0])

float

# Diabetes

In [64]:
# Rows of `pri` whose Read v2 or Read v3 code is on the diabetes code list.
diabetes = pri.loc[
    pri['read_2'].isin(diagnosis_codes['diabetes']['read2'])
    | pri['read_3'].isin(diagnosis_codes['diabetes']['read3'])
]

In [65]:
# Collapse the diabetes rows to one row per eid (other columns become lists).
diabetes_df = (
    diabetes
    .groupby('eid')
    .agg(list)
    .reset_index()
)

In [66]:
# eids with at least one diabetes code.
diabetics = list(diabetes_df['eid'])

# CKD

In [67]:
# Rows of `pri` whose Read v2 or Read v3 code is on the CKD code list.
CKD = pri.loc[
    pri['read_2'].isin(diagnosis_codes['CKD']['read2'])
    | pri['read_3'].isin(diagnosis_codes['CKD']['read3'])
]


In [68]:
# Collapse the CKD rows to one row per eid (other columns become lists).
CKD_df = (
    CKD
    .groupby('eid')
    .agg(list)
    .reset_index()
)

In [69]:
# eids with at least one CKD code.
chronic = list(CKD_df['eid'])

# PAD

In [70]:
# Rows of `pri` whose Read v2 or Read v3 code is on the PAD code list.
PAD = pri.loc[
    pri['read_2'].isin(diagnosis_codes['PAD']['read2'])
    | pri['read_3'].isin(diagnosis_codes['PAD']['read3'])
]


In [71]:
# Collapse the PAD rows to one row per eid (other columns become lists).
PAD_df = (
    PAD
    .groupby('eid')
    .agg(list)
    .reset_index()
)

In [72]:
# eids with at least one PAD code.
peripheral = list(PAD_df['eid'])

# Hypertension

In [74]:
# Rows of `pri` whose Read v2 or Read v3 code is on the hypertension code list.
hypertension = pri.loc[
    pri['read_2'].isin(diagnosis_codes['hypertension']['read2'])
    | pri['read_3'].isin(diagnosis_codes['hypertension']['read3'])
]


In [75]:
 # Collapse the hypertension rows to one row per eid (other columns become lists).
 # NOTE(review): this cell starts with a stray leading space; IPython tolerates
 # it, but the cell is not valid as plain Python.
 hypertension_df = hypertension.groupby(by='eid').agg(list).reset_index()

In [76]:
# eids with at least one hypertension code.
hypertensives = list(hypertension_df['eid'])

# Hyperlipidaemia

In [77]:
# Rows of `pri` whose Read v2 or Read v3 code is on the hyperlipidaemia code list.
hyperlipid = pri.loc[
    pri['read_2'].isin(diagnosis_codes['hyperlipidaemia']['read2'])
    | pri['read_3'].isin(diagnosis_codes['hyperlipidaemia']['read3'])
]


In [78]:
# Collapse the hyperlipidaemia rows to one row per eid (other columns become lists).
hyperlipid_df = (
    hyperlipid
    .groupby('eid')
    .agg(list)
    .reset_index()
)

In [79]:
# eids with at least one hyperlipidaemia code.
hyperchol = list(hyperlipid_df['eid'])

# Gathering

In [101]:
# Label each record 'diabetic' when its eid is in `diabetics`, '' otherwise.
# isin() is one vectorised membership test, replacing a Python-level scan of the
# whole `diabetics` list for every row. assign() returns a new frame instead of
# writing into `rec` in place, which avoids SettingWithCopyWarning when `rec`
# is a filtered view of another frame.
rec = rec.assign(diabetic=np.where(rec['eid'].isin(diabetics), 'diabetic', ''))

In [102]:
# Label each record 'CKD' when its eid is in `chronic`, '' otherwise.
# isin() is one vectorised membership test, replacing a Python-level scan of the
# whole `chronic` list for every row. assign() returns a new frame instead of
# writing into `rec` in place, which avoids SettingWithCopyWarning when `rec`
# is a filtered view of another frame.
rec = rec.assign(CKD=np.where(rec['eid'].isin(chronic), 'CKD', ''))

In [103]:
# Label each record 'peripheral' when its eid is in `peripheral`, '' otherwise.
# isin() is one vectorised membership test, replacing a Python-level scan of the
# whole `peripheral` list for every row. assign() returns a new frame instead of
# writing into `rec` in place, which avoids SettingWithCopyWarning when `rec`
# is a filtered view of another frame.
rec = rec.assign(PAD=np.where(rec['eid'].isin(peripheral), 'peripheral', ''))

In [104]:
# Label each record 'hypertensives' when its eid is in `hypertensives`, ''
# otherwise.
# isin() is one vectorised membership test, replacing a Python-level scan of the
# whole `hypertensives` list for every row. assign() returns a new frame instead
# of writing into `rec` in place, which avoids SettingWithCopyWarning when `rec`
# is a filtered view of another frame.
rec = rec.assign(
    hypertension=np.where(rec['eid'].isin(hypertensives), 'hypertensives', '')
)

In [105]:
# Label each record 'hyperchol' when its eid is in `hyperchol`, '' otherwise.
# isin() is one vectorised membership test, replacing a Python-level scan of the
# whole `hyperchol` list for every row. assign() returns a new frame instead of
# writing into `rec` in place, which avoids SettingWithCopyWarning when `rec`
# is a filtered view of another frame.
rec = rec.assign(
    hyperlipidaemia=np.where(rec['eid'].isin(hyperchol), 'hyperchol', '')
)

In [106]:
# Names of the label columns added to `rec`, in the order they are reported.
concern = [
    'diabetic',
    'CKD',
    'PAD',
    'hypertension',
    'hyperlipidaemia',
]

In [107]:
# Print, for every label column, how many records carry each value
# (the empty string marks records without that diagnosis).
for column in concern:
    counts = rec[column].value_counts()
    print(counts)

3171
diabetic     853
Name: diabetic, dtype: int64
       3239
CKD     785
Name: CKD, dtype: int64
              3974
peripheral      50
Name: PAD, dtype: int64
                 2204
hypertensives    1820
Name: hypertension, dtype: int64
             3100
hyperchol     924
Name: hyperlipidaemia, dtype: int64


In [108]:
# Concatenate the five label strings into one combined label per record.
# No separator is inserted, and a record with no diagnosis gets ''.
rec['noconcern'] = (
    rec['diabetic']
    + rec['CKD']
    + rec['PAD']
    + rec['hypertension']
    + rec['hyperlipidaemia']
)

In [110]:
# Frequency of every combined label, as a one-column frame.
temp = rec['noconcern'].value_counts().to_frame()

In [112]:
# Write the label-combination counts to disk.
temp.to_csv(path_or_buf='../primary_care/rec_diagnosis.csv')