In [4]:
import pandas as pd
import numpy as np

from datetime import timedelta
from itertools import compress

## Load dataset

In [47]:
# Select columns by header name instead of by position: positional indices
# silently pick different fields if the extract is regenerated with a different
# column layout. These are the headers the positional selection returned.
# The field groups are the ones later relabelled as creatinine, glucose,
# HbA1c, HDL and LDL (value and date, instances 0 and 1).
SEC_BIOMARKER_COLUMNS = [
    'eid',
    '30700-0.0', '30700-1.0', '30701-0.0', '30701-1.0',
    '30740-0.0', '30740-1.0', '30741-0.0', '30741-1.0',
    '30750-0.0', '30750-1.0', '30751-0.0', '30751-1.0',
    '30760-0.0', '30760-1.0', '30761-0.0', '30761-1.0',
    '30780-0.0', '30780-1.0', '30781-0.0', '30781-1.0',
]
sec_cli = pd.read_csv('../primary_care/ukb41199.csv',
                      usecols=SEC_BIOMARKER_COLUMNS)

  interactivity=interactivity, compiler=compiler, result=result)


In [48]:
sec_cli.columns

Index(['eid', '30700-0.0', '30700-1.0', '30701-0.0', '30701-1.0', '30740-0.0',
       '30740-1.0', '30741-0.0', '30741-1.0', '30750-0.0', '30750-1.0',
       '30751-0.0', '30751-1.0', '30760-0.0', '30760-1.0', '30761-0.0',
       '30761-1.0', '30780-0.0', '30780-1.0', '30781-0.0', '30781-1.0'],
      dtype='object')

In [7]:
# Rename by explicit field-ID -> label mapping rather than by position, so a
# change in column order can never attach a label to the wrong field.
# Re-running the cell is a no-op once the columns carry the new names.
SEC_COLUMN_LABELS = {
    '30700-0.0': 'Creat1', '30700-1.0': 'Creat2',
    '30701-0.0': 'Creat1date', '30701-1.0': 'Creat2date',
    '30740-0.0': 'gluc1', '30740-1.0': 'gluc2',
    '30741-0.0': 'gluc1date', '30741-1.0': 'gluc2date',
    '30750-0.0': 'hba1c1', '30750-1.0': 'hba1c2',
    '30751-0.0': 'hba1c1date', '30751-1.0': 'hba1c2date',
    '30760-0.0': 'HDL1', '30760-1.0': 'HDL2',
    '30761-0.0': 'HDL1date', '30761-1.0': 'HDL2date',
    '30780-0.0': 'LDL1', '30780-1.0': 'LDL2',
    '30781-0.0': 'LDL1date', '30781-1.0': 'LDL2date',
}
sec_cli = sec_cli.rename(columns=SEC_COLUMN_LABELS)

In [8]:
# Load the previously saved per-patient records table.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it was produced by a trusted step of this project.
records = pd.read_pickle('../primary_care/records.pkl')

In [None]:
pri_cli = pd.read_csv('../primary_care/gp_clinical.txt', 
                      sep = '\t', encoding='ISO-8859-1')

In [None]:
biomarker = pd.read_excel('../primary_care/ProposedDiagnosisCodes.xlsx', sheet_name='diag')

## Get codes

In [9]:
# Clinical codes used to pick biomarker measurements out of the primary-care
# table: 'read2' lists are matched against the `read_2` column and 'read3'
# lists against `read_3`. Every code is a 5-character string (dot-padded), so
# a code without its trailing dot can never match via `isin`.
biomarker_codes = {
    'LDL': {
        'read2': ['44P6.', '44PI.', '44dB.', '44d5.', '44D5.', '44R4.'],
        # NOTE(review): 'Xalp4' may be a transcription of 'XaIp4' (capital I,
        # not lowercase L) - confirm against the source code list.
        'read3': ['44P6.', 'XaEVs', 'Xalp4', '44R4.']
    },
    'HDL': {
        'read2': ['44P5.', '44PB.', '44PC.'],#, '44R3.'],
        # Trailing dots restored on '44PB.' / '44PC.' to match the 5-character
        # form used by every other entry.
        'read3': ['44P5.', '44PB.', '44PC.']#, '44R3.']
    },
    'Hba1c': {
        'read2': ['42W4.'],
        'read3': ['XaJPJ']
    },
    'Creatinine': {
        'read2': ['44J3.', '44JF.', '4Q40.'],
        'read3': ['XE2q5', 'XaETQ', 'X771N']
    }
}

## Specify dataset

In [12]:
# Keep patients with more than one record, a non-null 'first_stroke_after_d'
# and no flagged discrepancy.
has_multiple_records = records['noRecords'] > 1
has_stroke_after = records['first_stroke_after_d'].notnull()
no_discrepancy = records['discrepancy'] == False
records_full = records[has_multiple_records & has_stroke_after & no_discrepancy]

In [13]:
records_full.shape

(5864, 14)

In [14]:
patients = list(records_full['eid'].values)

## Revised records to include those without dates

In [14]:
pri = pri_cli[pri_cli['eid'].isin(patients)].reset_index()

In [15]:
sec = sec_cli[sec_cli['eid'].isin(patients)].reset_index()

In [16]:
first_event = records_full[['eid', 'first_record_d']]

## Get helper funcs

In [17]:
def get_event_type(date, first_record_d, short_window=14, long_window=180):
    """Bucket a measurement date by its distance from the first record date.

    Parameters
    ----------
    date : datetime-like
        Date of the measurement.
    first_record_d : datetime-like
        Reference date; ``date - first_record_d`` must expose ``.days``.
    short_window : int, default 14
        Upper bound (inclusive, in days) of the closest bucket.
    long_window : int, default 180
        Upper bound (inclusive, in days) of the middle bucket.

    Returns
    -------
    str or None
        ``'index'`` when the whole-day difference is 0; otherwise
        ``'before_...'`` / ``'after_...'`` followed by ``'<=S'``,
        ``'>S,<=L'`` or ``'>L'`` (S = short_window, L = long_window).
        ``None`` when the difference is NaN (e.g. a missing date).

    Notes
    -----
    Only the ``.days`` component is used, so any time-of-day part is floored
    to whole days before bucketing.
    """
    diff = (date - first_record_d).days

    if diff == 0:
        return 'index'

    if diff < 0:
        side = 'before'
    elif diff > 0:
        side = 'after'
    else:
        # NaN difference: every comparison above is False.
        return None

    gap = abs(diff)
    if gap <= short_window:
        return '{}_<={}'.format(side, short_window)
    if gap <= long_window:
        return '{}_>{},<={}'.format(side, short_window, long_window)
    return '{}_>{}'.format(side, long_window)

def verify_float(x):
    """Return True when ``x`` can be read as a finite number greater than zero.

    Anything that cannot be converted with ``float`` (unit strings, ``None``,
    other non-numeric objects) yields False instead of raising. NaN, zero,
    negative values and infinities also yield False, so a literal such as
    ``'inf'`` is not mistaken for a measurement.
    """
    try:
        number = float(x)
    except (TypeError, ValueError):
        # TypeError covers None and other objects float() cannot accept.
        return False
    # NaN fails both comparisons; +inf fails the upper bound.
    return 0.0 < number < float('inf')

In [18]:
def get_event_type_creat(date, first_record_d):
    """Bucket a creatinine measurement date relative to the first record date.

    Uses the whole-day difference ``(date - first_record_d).days`` and returns
    ``'index'`` for 0 days, otherwise ``'before_'`` or ``'after_'`` followed by
    ``'<=14'``, ``'>14,<=90'`` or ``'>90'``. Returns None when the difference
    is NaN, because no comparison matches.
    """
    diff = (date - first_record_d).days

    if diff == 0:
        return 'index'

    if diff < 0:
        prefix = 'before_'
    elif diff > 0:
        prefix = 'after_'
    else:
        return None

    distance = abs(diff)
    if distance > 90:
        return prefix + '>90'
    if distance > 14:
        return prefix + '>14,<=90'
    return prefix + '<=14'

## Get LDL

### Primary

In [104]:
# Rows whose Read v2 or Read v3 code is one of the LDL codes.
ldl_code_mask = (
    pri['read_2'].isin(biomarker_codes['LDL']['read2'])
    | pri['read_3'].isin(biomarker_codes['LDL']['read3'])
)
# .copy() gives an independent frame, so the column assignments in the next
# cells do not write to a slice of `pri` (source of SettingWithCopyWarning).
LDL = pri[ldl_code_mask].copy()

In [105]:
LDL['value3'] = LDL['value3'].replace('mmol/l', 'mmol/L')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [106]:
# For each row, collect the entries of value1/value2/value3 (in that order)
# that verify_float accepts.
LDL['value'] = LDL.apply(
    lambda row: [
        candidate
        for candidate in (row['value1'], row['value2'], row['value3'])
        if verify_float(candidate)
    ],
    axis=1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [107]:
# Keep the first accepted candidate as a float; NaN when the row had none.
LDL['value'] = LDL['value'].map(lambda x: float(x[0]) if len(x) > 0 else np.nan)

# Drop rows without a usable value. .copy() because the following cell assigns
# LDL['event_dt'], which would otherwise write to a slice of the old frame.
LDL = LDL[LDL['value'].notnull()].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [108]:
LDL['event_dt'] = pd.to_datetime(LDL['event_dt'])

In [109]:
new_LDL = pd.merge(LDL, first_event, how = 'left', on = 'eid')

In [110]:
new_LDL['event_type'] = new_LDL.apply(lambda x: get_event_type(x['event_dt'], x['first_record_d']), axis=1)

In [111]:
new_LDL['value3'].unique()

array([nan, 'MEA096', 'MEA000', 'mmol/L', 'Unknown', 'MEA151'],
      dtype=object)

In [112]:
def get_LDL_value3type(value3):
    """Collapse a raw ``value3`` entry into a coarse unit label.

    The entry is converted to ``str`` first. Anything containing ``'MEA'``
    becomes ``'MEA'``; ``'nan'`` and ``'Unknown'`` become ``'Unknown'``; every
    other string is returned unchanged.
    """
    label = str(value3)
    if label.find('MEA') != -1:
        return 'MEA'
    return 'Unknown' if label in ('nan', 'Unknown') else label
new_LDL['value3_'] = new_LDL['value3'].map(get_LDL_value3type)

In [113]:
new_LDL['value3_'].unique()

array(['Unknown', 'MEA', 'mmol/L'], dtype=object)

In [114]:
new_LDL.groupby(['value3_', 'event_type'])['eid'].count()

value3_  event_type      
MEA      after_<=14              5
         after_>14,<=180        55
         after_>180           1576
         before_<=14             5
         before_>14,<=180       28
         before_>180           249
Unknown  after_<=14             98
         after_>14,<=180       965
         after_>180          20133
         before_<=14            63
         before_>14,<=180      543
         before_>180          5475
         index                  25
mmol/L   after_<=14              3
         after_>14,<=180        38
         after_>180            867
         before_<=14             6
         before_>14,<=180       14
         before_>180           116
         index                   1
Name: eid, dtype: int64

In [115]:
new_LDL.groupby(['value3_', 'event_type'])['eid'].nunique()

value3_  event_type      
MEA      after_<=14             5
         after_>14,<=180       49
         after_>180           297
         before_<=14            5
         before_>14,<=180      26
         before_>180           87
Unknown  after_<=14            96
         after_>14,<=180      795
         after_>180          2987
         before_<=14           60
         before_>14,<=180     477
         before_>180         1471
         index                 24
mmol/L   after_<=14             3
         after_>14,<=180       30
         after_>180           243
         before_<=14            5
         before_>14,<=180      12
         before_>180           38
         index                  1
Name: eid, dtype: int64

### Secondary

In [49]:
# One frame per instance (value + date). .copy() detaches each from `sec` so
# the in-place cleaning and column assignments below do not act on a slice.
LDL1 = sec[['eid', 'LDL1', 'LDL1date']].copy()
LDL2 = sec[['eid', 'LDL2', 'LDL2date']].copy()

In [50]:
# Drop rows with a missing value or date. Reassigning (rather than
# inplace=True) avoids mutating a frame that was sliced out of `sec`.
LDL1 = LDL1.dropna(axis=0)
LDL2 = LDL2.dropna(axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [51]:
LDL1['LDL1date'] = pd.to_datetime(LDL1['LDL1date'])
LDL2['LDL2date'] = pd.to_datetime(LDL2['LDL2date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [52]:
LDL1.columns = ['eid', 'LDL', 'date']
LDL2.columns = ['eid', 'LDL', 'date']

In [53]:
# Stack both instances. pd.concat replaces DataFrame.append, which is
# deprecated since pandas 1.4 and removed in 2.0; the row index is kept as-is.
# NOTE: this overwrites LDL1, so re-running the cell stacks LDL2 again.
LDL1 = pd.concat([LDL1, LDL2])

In [54]:
LDL_sec = pd.merge(LDL1, first_event, on = 'eid', how = 'left')

In [55]:
LDL_sec['event_type'] = LDL_sec.apply(lambda x: get_event_type(x['date'], x['first_record_d']), axis=1)

In [56]:
LDL_sec.groupby(['event_type'])['eid'].count()

event_type
after_<=14             4
after_>14,<=180       50
after_>180          5530
before_<=14            1
before_>14,<=180      16
before_>180           12
Name: eid, dtype: int64

In [57]:
LDL_sec.groupby(['event_type'])['eid'].nunique()

event_type
after_<=14             4
after_>14,<=180       50
after_>180          5409
before_<=14            1
before_>14,<=180      15
before_>180           12
Name: eid, dtype: int64

## HDL

### Primary

In [125]:
# Rows whose Read v2 or Read v3 code is one of the HDL codes.
hdl_code_mask = (
    pri['read_2'].isin(biomarker_codes['HDL']['read2'])
    | pri['read_3'].isin(biomarker_codes['HDL']['read3'])
)
# .copy() gives an independent frame, so the column assignments in the next
# cells do not write to a slice of `pri` (source of SettingWithCopyWarning).
HDL = pri[hdl_code_mask].copy()

In [126]:
# For each row, collect the entries of value1/value2/value3 (in that order)
# that verify_float accepts.
HDL['value'] = HDL.apply(
    lambda row: [
        candidate
        for candidate in (row['value1'], row['value2'], row['value3'])
        if verify_float(candidate)
    ],
    axis=1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [127]:
# First accepted candidate, rounded to one decimal; NaN when the row had none.
HDL['value'] = HDL['value'].map(
    lambda candidates: np.nan if len(candidates) == 0 else round(float(candidates[0]), 1)
)

# Keep only rows that ended up with a usable value.
HDL = HDL[HDL['value'].notnull()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [128]:
new_HDL = pd.merge(HDL, first_event, how = 'left', on = 'eid')

In [129]:
new_HDL['event_dt'] = pd.to_datetime(new_HDL['event_dt'])

In [130]:
new_HDL['event_type'] = new_HDL.apply(lambda x: get_event_type(x['event_dt'], x['first_record_d']), axis=1)

In [131]:
new_HDL['value3'].unique()

array([nan, 'MEA096', 'MEA000', 'mmol/L', 'mmol/l', 'Unknown', 'MEA151'],
      dtype=object)

In [132]:
def get_HDL_value3type(value3):
    """Collapse a raw ``value3`` entry into a coarse unit label.

    The entry is converted to ``str`` first. Anything containing ``'MEA'``
    becomes ``'MEA'``; ``'nan'`` and ``'Unknown'`` become ``'Unknown'``;
    ``'mmol/L'`` and ``'mmol/l'`` become ``'mmol/L'``. Any other string is
    returned unchanged, matching the LDL/creatinine/HbA1c helpers. Previously
    such values fell through and returned None, which a groupby on the label
    silently drops.
    """
    value3 = str(value3)
    if 'MEA' in value3:
        return 'MEA'
    elif value3 in ['nan', 'Unknown']:
        return 'Unknown'
    elif value3 in ['mmol/L', 'mmol/l']:
        return 'mmol/L'
    else:
        return value3
new_HDL['value3_'] = new_HDL['value3'].map(get_HDL_value3type)

In [133]:
new_HDL['value3_'].unique()

array(['Unknown', 'MEA', 'mmol/L'], dtype=object)

In [134]:
new_HDL.groupby(['value3_', 'event_type'])['eid'].count()

value3_  event_type      
MEA      after_<=14              7
         after_>14,<=180        81
         after_>180           2173
         before_<=14             6
         before_>14,<=180       43
         before_>180           389
         index                   1
Unknown  after_<=14            146
         after_>14,<=180      1341
         after_>180          29678
         before_<=14            97
         before_>14,<=180      788
         before_>180          7576
         index                  37
mmol/L   after_<=14              4
         after_>14,<=180        60
         after_>180           1410
         before_<=14             8
         before_>14,<=180       26
         before_>180           205
         index                   1
Name: eid, dtype: int64

In [135]:
new_HDL.groupby(['value3_', 'event_type'])['eid'].nunique()

value3_  event_type      
MEA      after_<=14             7
         after_>14,<=180       71
         after_>180           319
         before_<=14            6
         before_>14,<=180      37
         before_>180          115
         index                  1
Unknown  after_<=14           141
         after_>14,<=180     1087
         after_>180          3742
         before_<=14           92
         before_>14,<=180     677
         before_>180         1815
         index                 36
mmol/L   after_<=14             4
         after_>14,<=180       49
         after_>180           270
         before_<=14            7
         before_>14,<=180      19
         before_>180           64
         index                  1
Name: eid, dtype: int64

### Secondary

In [58]:
# One frame per instance (value + date). .copy() detaches each from `sec` so
# the in-place cleaning and column assignments below do not act on a slice.
HDL1 = sec[['eid', 'HDL1', 'HDL1date']].copy()
HDL2 = sec[['eid', 'HDL2', 'HDL2date']].copy()

In [59]:
# Drop rows with a missing value or date. Reassigning (rather than
# inplace=True) avoids mutating a frame that was sliced out of `sec`.
HDL1 = HDL1.dropna(axis=0)
HDL2 = HDL2.dropna(axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [60]:
HDL1['HDL1date'] = pd.to_datetime(HDL1['HDL1date'])
HDL2['HDL2date'] = pd.to_datetime(HDL2['HDL2date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [61]:
HDL1.columns = ['eid', 'HDL', 'date']
HDL2.columns = ['eid', 'HDL', 'date']

In [62]:
# Stack both instances. pd.concat replaces DataFrame.append, which is
# deprecated since pandas 1.4 and removed in 2.0; the row index is kept as-is.
# NOTE: this overwrites HDL1, so re-running the cell stacks HDL2 again.
HDL1 = pd.concat([HDL1, HDL2])

In [63]:
HDL_sec = pd.merge(HDL1, first_event, on = 'eid', how = 'left')

In [64]:
HDL_sec['event_type'] = HDL_sec.apply(lambda x: get_event_type(x['date'], x['first_record_d']), axis=1)

In [65]:
HDL_sec.groupby(['event_type'])['eid'].count()

event_type
after_<=14             4
after_>14,<=180       44
after_>180          5104
before_<=14            1
before_>14,<=180       9
before_>180           10
Name: eid, dtype: int64

In [66]:
HDL_sec.groupby(['event_type'])['eid'].nunique()

event_type
after_<=14             4
after_>14,<=180       44
after_>180          5007
before_<=14            1
before_>14,<=180       9
before_>180           10
Name: eid, dtype: int64

## Creatinine

In [145]:
# Rows whose Read v2 or Read v3 code is one of the creatinine codes.
creat_code_mask = (
    pri['read_2'].isin(biomarker_codes['Creatinine']['read2'])
    | pri['read_3'].isin(biomarker_codes['Creatinine']['read3'])
)
# .copy() gives an independent frame, so the column assignments in the next
# cells do not write to a slice of `pri` (source of SettingWithCopyWarning).
creat = pri[creat_code_mask].copy()

In [146]:
# Maps spelling variants of unit strings to one canonical spelling; applied to
# the creatinine 'value3' column in the next cell.
replace_dict= {
    # NOTE(review): 'Âµmol/L' looks like 'µmol/L' decoded with the wrong
    # character encoding - confirm against the raw file.
    'Âµmol/L': 'umol/L',
    'µmol/L': 'umol/L', 
    'micmol/l': 'umol/L',
    'umol/l': 'umol/L',
    'mmol/24h': 'mmol/24hr',
    'mmol/volume': 'mmol/vol',
    # digit one in place of the letter l
    'm1/min': 'ml/min',
    'nmol/l': 'nmol/L',
    'mmol/l': 'mmol/L'
    }

In [147]:
creat['value3'] = creat['value3'].replace(replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [148]:
# For each row, collect the entries of value1/value2/value3 (in that order)
# that verify_float accepts.
creat['value'] = creat.apply(
    lambda row: [
        candidate
        for candidate in (row['value1'], row['value2'], row['value3'])
        if verify_float(candidate)
    ],
    axis=1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [149]:
# First accepted candidate, rounded to one decimal; NaN when the row had none.
creat['value'] = creat['value'].map(
    lambda candidates: np.nan if len(candidates) == 0 else round(float(candidates[0]), 1)
)

# Keep only rows that ended up with a usable value.
creat = creat[creat['value'].notnull()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [150]:
new_creat = pd.merge(creat, first_event, how = 'left', on = 'eid')

In [151]:
new_creat['event_dt'] = pd.to_datetime(new_creat['event_dt'])

In [190]:
new_creat['event_type'] = new_creat.apply(lambda x: get_event_type_creat(x['event_dt'], x['first_record_d']), axis=1)

In [191]:
new_creat['value3'].unique()

array([nan, 'MEA142', 'MEA000', 'umol/L', 'Unknown', 'mmol/L', 'MEA096'],
      dtype=object)

In [192]:
def get_creat_value3type(value3):
    """Collapse a raw ``value3`` entry into a coarse unit label.

    The entry is converted to ``str`` first. Anything containing ``'MEA'``
    becomes ``'MEA'``; ``'nan'``/``'Unknown'`` become ``'Unknown'``;
    ``'mmol/L'``/``'mmol/l'`` become ``'mmol/L'``; any other string is
    returned unchanged.
    """
    unit = str(value3)
    if 'MEA' in unit:
        return 'MEA'
    canonical = {
        'nan': 'Unknown',
        'Unknown': 'Unknown',
        'mmol/L': 'mmol/L',
        'mmol/l': 'mmol/L',
    }
    return canonical.get(unit, unit)
new_creat['value3_'] = new_creat['value3'].map(get_creat_value3type)

In [193]:
new_creat['value3_'].unique()

array(['Unknown', 'MEA', 'umol/L', 'mmol/L'], dtype=object)

In [194]:
new_creat.groupby(['value3_', 'event_type'])['eid'].count()

value3_  event_type     
MEA      after_<=14            16
         after_>14,<=90        86
         after_>90           5434
         before_<=14           20
         before_>14,<=90       56
         before_>90           938
         index                  5
Unknown  after_<=14           232
         after_>14,<=90      1125
         after_>90          53011
         before_<=14          167
         before_>14,<=90      696
         before_>90         14389
         index                 63
mmol/L   before_>90             1
umol/L   after_<=14            18
         after_>14,<=90        76
         after_>90           3700
         before_<=14           13
         before_>14,<=90       37
         before_>90           492
         index                  4
Name: eid, dtype: int64

In [195]:
new_creat.groupby(['value3_', 'event_type'])['eid'].nunique()

value3_  event_type     
MEA      after_<=14           15
         after_>14,<=90       62
         after_>90           376
         before_<=14          17
         before_>14,<=90      44
         before_>90          137
         index                 5
Unknown  after_<=14          217
         after_>14,<=90      873
         after_>90          3981
         before_<=14         154
         before_>14,<=90     577
         before_>90         2228
         index                58
mmol/L   before_>90            1
umol/L   after_<=14           10
         after_>14,<=90       46
         after_>90           308
         before_<=14          12
         before_>14,<=90      27
         before_>90           96
         index                 4
Name: eid, dtype: int64

### Secondary

In [67]:
# One frame per instance (value + date). .copy() detaches each from `sec` so
# the in-place cleaning and column assignments below do not act on a slice.
creat1 = sec[['eid', 'Creat1', 'Creat1date']].copy()
creat2 = sec[['eid', 'Creat2', 'Creat2date']].copy()

In [68]:
# Drop rows with a missing value or date. Reassigning (rather than
# inplace=True) avoids mutating a frame that was sliced out of `sec`.
creat1 = creat1.dropna(axis=0)
creat2 = creat2.dropna(axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [69]:
creat1['Creat1date'] = pd.to_datetime(creat1['Creat1date'])
creat2['Creat2date'] = pd.to_datetime(creat2['Creat2date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [70]:
creat1.columns = ['eid', 'creat', 'date']
creat2.columns = ['eid', 'creat', 'date']

In [71]:
# Stack both instances. pd.concat replaces DataFrame.append, which is
# deprecated since pandas 1.4 and removed in 2.0; the row index is kept as-is.
# NOTE: this overwrites creat1, so re-running the cell stacks creat2 again.
creat1 = pd.concat([creat1, creat2])

In [72]:
creat_sec = pd.merge(creat1, first_event, on = 'eid', how = 'left')

In [73]:
creat_sec['event_type'] = creat_sec.apply(lambda x: get_event_type_creat(x['date'], x['first_record_d']), axis=1)

In [76]:
creat_sec

Unnamed: 0,eid,creat,date,first_record_d,event_type
0,1000421,88.7,2017-09-07,2010-05-27,after_>90
1,1000530,76.4,2016-12-05,2012-03-26,after_>90
2,1001099,71.8,2016-10-14,2003-01-01,after_>90
3,1001327,79.7,2016-10-15,2016-03-28,after_>90
4,1001624,66.0,2016-03-05,2001-01-01,after_>90
...,...,...,...,...,...
5618,5860992,73.8,2016-05-19,2014-09-13,after_>90
5619,5876689,86.0,2016-09-23,2000-02-16,after_>90
5620,5897103,94.3,2016-03-15,2012-05-05,after_>90
5621,5949859,71.9,2016-05-28,2009-03-11,after_>90


In [74]:
creat_sec.groupby(['event_type'])['eid'].count()

event_type
after_<=14            4
after_>14,<=90       19
after_>90          5571
before_<=14           1
before_>14,<=90      10
before_>90           18
Name: eid, dtype: int64

In [75]:
creat_sec.groupby(['event_type'])['eid'].nunique()

event_type
after_<=14            4
after_>14,<=90       19
after_>90          5449
before_<=14           1
before_>14,<=90      10
before_>90           18
Name: eid, dtype: int64

## HbA1c

In [167]:
# Rows whose Read v2 or Read v3 code is one of the HbA1c codes.
hba1c_code_mask = (
    pri['read_2'].isin(biomarker_codes['Hba1c']['read2'])
    | pri['read_3'].isin(biomarker_codes['Hba1c']['read3'])
)
# .copy() gives an independent frame, so the column assignments in the next
# cells do not write to a slice of `pri` (source of SettingWithCopyWarning).
hba1c = pri[hba1c_code_mask].copy()

In [168]:
# For each row, collect the entries of value1/value2/value3 (in that order)
# that verify_float accepts.
hba1c['value'] = hba1c.apply(
    lambda row: [
        candidate
        for candidate in (row['value1'], row['value2'], row['value3'])
        if verify_float(candidate)
    ],
    axis=1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [169]:
# First accepted candidate, rounded to one decimal; NaN when the row had none.
hba1c['value'] = hba1c['value'].map(lambda x: round(float(x[0]),1) if len(x) > 0 else np.nan)

# Drop rows without a usable value. .copy() because the following cell assigns
# hba1c['event_dt'], which would otherwise write to a slice of the old frame.
hba1c = hba1c[hba1c['value'].notnull()].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [170]:
hba1c['event_dt'] = pd.to_datetime(hba1c['event_dt'])

In [171]:
new_hba1c = pd.merge(hba1c, first_event, how = 'left', on = 'eid')

In [172]:
replacehb_dict= {
    '%total Hb': '%',
    '% total Hb': '%', 
    '%Total Hb': '%'
    }

In [173]:
new_hba1c['value3'] = new_hba1c ['value3'].replace(replacehb_dict)

In [174]:
new_hba1c['event_type'] = new_hba1c.apply(lambda x: get_event_type(x['event_dt'], x['first_record_d']), axis=1)

In [175]:
new_hba1c['value3'].unique()

array([nan, 'MEA001', 'MEA215', '%', 'Unknown', 'HbA1c', 'MEA000',
       'mmol/mol', 'mol/l', 'MEA097'], dtype=object)

In [176]:
def get_hba1c_value3type(value3):
    """Collapse a raw ``value3`` entry into a coarse unit label.

    The entry is converted to ``str`` first. Anything containing ``'MEA'``
    becomes ``'MEA'``; ``'nan'``/``'Unknown'`` become ``'Unknown'``;
    ``'mmol/L'``/``'mmol/l'`` become ``'mmol/L'``; any other string (for
    example ``'%'`` or ``'mmol/mol'``) is returned unchanged.
    """
    text = str(value3)
    if text.count('MEA') > 0:
        return 'MEA'
    if text == 'nan' or text == 'Unknown':
        return 'Unknown'
    if text == 'mmol/L' or text == 'mmol/l':
        return 'mmol/L'
    return text
new_hba1c['value3_'] = new_hba1c['value3'].map(get_hba1c_value3type)

In [177]:
new_hba1c['value3_'].unique()

array(['Unknown', 'MEA', '%', 'HbA1c', 'mmol/mol', 'mol/l'], dtype=object)

In [178]:
new_hba1c.groupby(['value3_', 'event_type'])['eid'].count()

value3_   event_type      
%         after_>14,<=180       6
          after_>180          114
          before_>14,<=180      3
          before_>180          48
HbA1c     after_>180            5
          before_>180           4
MEA       after_>14,<=180       1
          after_>180           50
          before_<=14           1
          before_>14,<=180      2
          before_>180           8
Unknown   after_<=14            2
          after_>14,<=180      27
          after_>180          725
          before_<=14           1
          before_>14,<=180     23
          before_>180         259
mmol/mol  after_>180            1
mol/l     after_>180            1
Name: eid, dtype: int64

In [179]:
new_hba1c.groupby(['value3_', 'event_type'])['eid'].nunique()

value3_   event_type      
%         after_>14,<=180       5
          after_>180           38
          before_>14,<=180      2
          before_>180          12
HbA1c     after_>180            4
          before_>180           1
MEA       after_>14,<=180       1
          after_>180           18
          before_<=14           1
          before_>14,<=180      1
          before_>180           6
Unknown   after_<=14            2
          after_>14,<=180      20
          after_>180          142
          before_<=14           1
          before_>14,<=180     19
          before_>180          59
mmol/mol  after_>180            1
mol/l     after_>180            1
Name: eid, dtype: int64

### Secondary

In [180]:
# One frame per instance (value + date) with incomplete rows removed.
# Chaining dropna() and .copy() yields independent frames, so nothing below
# mutates or assigns into a slice of `sec` (source of SettingWithCopyWarning).
hba1c1 = sec[['eid', 'hba1c1', 'hba1c1date']].dropna(axis=0).copy()
hba1c2 = sec[['eid', 'hba1c2', 'hba1c2date']].dropna(axis=0).copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [181]:
# Rescale both instances with value / 10.929 + 2.15.
# NOTE(review): this looks like the IFCC (mmol/mol) -> DCCT (%) conversion;
# confirm the units of the source columns.
# The columns are overwritten with their own converted values, so running this
# cell a second time converts them again.
hba1c1['hba1c1'] = hba1c1['hba1c1'].div(10.929).add(2.15)
hba1c2['hba1c2'] = hba1c2['hba1c2'].div(10.929).add(2.15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [182]:
hba1c1['hba1c1date'] = pd.to_datetime(hba1c1['hba1c1date'])
hba1c2['hba1c2date'] = pd.to_datetime(hba1c2['hba1c2date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [183]:
hba1c1.columns = ['eid', 'hba1c', 'date']
hba1c2.columns = ['eid', 'hba1c', 'date']

In [184]:
# Stack both instances. pd.concat replaces DataFrame.append, which is
# deprecated since pandas 1.4 and removed in 2.0; the row index is kept as-is.
# NOTE: this overwrites hba1c1, so re-running the cell stacks hba1c2 again.
hba1c1 = pd.concat([hba1c1, hba1c2])

In [185]:
hba1c_sec = pd.merge(hba1c1, first_event, on = 'eid', how = 'left')

In [186]:
hba1c_sec['event_type'] = hba1c_sec.apply(lambda x: get_event_type(x['date'], x['first_record_d']), axis=1)

In [187]:
hba1c_sec.groupby(['event_type'])['eid'].count()

event_type
after_<=14             4
after_>14,<=180       44
after_>180          5104
before_<=14            1
before_>14,<=180       9
before_>180           10
Name: eid, dtype: int64

In [188]:
hba1c_sec.groupby(['event_type'])['eid'].nunique()

event_type
after_<=14             4
after_>14,<=180       44
after_>180          5007
before_<=14            1
before_>14,<=180       9
before_>180           10
Name: eid, dtype: int64