# Genentech Cervical Cancer - Big Table Merge

https://www.kaggle.com/c/cervical-cancer-screening/

In [1]:
# imports
import sys # for stderr
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# settings 
%logstop
%logstart  -o 'ipynb.log' rotate
plt.style.use('ggplot')
# constants
# plt.rcParams['figure.figsize'] = (10.0, 10.0)
# pd.set_option('display.max_rows', 50)
# pd.set_option('display.max_columns', 50)

Logging hadn't been started.
Activating auto-logging. Current session state plus future input saved.
Filename       : ipynb.log
Mode           : rotate
Output logging : True
Raw input log  : False
Timestamping   : False
State          : active


In [3]:
# versions 
import sys
print(pd.datetime.now())
print('Python: '+sys.version)
print('numpy: '+np.__version__)
print('pandas: '+pd.__version__)
print('sklearn: '+skl.__version__)

2016-01-23 17:46:02.310811
Python: 2.7.11 |Anaconda 2.4.0 (x86_64)| (default, Dec  6 2015, 18:57:58) 
[GCC 4.2.1 (Apple Inc. build 5577)]
numpy: 1.10.2
pandas: 0.17.1
sklearn: 0.17


## Load Train/Test

In [4]:
train_file = './input/patients_train.csv.gz'
train = pd.read_csv(train_file)
train.drop('patient_gender', axis = 1, inplace = True )

In [5]:
train.shape

(1476637, 7)

In [6]:
train.set_index('patient_id', inplace=True)
train[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1
373337412,21-23,LA,ALL OTHER,UNKNOWN,UNKNOWN,1
368845412,12-14,OH,ALL OTHER,UNKNOWN,UNKNOWN,0


In [7]:
train_exclude = pd.read_csv('./input/train_patients_to_exclude.csv', header=None, names=['patient_id'])
train.drop(train_exclude.patient_id, inplace=True)
train.shape

(1157817, 6)

In [8]:
test_file = './input/patients_test.csv.gz'
test = pd.read_csv(test_file)
test.drop( 'patient_gender', axis = 1, inplace = True )

In [9]:
test.set_index('patient_id', inplace=True)
test[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
148341312,66-68,TX,ALL OTHER,UNKNOWN,UNKNOWN
130010912,45-47,IN,ALL OTHER,UNKNOWN,UNKNOWN
124284812,15-17,LA,ALL OTHER,UNKNOWN,UNKNOWN


In [10]:
test_exclude = pd.read_csv('./input/test_patients_to_exclude.csv', header=None, names=['patient_id'])
test.drop(test_exclude.patient_id, inplace=True)
test.shape

(1701813, 5)

## Load Features

In [11]:
def fmerge(df, filename):
    feature = pd.read_csv(filename)
    feature.set_index('patient_id', inplace=True)
    return pd.merge(df, feature, left_index=True, right_index=True, how ='left')

def fjoin(df, filename, on_field):
    feature = pd.read_csv(filename)
    feature.set_index(on_field, inplace=True)
    return df.join(feature, on=on_field)    

In [12]:
# Train Files
fdir = './features/'
visits = fdir+'visits.csv.gz'
train_diagnosis_cbsa_counts = fdir+'train_diagnosis_cbsa_counts.csv.gz'
train_patient_cbsa = fdir+'train_patient_cbsa.csv.gz'
train_procedure_counts = fdir+'train_procedure_counts.csv.gz'
train_surgical_claim_type = fdir+'train_surgical_claim_type.csv.gz'
train_surgical_place_of_service = fdir+'train_surgical_place_of_service.csv.gz'
train_surgical_primary_physician_role = fdir+'train_surgical_primary_physician_role.csv.gz'
train_surgical_procedure_type_code = fdir+'train_surgical_procedure_type_code.csv.gz'

In [13]:
train_features = [
    visits, 
    train_patient_cbsa,
    train_procedure_counts,
    train_surgical_claim_type,
    train_surgical_place_of_service,
    train_surgical_primary_physician_role,
    train_surgical_procedure_type_code
]

In [14]:
train[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0
767144212,27-29,NY,ALL OTHER,UNKNOWN,UNKNOWN,1


In [15]:
for f in train_features:
    train = fmerge(train, f)

In [16]:
import gc
gc.collect()

204

In [17]:
train.shape

(1157817, 32)

In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1157817 entries, 336201912 to 222924424
Data columns (total 32 columns):
patient_age_group    1157817 non-null object
patient_state        1157817 non-null object
ethinicity           1157817 non-null object
household_income     1157817 non-null object
education_level      1157817 non-null object
is_screener          1157817 non-null int64
visits               1157817 non-null int64
cbsa                 1156893 non-null float64
num_visits           1156893 non-null float64
num_procedures       1157817 non-null int64
HX                   488279 non-null float64
CLINIC               488279 non-null float64
INPATIENT            488279 non-null float64
OTHER                488279 non-null float64
OUTPATIENT           488279 non-null float64
UNKNOWN              488279 non-null float64
ATG                  488279 non-null float64
OPR                  488279 non-null float64
OTH                  488279 non-null float64
None                 48

In [19]:
train[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,0003,0004,0005,0006,HX01,HX02,HX03,HX04,HX05,HXPR
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,,,,,,,,,,
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,,,,,,,,,,
767144212,27-29,NY,ALL OTHER,UNKNOWN,UNKNOWN,1,2,35620,43,293,...,0.0,0.0,0.0,0.0,6.0,4.0,0.0,0.0,0.0,14.0


In [20]:
# Test Files
fdir = './features/'
visits = fdir+'visits.csv.gz'
test_diagnosis_cbsa_counts = fdir+'test_diagnosis_cbsa_counts.csv.gz'
test_patient_cbsa = fdir+'test_patient_cbsa.csv.gz'
test_procedure_counts = fdir+'test_procedure_counts.csv.gz'
test_surgical_claim_type = fdir+'test_surgical_claim_type.csv.gz'
test_surgical_place_of_service = fdir+'test_surgical_place_of_service.csv.gz'
test_surgical_primary_physician_role = fdir+'test_surgical_primary_physician_role.csv.gz'
test_surgical_procedure_type_code = fdir+'test_surgical_procedure_type_code.csv.gz'

In [21]:
test_features = [
    visits, 
    test_patient_cbsa,
    test_procedure_counts,
    test_surgical_claim_type,
    test_surgical_place_of_service,
    test_surgical_primary_physician_role,
    test_surgical_procedure_type_code
]

In [22]:
for f in test_features:
    test = fmerge(test, f)

In [23]:
test.shape

(1701813, 31)

In [24]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1701813 entries, 148341312 to 204245024
Data columns (total 31 columns):
patient_age_group    object
patient_state        object
ethinicity           object
household_income     object
education_level      object
visits               int64
cbsa                 float64
num_visits           float64
num_procedures       int64
HX                   float64
CLINIC               float64
INPATIENT            float64
OTHER                float64
OUTPATIENT           float64
UNKNOWN              float64
ATG                  float64
OPR                  float64
OTH                  float64
None                 float64
0001                 float64
0002                 float64
0003                 float64
0004                 float64
0005                 float64
0006                 float64
HX01                 float64
HX02                 float64
HX03                 float64
HX04                 float64
HX05                 float64
HXPR            

In [25]:
test[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,visits,cbsa,num_visits,num_procedures,HX,...,0003,0004,0005,0006,HX01,HX02,HX03,HX04,HX05,HXPR
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
148341312,66-68,TX,ALL OTHER,UNKNOWN,UNKNOWN,144,19100,11,135,,...,,,,,,,,,,
130010912,45-47,IN,ALL OTHER,UNKNOWN,UNKNOWN,92,26900,8,85,,...,,,,,,,,,,
103994412,27-29,CA,ALL OTHER,UNKNOWN,UNKNOWN,168,31080,37,417,,...,,,,,,,,,,
318658812,27-29,TN,ALL OTHER,UNKNOWN,UNKNOWN,72,32820,14,150,16.0,...,0.0,0.0,0.0,0.0,6.0,2.0,2.0,0.0,0.0,6.0


## feature counts

In [26]:
diagnosis_cbsa_count_train = pd.read_csv(fdir+'diagnosis_cbsa_count_train.csv.gz')
diagnosis_cbsa_count_train[:3]

Unnamed: 0,patient_id,cbsa,count
0,84548607,18620,1
1,84548607,19100,1
2,84548607,46100,1


In [27]:
train_diagnosis_count = diagnosis_cbsa_count_train.groupby('patient_id')['count'].sum()

In [28]:
train_diagnosis_count.name = 'num_diagnosis'
train_diagnosis_count = pd.DataFrame(train_diagnosis_count)
train_diagnosis_count[:2]

Unnamed: 0_level_0,num_diagnosis
patient_id,Unnamed: 1_level_1
84548607,20
84548626,11


In [29]:
train = pd.merge(train, train_diagnosis_count, left_index=True, right_index=True, how ='left')

In [30]:
train[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,0004,0005,0006,HX01,HX02,HX03,HX04,HX05,HXPR,num_diagnosis
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,,,,,,,,,,14
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,21
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,,,,,,,,,,6


In [31]:
diagnosis_cbsa_count_test = pd.read_csv(fdir+'diagnosis_cbsa_count_test.csv.gz')
diagnosis_cbsa_count_test[:3]

Unnamed: 0,patient_id,cbsa,count
0,84548780,21500,1
1,84548780,26180,18
2,84548841,11460,1


In [32]:
test_diagnosis_count = diagnosis_cbsa_count_test[['patient_id','count']].groupby('patient_id')['count'].sum()
test_diagnosis_count.name = 'num_diagnosis'

In [33]:
test['num_diagnosis'] = test_diagnosis_count

In [34]:
test[:2]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,visits,cbsa,num_visits,num_procedures,HX,...,0004,0005,0006,HX01,HX02,HX03,HX04,HX05,HXPR,num_diagnosis
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
148341312,66-68,TX,ALL OTHER,UNKNOWN,UNKNOWN,144,19100,11,135,,...,,,,,,,,,,11
130010912,45-47,IN,ALL OTHER,UNKNOWN,UNKNOWN,92,26900,8,85,,...,,,,,,,,,,8


In [35]:
gc.collect()

602

## surgical_count

In [36]:
surgical_procedure_type_code_counts_train = pd.read_csv(fdir+'surgical_procedure_type_code_counts_train.csv.gz')
surgical_procedure_type_code_counts_test = pd.read_csv(fdir+'surgical_procedure_type_code_counts_test.csv.gz')

In [37]:
surgical_procedure_type_code_counts_train[:2]

Unnamed: 0,patient_id,procedure_type_code,proc_type_count
0,84548805,HXPR,2
1,84549017,HXPR,2


In [38]:
train_surgical_count = surgical_procedure_type_code_counts_train[['patient_id','proc_type_count']].groupby('patient_id')['proc_type_count'].sum()

In [39]:
train['num_surgical'] = train_surgical_count

In [40]:
test_surgical_count = surgical_procedure_type_code_counts_test[['patient_id','proc_type_count']].groupby('patient_id')['proc_type_count'].sum()

In [41]:
test['num_surgical'] = test_surgical_count

In [42]:
rx_payment = pd.read_csv(fdir+'rx_payment.csv.gz')
rx_payment[:5]

Unnamed: 0,patient_id,payment
0,84548607,COMMERCIAL
1,84548626,CASH
2,84548626,COMMERCIAL
3,84548666,COMMERCIAL
4,84548780,CASH


In [43]:
rx_count = rx_payment.groupby('patient_id').payment.count()

In [44]:
rx_count[:10]

patient_id
84548607    1
84548626    2
84548666    1
84548780    3
84548805    3
84548821    2
84548841    2
84548915    3
84549017    5
84549024    2
Name: payment, dtype: int64

In [45]:
train['num_rx'] = rx_count
test['num_rx'] = rx_count

In [46]:
rx_pivot = rx_payment.pivot(index='patient_id', columns='payment', values='payment')

In [47]:
rx_pivot.columns = ['RX_ASSISTANCE','RX_CASH','RX_COMMERCIAL','RX_MANAGED_MEDICAID','RX_MEDICAID','RX_MEDICARE']

In [48]:
rx_pivot[:10]

Unnamed: 0_level_0,RX_ASSISTANCE,RX_CASH,RX_COMMERCIAL,RX_MANAGED_MEDICAID,RX_MEDICAID,RX_MEDICARE
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84548607,,,COMMERCIAL,,,
84548626,,CASH,COMMERCIAL,,,
84548666,,,COMMERCIAL,,,
84548780,,CASH,COMMERCIAL,"MANAGED MEDICAID""""",,
84548805,,CASH,COMMERCIAL,"MANAGED MEDICAID""""",,
84548821,,CASH,COMMERCIAL,,,
84548841,,CASH,COMMERCIAL,,,
84548915,,,COMMERCIAL,"MANAGED MEDICAID""""",MEDICAID,
84549017,,CASH,COMMERCIAL,"MANAGED MEDICAID""""",MEDICAID,MEDICARE
84549024,,CASH,COMMERCIAL,,,


In [49]:
rx_plans = pd.get_dummies(rx_pivot)

In [50]:
rx_plans.columns = ['RX_ASSISTANCE','RX_CASH','RX_COMMERCIAL','RX_MANAGED_MEDICAID','RX_MEDICAID','RX_MEDICARE']

In [51]:
rx_plans[:10]

Unnamed: 0_level_0,RX_ASSISTANCE,RX_CASH,RX_COMMERCIAL,RX_MANAGED_MEDICAID,RX_MEDICAID,RX_MEDICARE
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84548607,0,0,1,0,0,0
84548626,0,1,1,0,0,0
84548666,0,0,1,0,0,0
84548780,0,1,1,1,0,0
84548805,0,1,1,1,0,0
84548821,0,1,1,0,0,0
84548841,0,1,1,0,0,0
84548915,0,0,1,1,1,0
84549017,0,1,1,1,1,1
84549024,0,1,1,0,0,0


In [52]:
train = pd.merge(train, rx_plans, left_index=True, right_index=True, how ='left')

In [53]:
test = pd.merge(test, rx_plans, left_index=True, right_index=True, how ='left')

In [54]:
train[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,HXPR,num_diagnosis,num_surgical,num_rx,RX_ASSISTANCE,RX_CASH,RX_COMMERCIAL,RX_MANAGED_MEDICAID,RX_MEDICAID,RX_MEDICARE
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,,14,,2,0,1,1,0,0,0
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,2.0,21,4.0,2,0,1,1,0,0,0
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,,6,,2,0,1,1,0,0,0


In [55]:
train.shape, test.shape

((1157817, 41), (1701813, 40))

In [56]:
train.columns

Index([u'patient_age_group', u'patient_state', u'ethinicity',
       u'household_income', u'education_level', u'is_screener', u'visits',
       u'cbsa', u'num_visits', u'num_procedures', u'HX', u'CLINIC',
       u'INPATIENT', u'OTHER', u'OUTPATIENT', u'UNKNOWN', u'ATG', u'OPR',
       u'OTH', u'None', u'0001', u'0002', u'0003', u'0004', u'0005', u'0006',
       u'HX01', u'HX02', u'HX03', u'HX04', u'HX05', u'HXPR', u'num_diagnosis',
       u'num_surgical', u'num_rx', u'RX_ASSISTANCE', u'RX_CASH',
       u'RX_COMMERCIAL', u'RX_MANAGED_MEDICAID', u'RX_MEDICAID',
       u'RX_MEDICARE'],
      dtype='object')

In [57]:
gc.collect()

629

## pract_screen_pct

In [85]:
patient_pract = pd.read_csv(fdir+'diagnosis_patient_practitioner_train.csv.gz')
patient_pract.set_index('patient_id', inplace=True)
patient_pract[:2]

Unnamed: 0_level_0,primary_practitioner_id,cbsa
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84548607,12847096,46340
84548607,12923026,46340


In [86]:
patient_pract.shape

(25364706, 2)

In [59]:
prime_pract = pd.merge(pd.DataFrame(train.is_screener), patient_pract, left_index=True, right_index=True, how='left')
prime_pract.drop(['cbsa'], axis=1, inplace=True)
prime_pract[:3]

Unnamed: 0_level_0,is_screener,primary_practitioner_id
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84548607,0,12847096
84548607,0,12923026
84548607,0,12930342


In [60]:
prime_pract.reset_index(inplace=True)
prime_pract.drop('patient_id', axis=1, inplace=True)

In [63]:
prime_pract = prime_pract.sort_values(by='primary_practitioner_id')
prime_pract[:5]

Unnamed: 0,is_screener,primary_practitioner_id
11820982,1,12468727
18167464,1,12468727
24566777,1,12468727
19494255,1,12468727
19802299,1,12469219


In [64]:
pract_g = prime_pract.groupby('primary_practitioner_id')
ppp = pd.DataFrame(pract_g.is_screener.mean())  # primary practioner percentage
ppp.iloc[:5]

Unnamed: 0_level_0,is_screener
primary_practitioner_id,Unnamed: 1_level_1
12468727,1.0
12469219,0.333333
12469795,1.0
12470070,1.0
12470221,1.0


In [65]:
patient_prime = pd.merge(patient_pract, ppp, left_on='primary_practitioner_id', right_index=True, how='left')
patient_prime.iloc[:5]

Unnamed: 0_level_0,primary_practitioner_id,cbsa,is_screener
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84548607,12847096,46340,0.494
84548607,12923026,46340,0.534884
84548607,12930342,46100,0.437838
84548607,12993738,46340,0.463895
84548607,13001412,46340,0.453581


In [67]:
patient_prime_screen_pct = pd.DataFrame(patient_prime.groupby(level=0).is_screener.max())
patient_prime_screen_pct.columns=['pract_screen_pct']
patient_prime_screen_pct.iloc[:5]

Unnamed: 0_level_0,pract_screen_pct
patient_id,Unnamed: 1_level_1
84548607,0.647059
84548626,0.965714
84548666,0.877778
84548805,1.0
84548821,0.97426


In [72]:
train['pract_screen_pct'] = patient_prime_screen_pct.pract_screen_pct

In [68]:
test_patient_pract = pd.read_csv(fdir+'diagnosis_patient_practitioner_test.csv.gz')
test_patient_pract.set_index('patient_id', inplace=True)
test_patient_pract[:2]

Unnamed: 0_level_0,primary_practitioner_id,cbsa
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84548780,12755099,26180
84548780,14102147,26180


In [69]:
test_patient_prime = pd.merge(test_patient_pract, ppp, left_on='primary_practitioner_id', right_index=True, how='left')
test_patient_prime.iloc[:5]

Unnamed: 0_level_0,primary_practitioner_id,cbsa,is_screener
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84548780,12755099,26180,0.476636
84548780,14102147,26180,0.490196
84548780,14142454,26180,0.934579
84548780,14357789,26180,0.465517
84548780,16955346,26180,0.503876


In [138]:
test_patient_prime_screen_pct = pd.DataFrame(test_patient_prime.groupby(level=0).is_screener.max())
test_patient_prime_screen_pct.columns=['pract_screen_pct']
test_patient_prime_screen_pct.iloc[:5]

Unnamed: 0_level_0,pract_screen_pct
patient_id,Unnamed: 1_level_1
84548780,1.0
84548841,1.0
84548915,1.0
84549024,0.75
84549114,0.888889


In [139]:
test['pract_screen_pct'] = test_patient_prime_screen_pct.pract_screen_pct

## cbsa_screen_pct

In [94]:
patient_cbsas = pd.merge(pd.DataFrame(train.is_screener), patient_pract, left_index=True, right_index=True, how='left')
patient_cbsas[:3]

Unnamed: 0_level_0,is_screener,primary_practitioner_id,cbsa
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84548607,0,12847096,46340
84548607,0,12923026,46340
84548607,0,12930342,46100


In [95]:
patient_cbsas.reset_index(inplace=True)
patient_cbsas.drop(['patient_id','primary_practitioner_id'], axis=1, inplace=True)
patient_cbsas[:3]

Unnamed: 0,is_screener,cbsa
0,0,46340
1,0,46340
2,0,46100


In [96]:
cbsa_g = patient_cbsas.groupby('cbsa')
cbsa_pct = pd.DataFrame(cbsa_g.is_screener.mean())  # cbsa percentage screened at that location
cbsa_pct.iloc[:5]

Unnamed: 0_level_0,is_screener
cbsa,Unnamed: 1_level_1
10100,0.59273
10140,0.335737
10180,0.390896
10220,0.571952
10260,0.369565


In [99]:
cbsa_pct.columns = ['cbsa_pct']

In [102]:
train = pd.merge(train, cbsa_pct, left_on='cbsa', right_index=True, how='left')

In [103]:
test = pd.merge(test, cbsa_pct, left_on='cbsa', right_index=True, how='left')

## age_pct

In [177]:
# age_pct = train[['patient_age_group','is_screener']].groupby('patient_age_group').is_screener.mean()
# age_pct

In [181]:
age_pct_file = fdir+'age_pct.csv'

In [190]:
train = fjoin(train, age_pct_file, 'patient_age_group')

In [195]:
test = fjoin(test, age_pct_file, 'patient_age_group')

##  diagnosis_code features

In [None]:
# from sqlalchemy import create_engine
# engine = create_engine('postgresql://paulperry:@localhost:5432/ccancer') 

In [None]:
# q1 = "select t1.patient_id, diagnosis_code from diagnosis t1 \
#     right join patients_train t2 on (t1.patient_id=t2.patient_id) where diagnosis_code in ('632','650')"

In [None]:
# diagf = pd.read_sql_query(q1, engine)

In [220]:
train_key_diagnosis = pd.read_csv(fdir+'train_key_diagnosis.csv.gz')
train_key_d = pd.crosstab(train_key_diagnosis.patient_id,train_key_diagnosis.diagnosis_code) 
train_key_d[:5]

diagnosis_code,632,650
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84552398,0,1
84553164,1,0
84553713,0,1
84554799,2,2
84557447,1,0


In [223]:
# spot check this
# train_key_diagnosis[train_key_diagnosis.patient_id == 84554799]

In [228]:
# set dummies
train_key_d[train_key_d > 0] = 1

In [232]:
train = pd.merge(train, train_key_d, left_index=True, right_index=True, how='left')
train[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,RX_CASH,RX_COMMERCIAL,RX_MANAGED_MEDICAID,RX_MEDICAID,RX_MEDICARE,pract_screen_pct,cbsa_pct,age_pct,632,650
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,1,1,0,0,0,0.851852,0.603501,0.538345,,
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,1,1,0,0,0,1.0,0.705413,0.624289,1.0,0.0
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,1,1,0,0,0,0.777778,0.442985,0.718529,,
767144212,27-29,NY,ALL OTHER,UNKNOWN,UNKNOWN,1,2,35620,43,293,...,0,0,1,0,0,1.0,0.730394,0.703938,1.0,1.0


In [233]:
test_key_diagnosis = pd.read_csv(fdir+'test_key_diagnosis.csv.gz')
test_key_d = pd.crosstab(test_key_diagnosis.patient_id,test_key_diagnosis.diagnosis_code) 
test_key_d[:5]

diagnosis_code,632,650
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84548780,0,1
84549024,0,4
84549728,0,2
84549953,0,4
84550351,0,3


In [234]:
# set dummies
test_key_d[test_key_d > 0] = 1

In [235]:
test = pd.merge(test, test_key_d, left_index=True, right_index=True, how='left')

## procedure_code features

In [238]:
train_key_procedure = pd.read_csv(fdir+'train_key_procedure.csv.gz')
train_key_p = pd.crosstab(train_key_procedure.patient_id,train_key_procedure.procedure_code) 
train_key_p[:2]

procedure_code,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
84553111,0,0,0,1,0,0,0,0,0
84553567,1,0,0,0,0,0,0,0,0


In [239]:
# set dummies
train_key_p[train_key_p > 0] = 1

In [241]:
train = pd.merge(train, train_key_p, left_index=True, right_index=True, how='left')
train[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,650,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,,,,,,,,,,
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,0.0,,,,,,,,,
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,,,,,,,,,,
767144212,27-29,NY,ALL OTHER,UNKNOWN,UNKNOWN,1,2,35620,43,293,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [242]:
test_key_procedure = pd.read_csv(fdir+'test_key_procedure.csv.gz')
test_key_p = pd.crosstab(test_key_procedure.patient_id,test_key_procedure.procedure_code) 
test_key_p[:2]

procedure_code,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
84549453,0,1,0,0,0,0,0,0,0
84549821,0,1,0,0,0,0,0,0,0


In [239]:
# set dummies
test_key_p[test_key_p > 0] = 1

In [243]:
test = pd.merge(test, test_key_p, left_index=True, right_index=True, how='left')
test[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,visits,cbsa,num_visits,num_procedures,HX,...,650,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
148341312,66-68,TX,ALL OTHER,UNKNOWN,UNKNOWN,144,19100,11,135,,...,,,,,,,,,,
130010912,45-47,IN,ALL OTHER,UNKNOWN,UNKNOWN,92,26900,8,85,,...,,,,,,,,,,
103994412,27-29,CA,ALL OTHER,UNKNOWN,UNKNOWN,168,31080,37,417,,...,,,,,,,,,,
318658812,27-29,TN,ALL OTHER,UNKNOWN,UNKNOWN,72,32820,14,150,16.0,...,,,,,,,,,,


## diagnosis HPV

In [261]:
diagnosis_hpv = pd.read_csv('./features/diagnosis_hpv.csv.gz')
diagnosis_hpv.set_index('patient_id', inplace=True)
diagnosis_hpv.columns = ['diagnosis_hpv']
diagnosis_hpv[:2]

Unnamed: 0_level_0,diagnosis_hpv
patient_id,Unnamed: 1_level_1
100261044,795
184622197,795


In [262]:
procedure_hpv = pd.read_csv('./features/procedure_hpv.csv.gz')
procedure_hpv.set_index('patient_id', inplace=True)
procedure_hpv.columns = ['procedure_hpv']
procedure_hpv[:2]

Unnamed: 0_level_0,procedure_hpv
patient_id,Unnamed: 1_level_1
538514019,90649
247804914,90649


In [263]:
surgical_pap = pd.read_csv('./features/surgical_pap.csv.gz')
surgical_pap.set_index('patient_id', inplace=True)
surgical_pap.columns = ['surgical_pap']
surgical_pap[:2]

Unnamed: 0_level_0,surgical_pap
patient_id,Unnamed: 1_level_1
372429831,9146
114564458,9146


In [264]:
train = pd.merge(train, diagnosis_hpv, left_index=True, right_index=True, how='left')
train = pd.merge(train, procedure_hpv, left_index=True, right_index=True, how='left')
train = pd.merge(train, surgical_pap, left_index=True, right_index=True, how='left')

In [265]:
train[:5]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,57455,57456,81252,90696,G0143,S4020,S4023,diagnosis_hpv,procedure_hpv,surgical_pap
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84548607,63-65,TX,CAUCASIAN,<=$49K,SOME COLLEGE,0,10,46340,17,263,...,,,,,,,,,,
84548626,45-47,PA,CAUCASIAN,$100K+,HIGH SCHOOL OR LESS,1,138,23900,7,80,...,,,,,,,,,,
84548666,57-59,CA,CAUCASIAN,$100K+,SOME COLLEGE,1,138,21700,3,73,...,,,,,,,,,,
84548805,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,1,134,31080,19,235,...,,,,,,,,,90649.0,
84548805,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,1,134,31080,19,235,...,,,,,,,,,90649.0,


In [267]:
test = pd.merge(test, diagnosis_hpv, left_index=True, right_index=True, how='left')
test = pd.merge(test, procedure_hpv, left_index=True, right_index=True, how='left')
test = pd.merge(test, surgical_pap, left_index=True, right_index=True, how='left')

## Output

In [275]:
train.columns

Index([  u'patient_age_group',       u'patient_state',          u'ethinicity',
          u'household_income',     u'education_level',         u'is_screener',
                    u'visits',                u'cbsa',          u'num_visits',
            u'num_procedures',                  u'HX',              u'CLINIC',
                 u'INPATIENT',               u'OTHER',          u'OUTPATIENT',
                   u'UNKNOWN',                 u'ATG',                 u'OPR',
                       u'OTH',                u'None',                u'0001',
                      u'0002',                u'0003',                u'0004',
                      u'0005',                u'0006',                u'HX01',
                      u'HX02',                u'HX03',                u'HX04',
                      u'HX05',                u'HXPR',       u'num_diagnosis',
              u'num_surgical',              u'num_rx',       u'RX_ASSISTANCE',
                   u'RX_CASH',       u'RX_COMMERCIAL

In [276]:
train.to_csv('./features/train_big_table.csv')

In [277]:
test.to_csv('./features/test_big_table.csv')

In [278]:
train_encoded = train.copy()

In [279]:
# patient_age encode
patient_age_dict = \
{
'24-26': 1,
'27-29': 2,
'30-32': 3,
'36-38': 4,
'39-41': 5,
'42-44': 6,
'45-47': 7,
'48-50': 8,
'33-35': 9,
'51-53': 10,
'54-56': 11,
'57-59': 12,
'60-62': 13,
'63-65': 14,
'66-68': 15,
'69-71': 16
}

train_encoded.patient_age_group  = [ patient_age_dict[i]  for i in train_encoded.patient_age_group.values ]

In [280]:
household_income_dict = {'UNKNOWN': 0,  '<=$49K': 1, '<$50-99K': 2, '$100K+': 3}
train_encoded.household_income  = [ household_income_dict[i]  for i in train_encoded.household_income.values ]

In [281]:
from sklearn.preprocessing import LabelEncoder
le_patient_state = LabelEncoder().fit(train.patient_state.values)
le_ethinicity    = LabelEncoder().fit(train.ethinicity.values)
le_education_level = LabelEncoder().fit(train.education_level.values)
train_encoded.patient_state = le_patient_state.transform(train.patient_state.values)
train_encoded.ethinicity    = le_ethinicity.transform(train.ethinicity.values)
train_encoded.education_level = le_education_level.transform(train.education_level.values)

In [282]:
train_encoded[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,57455,57456,81252,90696,G0143,S4020,S4023,diagnosis_hpv,procedure_hpv,surgical_pap
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84548607,14,43,2,1,2,0,10,46340,17,263,...,,,,,,,,,,
84548626,7,38,2,3,1,1,138,23900,7,80,...,,,,,,,,,,
84548666,12,4,2,3,2,1,138,21700,3,73,...,,,,,,,,,,


In [283]:
train_encoded.to_csv('./features/train_big_table_encoded.csv')

In [284]:
test_encoded = test.copy()

In [285]:
test_encoded.patient_age_group  = [ patient_age_dict[i]  for i in test_encoded.patient_age_group.values ]
test_encoded.household_income  = [ household_income_dict[i]  for i in test_encoded.household_income.values ]

In [286]:
test_encoded.patient_state = le_patient_state.transform(test.patient_state.values)
test_encoded.ethinicity    = le_ethinicity.transform(test.ethinicity.values)
test_encoded.education_level = le_education_level.transform(test.education_level.values)

In [287]:
test_encoded[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,visits,cbsa,num_visits,num_procedures,HX,...,57455,57456,81252,90696,G0143,S4020,S4023,diagnosis_hpv,procedure_hpv,surgical_pap
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84548780,8,11,3,1,1,124,26180,18,174,,...,,,,,,,,,,
84548841,11,48,2,3,2,166,20740,16,82,,...,,,,,,,,,,
84548915,7,35,2,1,1,148,49660,32,571,14.0,...,,,,,,,,,,


In [288]:
test_encoded.to_csv('./features/test_big_table_encoded.csv')

In [289]:
gc.collect()

678

# DONE !!!