# Identification of doctors as early prescribers

Determine whether early adopters are associated with greater prescription usage of drugs

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

### Read in Imbruvica Data

In [78]:
# Load the tab-delimited Part D prescriber (NPI-level) file.
npi = pd.read_csv('data/PartD_Prescriber_PUF_NPI_16_100.txt', sep='\t')

# Load the Imbruvica prescriber summary (tab-delimited despite the .csv extension).
imbruvica = pd.read_csv('data/imbruvica_prescrib_npi_summary.csv', sep='\t')

# Fields that appear a second time in the summary under a '.1' suffix.
# NOTE(review): presumably these repeats come from how the summary was assembled
# (pandas appends '.1' to a repeated header name on read) -- confirm upstream.
duplicated_fields = [
    'npi',
    'nppes_provider_last_org_name',
    'nppes_provider_first_name',
    'nppes_provider_city',
    'nppes_provider_state',
    'specialty_description',
    'description_flag',
    'total_claim_count',
    'total_30_day_fill_count',
    'total_drug_cost',
    'total_day_supply',
    'bene_count',
    'ge65_suppress_flag',
    'total_claim_count_ge65',
    'total_30_day_fill_count_ge65',
    'total_drug_cost_ge65',
    'total_day_supply_ge65',
    'bene_count_ge65_suppress_flag',
    'bene_count_ge65',
]
duplicate_cols = [field + '.1' for field in duplicated_fields]

# Remove the suffixed copies, keeping the un-suffixed originals.
imbruvica = imbruvica.drop(labels=duplicate_cols, axis='columns')
imbruvica.head()

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_city,nppes_provider_state,specialty_description,description_flag,drug_name,generic_name,bene_count,...,beneficiary_male_count,beneficiary_race_white_count,beneficiary_race_black_count,beneficiary_race_asian_pi_count,beneficiary_race_hispanic_count,beneficiary_race_nat_ind_count,beneficiary_race_other_count,beneficiary_nondual_count,beneficiary_dual_count,beneficiary_average_risk_score
0,1033101068,MARQUES BIBILONI,JOSE,SANTURCE,PR,Hematology-Oncology,S,IMBRUVICA,IBRUTINIB,,...,38.0,0.0,,0.0,,0.0,0.0,82.0,0.0,2.3657
1,1114922432,OYOLA,RAUL,MARIETTA,GA,Hematology-Oncology,S,IMBRUVICA,IBRUTINIB,,...,89.0,196.0,38.0,,,0.0,,215.0,37.0,1.9098
2,1073841938,BRAY,LISA,MILWAUKEE,WI,Nurse Practitioner,S,IMBRUVICA,IBRUTINIB,,...,57.0,,,,,0.0,,134.0,21.0,2.0907
3,1194784207,WEINSHEL,ERIC,EDINA,MN,Medical Oncology,S,IMBRUVICA,IBRUTINIB,,...,67.0,,,,0.0,,,168.0,12.0,1.9519
4,1134116262,TAI,MUHAMMAD,REXFORD,NY,Medical Oncology,S,IMBRUVICA,IBRUTINIB,,...,80.0,206.0,,,,0.0,,172.0,60.0,1.9718


In [58]:
# Drop all columns except for npi and total_claim_count
keep_cols = ['npi', 'total_claim_count']
is_extra = ~imbruvica.columns.isin(keep_cols)
imbruvica.drop(imbruvica.columns[is_extra], axis=1, inplace=True)

imbruvica.head()

Unnamed: 0,npi,total_claim_count
0,1033101068,12
1,1114922432,11
2,1073841938,26
3,1194784207,11
4,1134116262,27


In [59]:
# Flag physicians whose Imbruvica claim count is at or above the 75th percentile
# of all physicians in the frame (1 = high prescriber, 0 = otherwise).
claim_counts = imbruvica['total_claim_count']
top_quartile_cutoff = claim_counts.quantile(0.75)
imbruvica['high_prescrib'] = (claim_counts >= top_quartile_cutoff).astype('int64')
imbruvica.head()

Unnamed: 0,npi,total_claim_count,high_prescrib
0,1033101068,12,0
1,1114922432,11,0
2,1073841938,26,0
3,1194784207,11,0
4,1134116262,27,0


#### Identify:
    - early adopter of drug in first year?
    - high prescriber of drug in first year?
    - high prescriber of drug in years 1-3?

In [62]:
# Only the header row of each file is needed here, so nrows=0 is passed to
# avoid parsing every data row just to obtain the column names.

# Get column names for partb
partb_cols = pd.read_csv('data/CMS_PartB_Provider_Util_Payment_CY2016_100.txt',
                         delimiter='\t', nrows=0).columns

# Get column names for partd
partd_cols = pd.read_csv('data/CMS_PartD_Prescriber_NPI_Drug_CY2016_100.txt',
                         delimiter='\t', nrows=0).columns

In [89]:
def _read_with_columns(path, column_names):
    """Read a tab-delimited extract and relabel its columns.

    Parameters
    ----------
    path : str
        Location of the tab-delimited file.
    column_names : sequence of str
        Labels assigned to the frame's columns, replacing whatever
        ``read_csv`` inferred.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``column_names`` as its columns.
    """
    # NOTE(review): read_csv treats the first line of the file as a header
    # before the labels are overwritten. If these extracts have no header
    # line, their first record is lost here -- confirm against the files.
    frame = pd.read_csv(path, delimiter='\t')
    frame.columns = column_names
    return frame


# Part D extracts, one per drug, all relabelled with the Part D column names.
xalkori = _read_with_columns('data/partd_xalkori_14.txt', partd_cols)
gilotrif = _read_with_columns('data/partd_gilotrif_14.txt', partd_cols)
mekinist = _read_with_columns('data/partd_mekinist_14.txt', partd_cols)
zykadia = _read_with_columns('data/partd_zykadia_15.txt', partd_cols)
imbruvica_14 = _read_with_columns('data/partd_imbruvica_14.txt', partd_cols)

# Part B extract; its 'NPI' column is renamed to 'npi' to match the Part D frames.
nivo = _read_with_columns('data/partb_nivolumab_16.txt', partb_cols)
nivo = nivo.rename(columns={'NPI': 'npi'})

xalkori.head(1)

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_city,nppes_provider_state,specialty_description,description_flag,drug_name,generic_name,bene_count,...,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65
0,1588774624,ADLER,BRIAN,BIRMINGHAM,AL,Medical Oncology,S,XALKORI,CRIZOTINIB,,...,12.0,360,144373.16,,*,12.0,,12.0,360.0,144373.16


In [98]:
# Inspect the column labels now carried by the zykadia frame.
zykadia.columns

Index(['npi', 'nppes_provider_last_org_name', 'nppes_provider_first_name',
       'nppes_provider_city', 'nppes_provider_state', 'specialty_description',
       'description_flag', 'drug_name', 'generic_name', 'bene_count',
       'total_claim_count', 'total_30_day_fill_count', 'total_day_supply',
       'total_drug_cost', 'bene_count_ge65', 'bene_count_ge65_suppress_flag',
       'total_claim_count_ge65', 'ge65_suppress_flag',
       'total_30_day_fill_count_ge65', 'total_day_supply_ge65',
       'total_drug_cost_ge65'],
      dtype='object')

In [95]:
# CLEAN DATA of null values
def _fill_missing_counts(df, col, rng):
    """Replace nulls in ``df[col]`` in place with random integers from 1 to 10.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col : str
        Name of the count column whose nulls are filled.
    rng : numpy.random.RandomState
        Random generator used for the draws, so results are repeatable.

    Notes
    -----
    NOTE(review): the 1-10 range presumably reflects counts that the source
    data withholds when they are small -- confirm against the data dictionary.
    """
    mask = df[col].isnull()
    n_missing = int(mask.sum())
    # Skip frames with nothing to fill rather than assigning an empty array.
    if n_missing:
        # randint's upper bound is exclusive, so this draws from 1..10.
        df.loc[mask, col] = rng.randint(1, 11, size=n_missing)


# A local, seeded generator keeps the imputed values identical across
# re-runs without touching NumPy's global random state.
_impute_rng = np.random.RandomState(0)

# Part D (each frame listed once)
dfs_to_clean = [xalkori, mekinist, gilotrif, zykadia, imbruvica_14]
col = 'total_claim_count'
for df in dfs_to_clean:
    _fill_missing_counts(df, col, _impute_rng)

# Part B
dfs_to_clean = [nivo]
col = 'LINE_SRVC_CNT'
for df in dfs_to_clean:
    _fill_missing_counts(df, col, _impute_rng)

In [104]:
# Create features that identify early adopters (the physician's NPI appears in
# the drug's extract at all) and high prescribers (the physician is at or above
# the 75th percentile of that extract's volume metric).
# NOTE(review): 'imbruivca_EA' looks like a misspelling of 'imbruvica_EA'; it is
# left unchanged in case other code reads this column name -- confirm and rename.
add_cols_EA = [
    ('xalkori_EA', xalkori),
    ('gilotrif_EA', gilotrif),
    ('mekinist_EA', mekinist),
    ('zykadia_EA', zykadia),
    ('imbruivca_EA', imbruvica_14),
    ('nivo_EA', nivo),
]
add_cols_HP_d = [
    ('xalkori_HP', xalkori),
    ('gilotrif_HP', gilotrif),
    ('mekinist_HP', mekinist),
    ('zykadia_HP', zykadia),
]
add_cols_HP_d2 = [
    ('xalkori_HP2', xalkori),
    ('gilotrif_HP2', gilotrif),
    ('mekinist_HP2', mekinist),
    ('zykadia_HP2', zykadia),
]
add_cols_HP_b = [('nivo_HP', nivo)]


def _npi_flag(target, npis):
    """Return a 0/1 int64 Series marking rows of ``target`` whose 'npi' is in ``npis``."""
    return target['npi'].isin(npis).astype('int64')


def _top_quartile_npis(frame, metric):
    """Return the 'npi' values of rows whose ``metric`` is >= its 75th percentile."""
    cutoff = frame[metric].quantile(0.75)
    return frame.loc[frame[metric] >= cutoff, 'npi'].values


# Early adopters: any appearance in the drug's extract counts.
for feature, drug_df in add_cols_EA:
    imbruvica[feature] = _npi_flag(imbruvica, drug_df['npi'].values)

# High prescribers: Part D by claim count, Part D by drug cost, then Part B by
# line service count (order kept so the new columns are appended in that order).
hp_groups = [
    (add_cols_HP_d, 'total_claim_count'),
    (add_cols_HP_d2, 'total_drug_cost'),
    (add_cols_HP_b, 'LINE_SRVC_CNT'),
]
for group, metric in hp_groups:
    for feature, drug_df in group:
        imbruvica[feature] = _npi_flag(imbruvica, _top_quartile_npis(drug_df, metric))

imbruvica.head()

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_city,nppes_provider_state,specialty_description,description_flag,drug_name,generic_name,bene_count,...,nivo_EA,xalkori_HP,gilotrif_HP,mekinist_HP,zykadia_HP,nivo_HP,xalkori_HP2,gilotrif_HP2,mekinist_HP2,zykadia_HP2
0,1033101068,MARQUES BIBILONI,JOSE,SANTURCE,PR,Hematology-Oncology,S,IMBRUVICA,IBRUTINIB,,...,0,0,0,0,0,0,0,0,0,0
1,1114922432,OYOLA,RAUL,MARIETTA,GA,Hematology-Oncology,S,IMBRUVICA,IBRUTINIB,,...,0,0,0,0,0,0,0,0,0,0
2,1073841938,BRAY,LISA,MILWAUKEE,WI,Nurse Practitioner,S,IMBRUVICA,IBRUTINIB,,...,0,0,0,0,0,0,0,0,0,0
3,1194784207,WEINSHEL,ERIC,EDINA,MN,Medical Oncology,S,IMBRUVICA,IBRUTINIB,,...,0,0,0,0,0,0,0,0,0,0
4,1134116262,TAI,MUHAMMAD,REXFORD,NY,Medical Oncology,S,IMBRUVICA,IBRUTINIB,,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# Save the feature table as a tab-delimited file. The row index is written as
# well, because to_csv is called without index=False.
# The drop below is commented out, so total_claim_count stays in the saved file.
#imbruvica.drop('total_claim_count', axis=1, inplace=True)
imbruvica.to_csv('data/imbruvica_EA_HP.txt', sep='\t')