# CoverMyMeds bootcamp project

## Import packages

In [1]:
# import the packages we'll use
## For data handling
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

## For DB management (if needed)
import sqlite3

## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")

## For model fitting
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

## Read in CSV files

In [2]:
## Load the four CoverMyMeds CSV extracts.
## Files are read in this order: bridge, dim_claims, dim_date, dim_pa.
bridge, dim_claims, dim_date, dim_pa = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./CoverMyMeds_data/bridge.csv",
        "./CoverMyMeds_data/dim_claims.csv",
        "./CoverMyMeds_data/dim_date.csv",
        "./CoverMyMeds_data/dim_pa.csv",
    )
)

In [3]:
## Left-join the claim, PA and date tables onto the bridge table, one key at a time.
## merge() already returns a new frame, so no explicit copy is taken.
df2 = bridge.merge(dim_claims, how='left', on='dim_claim_id')
df4 = df2.merge(dim_pa, how='left', on='dim_pa_id')
## Every missing value in the joined frame is replaced with the placeholder -99
mega_df = df4.merge(dim_date, how='left', on='dim_date_id').fillna(-99)

In [4]:
## [OPTIONAL] Sanity check: display the merged frame (row index starts at 0).
## Missing values show up as -99 because of the fillna in the merge cell.
## Row counts noted for the source tables:
##   claims = 1335576 entries
##   date   = 1520 entries
##   PA     = 555951 entries
##   bridge = 1335576 entries
mega_df

Unnamed: 0,dim_claim_id,dim_pa_id,dim_date_id,bin,drug,reject_code,pharmacy_claim_approved,correct_diagnosis,tried_and_failed,contraindication,pa_approved,date_val,calendar_year,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday
0,1,1.0,1,417380,A,75.0,0,1.0,1.0,0.0,1.0,2017-01-01,2017,1,1,1,0,0,1
1,2,-99.0,1,999001,A,-99.0,1,-99.0,-99.0,-99.0,-99.0,2017-01-01,2017,1,1,1,0,0,1
2,3,2.0,1,417740,A,76.0,0,1.0,0.0,0.0,1.0,2017-01-01,2017,1,1,1,0,0,1
3,4,-99.0,1,999001,A,-99.0,1,-99.0,-99.0,-99.0,-99.0,2017-01-01,2017,1,1,1,0,0,1
4,5,-99.0,1,417740,A,-99.0,1,-99.0,-99.0,-99.0,-99.0,2017-01-01,2017,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335571,1335572,555950.0,1095,417740,C,75.0,0,1.0,0.0,0.0,1.0,2019-12-31,2019,12,31,3,1,1,0
1335572,1335573,-99.0,1095,999001,C,-99.0,1,-99.0,-99.0,-99.0,-99.0,2019-12-31,2019,12,31,3,1,1,0
1335573,1335574,555951.0,1095,417380,C,70.0,0,0.0,0.0,1.0,0.0,2019-12-31,2019,12,31,3,1,1,0
1335574,1335575,-99.0,1095,999001,C,-99.0,1,-99.0,-99.0,-99.0,-99.0,2019-12-31,2019,12,31,3,1,1,0


## Intro Analysis

In [7]:
## Claim volume per payer (BIN): raw count plus share of all claims, in percent

payer_counts = mega_df.bin.value_counts()
payer_share = (mega_df.bin.value_counts(normalize=True) * 100).round(2)

## One row per payer, in descending order of claim count
payer_claims = [
    [payer_bin, claim_count, payer_share[payer_bin]]
    for payer_bin, claim_count in zip(payer_counts.index.values, payer_counts.values)
]
payer_claims_df = pd.DataFrame(payer_claims, columns=['Payer', '#Claims', '%Claims'])
payer_claims_df

Unnamed: 0,Payer,#Claims,%Claims
0,999001,640740,47.97
1,417614,307323,23.01
2,417740,213982,16.02
3,417380,173531,12.99


In [8]:
## Claim volume per drug: raw count plus share of all claims, in percent

drug_counts = mega_df.drug.value_counts()
drug_share = (mega_df.drug.value_counts(normalize=True) * 100).round(2)

## One row per drug, in descending order of claim count
drug_claims = [
    [drug_name, claim_count, drug_share[drug_name]]
    for drug_name, claim_count in zip(drug_counts.index.values, drug_counts.values)
]
drug_claims_df = pd.DataFrame(drug_claims, columns=['Drug', '#Claims', '%Claims'])
drug_claims_df

Unnamed: 0,Drug,#Claims,%Claims
0,A,679283,50.86
1,B,342750,25.66
2,C,313543,23.48


In [12]:
## Frequency of each reject code: raw count plus share of all claims, in percent.
## The -99 row is the placeholder written by fillna for claims with no reject code.

reject_counts = mega_df.reject_code.value_counts()
reject_share = (mega_df.reject_code.value_counts(normalize=True) * 100).round(2)

## One row per reject code, in descending order of claim count
reject_claims = [
    [reject_code, claim_count, reject_share[reject_code]]
    for reject_code, claim_count in zip(reject_counts.index.values, reject_counts.values)
]
reject_claims_df = pd.DataFrame(reject_claims, columns=['Reject_Code', '#Claims', '%Claims'])
reject_claims_df

Unnamed: 0,Reject_Code,#Claims,%Claims
0,-99.0,779625,58.37
1,70.0,252206,18.88
2,75.0,217351,16.27
3,76.0,86394,6.47


In [13]:
## Split the merged data into one frame per payer, ordered by ascending BIN
payers = sorted(set(mega_df['bin']))
payer1, payer2, payer3, payer4 = (
    mega_df[mega_df.bin == payers[k]] for k in range(4)
)

## Drug mix (percent of claims) within each payer
for payer_frame in (payer1, payer2, payer3, payer4):
    print((payer_frame.drug.value_counts(normalize=True) * 100).round(2))



A    50.99
B    25.57
C    23.44
Name: drug, dtype: float64
A    50.92
B    25.60
C    23.47
Name: drug, dtype: float64
A    50.83
B    25.72
C    23.45
Name: drug, dtype: float64
A    50.81
B    25.70
C    23.50
Name: drug, dtype: float64


In [None]:
## Split the merged data into one frame per drug, ordered by ascending drug label
drugs = sorted(set(mega_df['drug']))
drug1, drug2, drug3 = (
    mega_df[mega_df.drug == drugs[k]] for k in range(3)
)

## Payer mix (percent of claims) within each drug
for drug_frame in (drug1, drug2, drug3):
    print((drug_frame.bin.value_counts(normalize=True) * 100).round(2))

In [None]:
## Claims of the first payer (payers[0]) restricted to the first drug (drugs[0]).
## NOTE(review): the next cell starts with this same assignment, so this cell
## looks like leftover scratch work -- confirm before removing.
p1d1 = payer1[payer1.drug == drugs[0]]

In [None]:
## Split each payer's claims by drug.
## Naming: p<payer number>d<drug number>, both following the sorted payers/drugs lists.
p1d1, p1d2, p1d3 = (payer1[payer1.drug == drugs[k]] for k in range(3))
p2d1, p2d2, p2d3 = (payer2[payer2.drug == drugs[k]] for k in range(3))
p3d1, p3d2, p3d3 = (payer3[payer3.drug == drugs[k]] for k in range(3))
p4d1, p4d2, p4d3 = (payer4[payer4.drug == drugs[k]] for k in range(3))

## Reject-code mix (percent) for every payer/drug slice,
## printed payer by payer and, within a payer, drug by drug
payer_drug_frames = (
    p1d1, p1d2, p1d3,
    p2d1, p2d2, p2d3,
    p3d1, p3d2, p3d3,
    p4d1, p4d2, p4d3,
)
for payer_drug_frame in payer_drug_frames:
    print((payer_drug_frame.reject_code.value_counts(normalize=True) * 100).round(2))

In [None]:
## Rounded reject-code percentages for the p1d2 slice; display the first
## (most frequent) entry by position
m = (p1d2.reject_code.value_counts(normalize=True) * 100).round(2)
m.iloc[0]

In [None]:
## Inspect the distinct reject codes in ascending order
sorted(set(mega_df['reject_code']))

In [None]:
## Split the merged data into one frame per reject code.
## codes[0] is the -99 placeholder for missing reject codes (it sorts first).
codes = sorted(set(mega_df['reject_code']))
code0, code1, code2, code3 = (
    mega_df[mega_df.reject_code == codes[k]] for k in range(4)
)

## Payer mix (percent of claims) within each reject code
for code_frame in (code0, code1, code2, code3):
    print((code_frame.bin.value_counts(normalize=True) * 100).round(2))

In [None]:
## Relies on code0..code3 built in the reject-code split cell.

## Mix of pa_approved values (percent) within each reject code
for code_frame in (code0, code1, code2, code3):
    print((code_frame.pa_approved.value_counts(normalize=True) * 100).round(2))

In [None]:
## Claims in the code0 group (first entry of the sorted reject codes), split by payer
c0p1, c0p2, c0p3, c0p4 = (
    code0[code0.bin == payers[k]] for k in range(4)
)

## Drug mix (percent of claims) for each of those payer slices
for code0_payer_frame in (c0p1, c0p2, c0p3, c0p4):
    print((code0_payer_frame.drug.value_counts(normalize=True) * 100).round(2))