In [None]:
import pandas as pd
import requests
from tabula import read_pdf
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
# Load the raw sales export into the working DataFrame used by the rest of the notebook.
sales_csv_path = 'Sales.csv'
df = pd.read_csv(sales_csv_path)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Replace the CSV's header names with short snake_case names.
# NOTE(review): this is a positional rename, so it assumes the CSV has exactly
# these six columns in this order -- confirm against Sales.csv.
sales_column_names = [
    'id',
    'date',
    'zip',
    'payment_usd',
    'hospital_id',
    'hospital_name',
]
df.columns = sales_column_names

In [None]:
# Parse the date column into pandas datetimes so year/month can be extracted.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# Preview the first five values of the converted date column.
df['date'].head(n=5)

In [None]:
# Split each parsed date into calendar year and month number columns
# via the datetime accessor.
sale_dates = df['date'].dt
df['yr'] = sale_dates.year
df['month'] = sale_dates.month

In [None]:
# Build a "YYYY-MM" label per row from the yr and month columns.
# The month is zero-padded so the labels sort chronologically as plain
# strings ('2018-02' before '2018-10'); unpadded, '2018-10' sorts before '2018-2'.
df['year_month'] = df['yr'].astype(str) + '-' + df['month'].astype(str).str.zfill(2)

In [None]:
# Preview the first five rows, now including the yr, month and year_month columns.
df.head(n=5)

In [None]:
# List the distinct hospital names present in the sales data.
df['hospital_name'].unique()

## Total Sales

In [None]:
# Grand total: sum of the payment_usd column over every row.
payments = df['payment_usd']
total_sales = payments.sum()

In [None]:
# Display the grand total computed in the previous cell.
total_sales

In [None]:
# Enable inline rendering for plotly's offline mode: init_notebook_mode is
# imported at the top of the notebook but was never called.
init_notebook_mode(connected=True)

# Draw the grand total as one labelled vertical bar. The original passed only
# an x value, which left the bar without a category label or axis title.
iplot(go.Figure(
    data=[go.Bar(x=['Total sales'], y=[total_sales])],
    layout=go.Layout(
        title='Total Sales',
        yaxis=dict(title='Payments (USD)'),
    ),
))

## Total Sales by Month

In [None]:
# Sum of payments for each (year, month) pair.
month_keys = ['yr', 'month']
df.groupby(by=month_keys)['payment_usd'].sum()

In [None]:
# Plot payments per calendar month. The original cell re-plotted the grand
# total (copy-paste of the "Total Sales" chart) instead of the monthly series.
# The aggregate is recomputed here because the cell above only displays it.
monthly_sales = df.groupby(['yr', 'month'])['payment_usd'].sum()

# Zero-padded "YYYY-MM" labels keep the bars in chronological order.
month_labels = [
    '{}-{:02d}'.format(int(yr), int(month))
    for yr, month in monthly_sales.index
]

iplot(go.Figure(
    data=[go.Bar(x=month_labels, y=monthly_sales.tolist())],
    layout=go.Layout(
        title='Total Sales by Month',
        # Treat the labels as categories rather than letting plotly parse them as dates.
        xaxis=dict(title='Month', type='category'),
        yaxis=dict(title='Payments (USD)'),
    ),
))

## Total Sales by Zip Code

In [None]:
# Sum of payments for each zip code.
zip_keys = ['zip']
df.groupby(by=zip_keys)['payment_usd'].sum()

## Breakdown of sales by teaching hospital vs non-teaching hospital

In [None]:
# Location of the CMS Open Payments teaching-hospital list (PDF) for the
# 2018 reporting cycle.
teach_hosp_url = (
    'https://www.cms.gov/OpenPayments/Downloads/2018-Reporting-Cycle-Teaching-Hospital-List-pdf.pdf'
)

In [None]:
# Fetch the teaching-hospital PDF.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error so an error page is never
# written to disk as if it were the PDF.
r = requests.get(teach_hosp_url, timeout=60)
r.raise_for_status()

Download a PDF of the teaching hospital list (as of Oct 2018)

In [None]:
# Persist the downloaded bytes locally so the PDF parser can read them from disk.
pdf_bytes = r.content
with open('teaching_hospitals.pdf', mode='wb') as pdf_file:
    pdf_file.write(pdf_bytes)

In [None]:
# Extract tables from every page of the saved PDF. header=None stops pandas
# from promoting the first row of each table to column names.
# The cells below iterate over this result as a sequence of DataFrames.
df_teach_hosp = read_pdf(
    'teaching_hospitals.pdf',
    pages='all',
    pandas_options={'header': None},
)

Massage into a single data frame:  
Note: the hospital ID fields in the sales data did not match those in the PDF above, so we had to use the hospital name and zip code as our join fields

The PDF scraper returned tables with inconsistent column counts  
We want the name and zip code columns (zero-based column positions, by column count):  
9 cols, name=1, zip=8  
12 cols, name=2, zip=7  
13 cols, name=2, zip=8  
14 cols, name=2, zip=8  

In [None]:
# Maps a scraped table's column count to the (name column, zip column)
# positions to read from it; the positions are used with iloc.
parse_dict = {
    9: (1, 8),
    12: (2, 7),
    13: (2, 8),
    14: (2, 8),
}

In [None]:
# Collect one list of hospital names and one list of zip codes per scraped
# table. Both results are lists of lists and are flattened in a later cell.
name_list, zip_list = [], []
for page_table in df_teach_hosp:
    # Column positions depend on how many columns the scraper produced for
    # this table; a column count missing from parse_dict raises KeyError.
    name_pos, zip_pos = parse_dict[len(page_table.columns)]
    page_names = page_table.iloc[:, name_pos].tolist()
    page_zips = page_table.iloc[:, zip_pos].tolist()
    name_list.append(page_names)
    zip_list.append(page_zips)

In [None]:
def list_flatten(input_list):
    """Flatten one level of nesting.

    Args:
        input_list: an iterable whose elements are themselves iterable.

    Returns:
        A new list holding every inner element, in original order.
    """
    return [element for inner in input_list for element in inner]

In [None]:
# The first two flattened entries are a NaN and the header row, so skip them.
# NOTE(review): only the first two entries overall are dropped; if later pages
# repeat the header they are kept -- confirm against the parsed output.
leading_rows_to_skip = 2
flat_name_list = list_flatten(name_list)[leading_rows_to_skip:]
flat_zipcode_list = list_flatten(zip_list)[leading_rows_to_skip:]

In [None]:
# Build the lookup frame: hospital names are the single data column and the
# zip codes become the index (DataFrame's second positional argument is
# `index`); later cells name the column and copy the index into a column.
df_teaching_hosp = pd.DataFrame(
    data=flat_name_list,
    index=flat_zipcode_list,
)

In [None]:
# Name the frame's single data column (the hospital names).
df_teaching_hosp.columns = ['teaching_hosp']

In [None]:
# Copy the zip codes out of the index into a regular column so they can be
# used as a merge key.
zip_index = df_teaching_hosp.index
df_teaching_hosp['zip_code'] = zip_index

In [None]:
# Preview the first five rows of the teaching-hospital lookup frame.
df_teaching_hosp.head(n=5)

In [None]:
# The join keys on both sides must share a dtype, so cast the sales zip to str.
# NOTE(review): if the CSV zip was parsed as a number, leading zeros are
# already lost before this cast -- confirm the keys match the PDF side.
sales_zip = df['zip']
df['zip'] = sales_zip.astype(str)

In [None]:
# Preview the lookup frame again; only df['zip'] was modified since the
# previous preview, so the output is unchanged.
df_teaching_hosp.head(n=5)

In [None]:
# Cast the lookup frame's zip code to str as well, so both join keys are strings.
lookup_zip = df_teaching_hosp['zip_code']
df_teaching_hosp['zip_code'] = lookup_zip.astype(str)

In [None]:
# Left join on hospital name + zip code, since the hospital ID doesn't seem to
# match our 3rd party data source.
# De-duplicate the lookup keys first: a repeated (name, zip) row in the scraped
# list would fan out every matching sales row and inflate the payment totals.
teaching_lookup = df_teaching_hosp.drop_duplicates(subset=['teaching_hosp', 'zip_code'])
joined_df = pd.merge(
    df,
    teaching_lookup,
    how='left',
    left_on=['hospital_name', 'zip'],
    right_on=['teaching_hosp', 'zip_code'],
)
# With unique lookup keys, a left join must keep exactly one row per sale.
assert len(joined_df) == len(df), 'left join changed the number of sales rows'

In [None]:
# A sale belongs to a teaching hospital exactly when the join found a match,
# i.e. the joined teaching_hosp value is not missing.
matched_teaching = joined_df['teaching_hosp'].notna()
joined_df['category_teaching'] = matched_teaching

In [None]:
# Preview the first five rows of the joined data, including the new flag column.
joined_df.head(n=5)

In [None]:
# Total payments split by the teaching / non-teaching flag.
category_keys = ['category_teaching']
joined_df.groupby(by=category_keys)['payment_usd'].sum()

1: Total sales  
2: Total sales by month  
3: Total sales per zip code  
4: Breakdown of sales by teaching hospital vs non-teaching hospital  

In [None]:
# TODO(review): leftover scratch cell -- an unfinished, commented-out call
# (pandas spells the method `groupby`, not `group_by`). Finish or delete it.
#df.group_by([''])