In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read in the file where all years are combined 2015-2017

In [None]:
df_payments_combined=pd.read_pickle('../data/combined_years_payments.pkl')

In [None]:
df_payments_combined.head()

In [None]:
print(df_payments_combined.shape)

# Add a category column to the dataframe according to these conditions:
- when Entity = I and POS = F, then "Doctor only"
- when Entity = O and POS = F, then "Facility only"
- when Entity = I and POS = O, then "Doctor & Facility"
- when Entity = O and POS = O, then "Doctor & Facility"

In [None]:
# Derive a payment category for every row (approach credited to Diego).
provider_entity = df_payments_combined.entity_type_of_the_provider
service_place = df_payments_combined.place_of_service

# np.select uses the first condition that matches, so the
# place_of_service == "O" test applies whatever the entity type is.
conditions = [
    service_place == "O",
    (provider_entity == "I") & (service_place == "F"),
    (provider_entity == "O") & (service_place == "F"),
]
choices = ["Doctor & Facility", 'Doctor only', 'Facility only']

# Rows matching none of the conditions are labelled "unknown".
df_payments_combined["payment_type"] = np.select(conditions, choices, default="unknown")

In [None]:
# Disabled alternative: a row-by-row apply() version of the payment_type
# labelling. It is wrapped in a string literal so it never executes; the
# author notes that it takes longer than the np.select approach.
# NOTE(review): the labels in this draft ("Facility Only", "Doctor and Facility",
# "Unknown") are spelled differently from the np.select choices, so the two
# approaches would not produce interchangeable values.
'''def label_type (row):
   if (row['entity_type_of_the_provider'] == 'I') and (row['place_of_service'] == 'F'):
      return "Doctor only"
   if (row['entity_type_of_the_provider'] == 'O') and (row['place_of_service'] == 'F'):
      return "Facility Only"
   if (row['entity_type_of_the_provider'] == 'I') and (row['place_of_service'] == 'O'):
      return "Doctor and Facility"
   if (row['entity_type_of_the_provider'] == 'O') and (row['place_of_service'] == 'O'):
      return "Doctor and Facility"
   else:
      return "Unknown"

df_payments_combined = df_payments_combined.assign(payment_type=df_payments_combined.apply(label_type, axis=1))'''

In [None]:
df_payments_combined.head()

In [None]:
df_payments_combined.to_csv('../data/allyearscombined.csv') # write the labelled data to a CSV file for Tableau visualization

# Payments_type Analysis

Average Medicare allowed amount, grouped by year, payment type, and HCPCS code.

## UPDATE FRIDAY  6/5/2020. DON'T USE df_avg_pmt FILE. It contains many duplicates
- There are about 6,700 unique HCPCS codes. But there are 11k+ Doctor & Facility line items, grouped by HCPCS code per year. That number should be equal to or less than the total number of HCPCS codes.



In [None]:
# Mean allowed amount for every (year, payment_type, hcpcs_code) combination,
# kept as a one-column DataFrame indexed by those three keys.
df_avg_pmt = (
    df_payments_combined
    .groupby(['year', 'payment_type', 'hcpcs_code'])['average_medicare_allowed_amount']
    .mean()
    .to_frame()
)
df_avg_pmt.head().reset_index()

In [None]:
# Spread the years into columns: one row per (payment_type, hcpcs_code).
pivot_index = ['payment_type', 'hcpcs_code']
avg_pmt_by_year = df_avg_pmt.pivot_table(
    values='average_medicare_allowed_amount',
    index=pivot_index,
    columns='year',
)

# Bring the index keys back as columns and re-wrap the result in a DataFrame,
# as the original flow does.
df_avg_pmt_pvt = pd.DataFrame(avg_pmt_by_year.reset_index())
df_avg_pmt_pvt.head()

In [None]:
# Year-over-year differences in the average allowed amount (later year minus earlier year).
for later_year, earlier_year in [(2016, 2015), (2017, 2016), (2017, 2015)]:
    df_avg_pmt_pvt[f'change_{later_year}_{earlier_year}'] = (
        df_avg_pmt_pvt[later_year] - df_avg_pmt_pvt[earlier_year]
    )

In [None]:
df_avg_pmt_pvt.head()

In [None]:
# Keep only rows that have a value in both the 2015 and the 2017 column.
df_avg_pmt_pvt = df_avg_pmt_pvt.dropna(subset=[2015, 2017])

In [None]:
df_avg_pmt_pvt.info()

In [None]:
df_avg_pmt_pvt.head()

In [None]:
# Melt the yearly columns into long form: one row per original row and year,
# with the year label in 'year' and the amount in the default 'value' column.
#
# DataFrame.melt is a bound method, so the frame must not also be passed as the
# first positional argument: it would land in `id_vars` and collide with the
# `id_vars=` keyword (TypeError). `value_vars` must be a flat list of column
# labels and `var_name` a plain string for these single-level columns.
col = [2015, 2016, 2017]
change = ['change_2016_2015', 'change_2017_2016', 'change_2017_2015']
df_avg_pmt_pvt_melt = df_avg_pmt_pvt.melt(
    id_vars=['payment_type'],
    value_vars=col,
    var_name='year',
)

In [None]:
# Long-form table of the three change_* columns, keyed by payment type and
# HCPCS code; the source column name goes in 'change'.
change_columns = ['change_2016_2015', 'change_2017_2016', 'change_2017_2015']
df_avg_pmt_melt = df_avg_pmt_pvt.melt(
    id_vars=['payment_type', 'hcpcs_code'],
    value_vars=change_columns,
    var_name='change',
)

In [None]:
df_avg_pmt_melt.head()

In [None]:
# Removed: this cell sorted df_payments_combined_1 before that frame is created
# (it is built a few cells further down), so it raised NameError on
# Restart & Run All. The identical top-10 'Doctor & Facility' ranking is shown
# in a later cell, after the frame exists.

In [None]:
df_avg_pmt.to_csv('../data/yearsPaymentTypeCode.csv') # write the grouped averages to a CSV file for Tableau visualization

In [None]:
# Re-group the averages by (year, hcpcs_code, payment_type), collapse the
# single value column, and spread payment_type across the columns.
regrouped_avg_pmt = df_avg_pmt.groupby(['year', 'hcpcs_code', 'payment_type']).mean()
df_payments_combined_1 = regrouped_avg_pmt.squeeze().unstack()

In [None]:
df_payments_combined_1.head()

In [None]:
df_payments_combined_1.sort_values('Doctor & Facility', ascending=False).head(10).reset_index()


In [None]:
# Move the index levels of df_payments_combined_1 (year, hcpcs_code) into
# ordinary columns and bind the result to df_new. The following cells read
# df_new, but no visible cell assigns it, so they fail on a fresh kernel.
# A non-mutating reset_index() also keeps this cell safe to re-run, unlike
# reset_index(level=0, inplace=True).
df_new = df_payments_combined_1.reset_index()

In [None]:
# Move the outermost remaining index level of df_payments_combined_1 into a
# regular column. This mutates the frame in place, so re-running the cell
# changes the result.
df_payments_combined_1.reset_index(level=0, inplace=True)

In [None]:
df_new

In [None]:
df_new_filter= df_new[['year','hcpcs_code','Doctor & Facility']]

In [None]:
df_new_filter.head()

In [None]:
# Ten rows with the highest 'Doctor & Facility' value.
df_max_doctor_and_facility = (
    df_new_filter
    .sort_values(by='Doctor & Facility', ascending=False)
    .head(10)
)
df_max_doctor_and_facility

In [None]:
# Rows of the top-10 table whose hcpcs_code occurs more than once
# (keep=False flags every occurrence, not just the repeats).
repeated_code_mask = df_max_doctor_and_facility.duplicated(subset=['hcpcs_code'], keep=False)
duplicateDFRow = df_max_doctor_and_facility[repeated_code_mask]
duplicateDFRow

In [None]:
df_new_doctor_only= df_new[['year','hcpcs_code','Doctor only']]

In [None]:
# Ten rows with the highest 'Doctor only' value.
df_max_doctor_only = (
    df_new_doctor_only
    .sort_values(by='Doctor only', ascending=False)
    .head(10)
)
df_max_doctor_only

In [None]:
# Rows of the 'Doctor only' top-10 table whose hcpcs_code occurs more than once.
repeated_doctor_only_mask = df_max_doctor_only.duplicated(subset=['hcpcs_code'], keep=False)
duplicateDFRow2 = df_max_doctor_only[repeated_doctor_only_mask]
duplicateDFRow2

In [None]:
# Select the 'Facility only' columns, then take the ten highest values.
facility_only_columns = ['year', 'hcpcs_code', 'Facility only']
df_new_Facility_only = df_new[facility_only_columns]
df_max_Facility_only = (
    df_new_Facility_only
    .sort_values(by='Facility only', ascending=False)
    .head(10)
)
df_max_Facility_only

# Number of Beneficiary_Per Day analysis 

In [None]:
df_payments_combined.head()

Subset the dataframe to use the wanted columns

In [None]:
# Columns needed for the beneficiary-per-day analysis.
beneficiary_columns = [
    'year',
    'national_provider_identifier',
    'hcpcs_code',
    'number_of_distinct_medicare_beneficiary_per_day_services',
]
df_beneficiary_day = df_payments_combined[beneficiary_columns]

In [None]:
df_beneficiary_day.head()

In [None]:
# Count missing vs. present values in the beneficiary/day column (True = missing).
beneficiary_per_day = df_beneficiary_day['number_of_distinct_medicare_beneficiary_per_day_services']
beneficiary_per_day.isna().value_counts()

In [None]:
# Order the rows from the largest beneficiary/day value to the smallest.
df_beneficiary_day = df_beneficiary_day.sort_values(
    by='number_of_distinct_medicare_beneficiary_per_day_services',
    ascending=False,
)

In [None]:
df_beneficiary_day.head()

In [None]:
print(df_beneficiary_day.shape)

In [None]:
# Show the first rows whose hcpcs_code appears more than once in the frame.
# keep=False marks every occurrence of a repeated code; nothing is sorted here.
df_beneficiary_day[df_beneficiary_day.duplicated(['hcpcs_code'],
                                                     keep = False)].head()


In [None]:
print(df_beneficiary_day.shape)

In [None]:
# Sum the beneficiary/day values for each hcpcs_code within each year.
# The aggregation is 'sum' (not a mean), so the single result column is named 'sum'.
result = df_beneficiary_day.groupby(['year',
                                     'hcpcs_code'
                                     #'number_of_distinct_medicare_beneficiary_per_day_services',
                                    # 'payment_type'
                                    ])['number_of_distinct_medicare_beneficiary_per_day_services'].agg(['sum'])
result.reset_index()

In [None]:
result.to_csv('../data/allyearscombined_2.csv') # write the per-year, per-code sums to a CSV file for Tableau visualization

In [None]:
# Rank the (year, hcpcs_code) sums from largest to smallest.
df_beneficiary_Max = result.sort_values(by='sum', ascending=False)
df_beneficiary_Max.reset_index()

In [None]:
# Drop rows whose aggregated value is missing, then move the index keys into columns.
# `result` is built with agg(['sum']), so its only value column is 'sum';
# the previous lookup of a 'mean' column raised KeyError.
df_beneficiary_Max = df_beneficiary_Max[df_beneficiary_Max['sum'].notna()].reset_index()

Pivot table

In [None]:
# Spread the years into columns: one row per hcpcs_code, one column per year.
pivot_index = ['hcpcs_code']
beneficiary_sum_by_year = result.pivot_table(
    values='sum',
    index=pivot_index,
    columns='year',
)

# Bring hcpcs_code back as a column and re-wrap the result in a DataFrame,
# as the original flow does.
df_beneficiary_day_pivot = pd.DataFrame(beneficiary_sum_by_year.reset_index())
df_beneficiary_day_pivot.head()

In [None]:
df_beneficiary_day_pivot.columns

In [None]:
print(df_beneficiary_day_pivot.shape)

In [None]:
# Intended step: keep only the hcpcs_code rows that have a value in every year.
# NOTE(review): the line below is disabled, and as written it would fill
# missing years with 0 rather than drop those rows.
#df_beneficiary_day_pivot_2 = df_beneficiary_day_pivot.fillna(0)

In [None]:
# Add two columns with the change in beneficiary/day totals relative to 2015.
for later_year in (2017, 2016):
    df_beneficiary_day_pivot[f'change_{later_year}_2015'] = (
        df_beneficiary_day_pivot[later_year] - df_beneficiary_day_pivot[2015]
    )

In [None]:
df_beneficiary_day_pivot

In [None]:
# df_beneficiary_day_pivot_2 is not assigned in any visible cell (its only
# definition is a commented-out line), so this cell raised NameError on a
# fresh kernel. Build it here, after the change_* columns exist, keeping only
# the hcpcs_code rows that have a value in all three years.
# NOTE(review): the disabled draft used fillna(0) instead of dropping rows —
# confirm which treatment of missing years is intended.
df_beneficiary_day_pivot_2 = df_beneficiary_day_pivot.dropna(subset=[2015, 2016, 2017])

# sort by the largest in 2015
df_beneficiary_day_pivot_2.sort_values(2015, ascending=False)

In [None]:
df_beneficiary_day_pivot_2.columns

In [None]:
df_beneficiary_day_pivot_2.head()

In [None]:
#test to check the groupby result
#Test1 =  df_beneficiary_day_pivot_2.loc[df_beneficiary_day_pivot_2['hcpcs_code'] == '0008M']
#Test1

In [None]:
#df_beneficiary_day_pivot_2['hcpcs_code'].value_counts()

In [None]:
# Average every column per hcpcs_code, then turn hcpcs_code back into a column.
mean_by_hcpcs_code = df_beneficiary_day_pivot_2.groupby(['hcpcs_code']).mean()
df_beneficiary_day_pivot_3 = mean_by_hcpcs_code.reset_index()


Create a dataframe with the top 10 HCPCS codes by `change_2017_2015`.

In [None]:
# Ten rows with the largest change_2017_2015 value.
df_max_top_10 = (
    df_beneficiary_day_pivot_2
    .sort_values(by='change_2017_2015', ascending=False)
    .head(10)
)

In [None]:
df_max_top_10.head()

Create a dataframe with the bottom 100 HCPCS codes by `change_2017_2015`.

In [None]:
# Sort by change_2017_2015 in descending order, then keep the last 100 rows of that ordering.
ranked_by_change = df_beneficiary_day_pivot_3.sort_values(by='change_2017_2015', ascending=False)
df_bottom_change_2017_2015 = ranked_by_change.tail(100)

In [None]:
df_bottom_change_2017_2015.head()

In [None]:
print(df_beneficiary_day_pivot_3.shape)

In [None]:
#df['hcpcs_code'].value_counts(ascending=False)

In [None]:
#df.head()

In [None]:
#test to check the groupby results
#Test2 =  df.loc[df['hcpcs_code'] == '84436']
#Test2

In [None]:
# Melt the yearly columns into long form (one row per hcpcs_code and year) so
# the data can be drawn as a scatter plot. The year label goes in 'year' and
# the number in the default 'value' column.
#
# DataFrame.melt is a bound method, so the frame must not also be passed as the
# first positional argument: it would land in `id_vars` and collide with the
# `id_vars=` keyword (TypeError).
col = [2015, 2016, 2017]

df_beneficiary_day_melt = df_beneficiary_day_pivot.melt(
    id_vars="hcpcs_code",
    value_vars=col,
    var_name='year',
)

In [None]:
# Display the 100 rows with the largest 'value'; the result is only shown, not assigned to a variable.
df_beneficiary_day_melt.sort_values('value',ascending= False).head(100)

In [None]:
# Assign the highest-'value' rows to plot_data.
# NOTE(review): head() with no argument keeps only 5 rows, not the 100 shown
# in the preceding cell — confirm how many rows the plot should use.
plot_data =df_beneficiary_day_melt.sort_values('value', ascending= False).head()

In [None]:
plot_data.head()

In [None]:
# Rename the three columns to presentation-friendly labels.
# NOTE(review): the values originate from a 'sum' aggregation, so the
# "Average" wording in the last label may be a misnomer — confirm.
plot_data.columns= ['HCPCS Code', 'Year', 'Average Beneficiary Days']

In [None]:
#print the columns names
plot_data.columns

In [None]:
plot_data.head()

In [None]:
# Disabled scatter-plot draft, wrapped in a string literal so it never executes.
# NOTE(review): `px` is not imported in the visible import cell (presumably
# plotly.express — confirm), and the title and axis label mention earnings and
# salary, which do not match the HCPCS beneficiary data in plot_data.
'''fig = px.scatter(plot_data, x ="Average Beneficiary Days", y ="HCPCS Code", color="Year",
                 title="Gender Earnings Disparity",
                 labels={"Average Beneficiary Days":"Annual Salary (in thousands)"} # customize axis label
                )
fig.show()'''