In [2]:
import numpy as np
import pandas as pd
from fbprophet import Prophet
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import math  

# Feature Engineering

In [15]:
df = pd.read_csv('monthly_2014_8_2020_7_4k_drugs.csv')


def fun(row):
    """Join a row's Year and Month with a hyphen, giving a 'Year-Month' label."""
    # No longer needed to build `Date` below; kept so any other cell that
    # calls `fun` keeps working.
    return str(row['Year'])+'-' + str(row['Month'])


# Build a first-of-month timestamp directly from the Year/Month columns.
# pd.to_datetime assembles dates from year/month/day columns in one vectorised
# call, which avoids a Python-level function call per row and a second pass of
# string parsing.
df['Date'] = pd.to_datetime(
    pd.DataFrame({'year': df['Year'], 'month': df['Month'], 'day': 1})
)
# Chronological order; predict_pct_month treats the last row as the latest price.
df = df.sort_values(['Date'])

# Filter
1. Keep only drugs that have WAC data from 2014-08 through 2020-07.
2. Keep only drugs that have TRx data from 2014-08 through 2020-07.
3. Drop duplicates (cases where one NDC corresponds to two manufacturers).

In [16]:
## Filter two: discard rows that have no TRx value
df = df.dropna(subset=['TRx'])
## Filter one: keep NDCs that still have a row in both the last (2020-07)
## and the first (2014-08) month of the study period.
## NOTE(review): only TRx is checked for missing values; WAC itself is never
## tested for NaN here -- confirm that is intended.
list1 = df.loc[df.Year.eq(2020) & df.Month.eq(7), 'NDC'].unique()
list2 = df.loc[df.Year.eq(2014) & df.Month.eq(8), 'NDC'].unique()
selected_NDC = list(set(list1).intersection(list2))
df = df[df.NDC.isin(selected_NDC)]

In [17]:
## Drop duplicates
# First remove rows that are identical in every column ...
df = df.drop_duplicates()
# ... then rows that differ only in Manufacturer; the first occurrence is kept.
df = df.drop_duplicates(subset=df.columns.difference(['Manufacturer']))

# Modeling

In [18]:
def _monthly_changes(history, start):
    """Return the rows of `history` dated on/after `start`, with a month-over-month 'pct_change' column."""
    # reset_index returns a new frame, so adding a column does not write into a slice of `history`.
    window = history[history.ds >= start].reset_index(drop=True)
    window['pct_change'] = window.y.pct_change()
    return window


def predict_pct_month(df, periods=12, min_pct_change=0.01):
    """Predict the month and average size of the next price increase for one drug.

    Parameters
    ----------
    df : pandas.DataFrame
        Price history with a datetime column ``ds`` and a numeric column ``y``
        (the layout Prophet expects).
    periods : int, default 12
        Number of future months to forecast.
    min_pct_change : float, default 0.01
        Forecast changes at or below this fraction of the last observed price
        are treated as fluctuation and ignored.

    Returns
    -------
    (month_, pct_)
        ``month_`` is the calendar month (1-12) of the largest observed
        month-over-month increase in the most recent 12 months of history, or
        in the most recent 24 months when the last 12 contain no increase.
        ``pct_`` is the mean forecast change relative to the last observed
        price, over forecast points exceeding ``min_pct_change``; 0 when no
        forecast point exceeds it.
    """
    # Do not rely on the caller's ordering: "last row" must be the latest price.
    df = df.sort_values('ds')
    date = df.ds.max()
    last_y = df.iloc[-1].y

    prophet = Prophet(changepoint_prior_scale=0.05, daily_seasonality=False,
                      weekly_seasonality=False, yearly_seasonality=True)
    prophet.fit(df)
    # The history is dated on the first of each month, so forecast on
    # month-start dates too ('MS'). Month-end dates ('M') would evaluate the
    # yearly seasonality at points of the year the model never observed.
    build_forecast = prophet.make_future_dataframe(periods=periods, freq='MS')
    forecast = prophet.predict(build_forecast)
    # Change of every forecast point (from the last observed date onwards)
    # relative to the last observed price.
    future_yhat = forecast.loc[forecast.ds >= date, 'yhat']
    future_change = (future_yhat - last_y) / last_y
    ##month_ = forecast.iloc[forecast['pct_change'].idxmax()].ds.month

    ## Use this year's change point as prediction for next year.
    ## The windows are derived from the last observed date: for data ending
    ## 2020-07-01 they start at 2019-08-01 and 2018-08-01.
    recent = _monthly_changes(df, date - pd.DateOffset(months=11))
    ## If this year the drug price does not increase, go back to previous year.
    ## Testing for "no positive change" (rather than mean == 0) also covers a
    ## year that only contains price decreases.
    if not (recent['pct_change'] > 0).any():
        recent = _monthly_changes(df, date - pd.DateOffset(months=23))
    month_ = recent.iloc[recent['pct_change'].idxmax()].ds.month

    ## Take average pct_change to filter out fluctuations
    pct_ = future_change[future_change > min_pct_change].mean()
    if math.isnan(pct_):
        pct_ = 0
    return month_, pct_
   

In [19]:
def pipe(df):
    """Run `predict_pct_month` for every NDC in `df` and collect the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'NDC', 'Date' and 'WAC'.

    Returns
    -------
    list of [NDC, month, pct]
        One entry per NDC whose prediction succeeded, in order of first
        appearance in `df`. An NDC whose prediction raises is printed and
        left out of the result.
    """
    res = []
    # One grouping pass instead of a full boolean scan of the frame per NDC;
    # sort=False keeps the order of first appearance. Rows with a missing NDC
    # are not visited (groupby drops NaN keys by default).
    for NDC, rows in df.groupby('NDC', sort=False):
        try:
            temp = rows[['Date', 'WAC']].rename(columns={'Date': 'ds', 'WAC': 'y'})
            a, b = predict_pct_month(temp)
            res.append([NDC, a, b])
        except Exception:
            # Best effort: report the failing NDC and carry on. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # through, so this long-running loop can still be interrupted.
            print(NDC)
    return res
            

In [24]:
# Run the per-NDC prediction loop on the filtered frame. `pipe` prints the NDC
# of any drug whose prediction raised and leaves it out of `res`.
res = pipe(df)

INFO:fbprophet:n_changepoints greater than number of observations. Using 12.


61570011001


In [25]:
# Turn the per-NDC result rows into a frame and attach class and manufacturer.
res = pd.DataFrame(res, columns=['NDC', 'pred_change_month', 'pred_change_rate'])
res = res.merge(df[['NDC', 'Major Class', 'Manufacturer']], on='NDC', how='left')
# `df` holds several rows per NDC, so the merge repeats each prediction;
# keep only the first copy of each distinct row.
res = res.drop_duplicates()

In [7]:
# Save the drug-level predictions. With to_csv's defaults the DataFrame index
# is written as the first, unnamed column.
res.to_csv('pred_results_drug_level_analysis.csv')

# Aggregate on class

In [4]:
# Per-class mean of the numeric columns. numeric_only=True skips non-numeric
# columns such as Manufacturer; without it pandas >= 2.0 raises on them,
# whereas older versions dropped them silently. The output is the same as
# the old default produced.
res.groupby('Major Class').mean(numeric_only=True).to_csv('1.csv')

In [5]:
# Most frequent predicted change month within each drug class, written to 2.csv.
(
    res.groupby('Major Class')['pred_change_month']
    .agg(pd.Series.mode)
    .reset_index()
    .to_csv('2.csv')
)