In [7]:
import numpy as np
import pandas as pd
from fbprophet import Prophet
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import math  

In [8]:
def history(class_name, csv_path='drugs_pct_changes_monthly.csv'):
    """Return the historical monthly mean of ``Changed`` for one drug class.

    Parameters
    ----------
    class_name : str
        Value of the ``Major Class`` column to keep.
    csv_path : str, optional
        CSV to read. Defaults to the file the notebook has always used
        (the original author's note says it covers 2014-08-01 to 2020-07-01).
        The file must provide the columns ``Estimated LOE Date``,
        ``Major Class``, ``Year``, ``Month`` and ``Changed``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` (built from ``Year``/``Month``) with the single
        column ``'Percentage of drugs change price'``. Empty when no row
        matches ``class_name``.
    """
    df = pd.read_csv(csv_path)

    # 'Unspecified' LOE dates are treated as May-24, i.e. still protected.
    loe = pd.to_datetime(
        df['Estimated LOE Date'].replace('Unspecified', 'May-24'),
        format='%b-%y',
    )
    keep = (loe >= pd.to_datetime('2020-07-01')) & (df['Major Class'] == class_name)
    # .copy() so the new column is added to an independent frame, not a slice.
    df = df[keep].copy()

    # Vectorised 'Year-Month' string; unlike a row-wise apply this also works
    # when the filter leaves no rows.
    df['Date'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month'].astype(str))

    # groupby sorts its keys, so no explicit sort by Date is needed.
    res = df.groupby('Date')['Changed'].mean().to_frame()
    return res.rename(columns={'Changed': 'Percentage of drugs change price'})

In [9]:
def fun(row):
    """Return ``'<Year>-<Month>'`` built from the row's ``Year`` and ``Month`` entries."""
    parts = (row['Year'], row['Month'])
    return '-'.join(map(str, parts))

In [10]:
def predict_pct_month(df, model=None, periods=12, recent_start='2019-08-01',
                      fallback_start='2018-08-01', min_change=0.01):
    """Predict the month and the size of the next price change for one series.

    Parameters
    ----------
    df : pandas.DataFrame
        History with a datetime column ``ds`` and a value column ``y``.
        The last row's ``y`` is the baseline for the forecast change.
    model : object, optional
        Unfitted forecaster exposing ``fit``, ``make_future_dataframe`` and
        ``predict``. Defaults to the Prophet configuration this notebook
        has always used.
    periods : int, optional
        Number of future periods to forecast (frequency ``'M'``).
    recent_start, fallback_start : str, optional
        Start of the window used to locate the change month, and of the
        longer window used when the recent one shows a mean change of 0.
    min_change : float, optional
        Forecast changes at or below this fraction are treated as noise.

    Returns
    -------
    (int, float)
        ``month_``: calendar month of the largest month-over-month change in
        the chosen history window. ``pct_``: mean forecast change relative to
        the baseline over forecast points above ``min_change`` (0 if none).

    Raises
    ------
    ValueError
        If the chosen window has no computable month-over-month change
        (fewer than two rows).
    """
    if model is None:
        model = Prophet(changepoint_prior_scale=0.05, daily_seasonality=False,
                        weekly_seasonality=False, yearly_seasonality=True)

    last_date = df.ds.max()
    baseline = df['y'].iloc[-1]  # hoisted: was re-read twice per forecast row

    model.fit(df)
    future = model.make_future_dataframe(periods=periods, freq='M')
    forecast = model.predict(future)
    forecast = forecast[forecast.ds >= last_date]
    forecast_change = (forecast.yhat - baseline) / baseline

    def _window_change(start):
        # Dates and month-over-month change of y from `start` onward,
        # re-indexed from 0 so idxmax() can be used positionally.
        window = df[df.ds >= start].reset_index(drop=True)
        return window.ds, window.y.pct_change()

    ## Use this year's change point as prediction for next year
    ## If this year the drug price does not increase, go back to previous year
    dates, change = _window_change(recent_start)
    if change.mean() == 0:
        dates, change = _window_change(fallback_start)
    if change.isna().all():
        raise ValueError('not enough history to locate a price-change month')
    # NOTE(review): when the fallback window is flat too, idxmax() returns the
    # first non-NaN position, i.e. the second month of that window.
    month_ = dates.iloc[change.idxmax()].month

    ## Take average pct_change to filter out fluctuations
    pct_ = forecast_change[forecast_change > min_change].mean()
    if math.isnan(pct_):
        pct_ = 0
    return month_, pct_

In [11]:
def pipe(df):
    """Run ``predict_pct_month`` once per NDC and collect the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``NDC``, ``Date`` and ``WAC``. For each NDC
        the ``Date``/``WAC`` columns are handed to the predictor as ``ds``/``y``.

    Returns
    -------
    list of [NDC, month, pct]
        One entry per NDC whose prediction succeeded, in order of first
        appearance. An NDC whose prediction raises is skipped and reported
        on stdout together with the reason.
    """
    res = []
    for NDC in df.NDC.unique():
        try:
            series = df.loc[df.NDC == NDC, ['Date', 'WAC']].rename(
                columns={'Date': 'ds', 'WAC': 'y'})
            month_, pct_ = predict_pct_month(series)
        except Exception as exc:
            # Best effort per drug, but only for ordinary errors: a bare
            # `except:` would also swallow KeyboardInterrupt / SystemExit and
            # make this long loop impossible to interrupt.
            print(NDC, repr(exc))
        else:
            res.append([NDC, month_, pct_])
    return res

In [12]:
def pred(class_name, csv_path='monthly_2014_8_2020_7_4k_drugs.csv'):
    """Predict the next price-change month and rate for every drug in a class.

    Parameters
    ----------
    class_name : str
        Value of the ``Major Class`` column to keep.
    csv_path : str, optional
        Monthly drug table to read. Defaults to the file the notebook has
        always used. Columns read here: ``Year``, ``Month``, ``TRx``, ``NDC``,
        ``Estimated LOE Date``, ``Major Class``, ``Manufacturer``; ``pipe``
        additionally reads ``Date`` (added here) and ``WAC``.

    Returns
    -------
    (pandas.DataFrame, int)
        ``res``: one row per distinct combination of ``NDC``,
        ``pred_change_month``, ``pred_change_rate``, ``Major Class`` and
        ``Manufacturer``.
        ``num_``: number of rows left in the filtered table for this class.
        NOTE(review): that is a row count (one per drug-month), not a count of
        distinct NDCs, yet ``merge`` multiplies a share of drugs by it;
        confirm this is intended.
    """
    df = pd.read_csv(csv_path)
    # Vectorised 'Year-Month' string instead of a Python call per row.
    df['Date'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month'].astype(str))
    df = df.sort_values(['Date'])

    ## Filter two: rows without a TRx value are dropped
    df = df.dropna(subset=['TRx'])
    ## Filter one: keep NDCs present in both 2014-08 and 2020-07
    list1 = df[(df.Year == 2020) & (df.Month == 7)].NDC.unique()
    list2 = df[(df.Year == 2014) & (df.Month == 8)].NDC.unique()
    selected_NDC = list(set(list1).intersection(list2))
    df = df[df.NDC.isin(selected_NDC)].copy()
    ## Filter three: LOE date on/after 2020-07-01 ('Unspecified' counts as May-24)
    df['Estimated LOE Date'] = pd.to_datetime(
        df['Estimated LOE Date'].replace('Unspecified', 'May-24'),
        format='%b-%y',
    )
    df = df[df['Estimated LOE Date'] >= pd.to_datetime('2020-07-01')]

    ## Drop duplicates: rows equal on everything except Manufacturer collapse to
    ## the first one (this also removes exact duplicates).
    df = df.drop_duplicates(subset=df.columns.difference(['Manufacturer']))

    df = df[df['Major Class'] == class_name]
    num_ = len(df)

    res = pd.DataFrame(pipe(df), columns=['NDC', 'pred_change_month', 'pred_change_rate'])
    # De-duplicate the labels before joining so the merge does not expand to
    # one row per month per drug only to be collapsed again afterwards.
    labels = df[['NDC', 'Major Class', 'Manufacturer']].drop_duplicates()
    res = res.merge(labels, on='NDC', how='left')
    res = res[~res.duplicated()]
    return res, num_
    

In [13]:
def merge(class_name):
    """Write ``<class_name>.csv``: historical and predicted share of drugs changing price.

    The historical part comes from ``history(class_name)``. The predicted part
    is the share of rows returned by ``pred(class_name)`` falling in each
    ``pred_change_month``, placed on the 12 month starts from 2020-08-01 to
    2021-07-01 (months 8-12 map to 2020, months 1-7 to 2021). Forecast months
    with no prediction get 0. A ``Number`` column holds the share multiplied by
    the ``num_`` value returned by ``pred`` and rounded.

    NOTE(review): ``num_`` is ``len(df)`` inside ``pred`` (a row count), so
    ``Number`` may not be a count of drugs; confirm the intended denominator.

    Parameters
    ----------
    class_name : str
        Drug class; also used as the output file name stem.
    """
    column = 'Percentage of drugs change price'

    predicted, num_ = pred(class_name)
    share = predicted.groupby('pred_change_month')['NDC'].count() / len(predicted)
    share.index = pd.DatetimeIndex([
        pd.Timestamp(year=2020 if month >= 8 else 2021, month=int(month), day=1)
        for month in share.index
    ])
    share = share.to_frame(column)

    test = history(class_name)
    # 'MS' yields the 12 month starts 2020-08-01 .. 2021-07-01. The earlier
    # month-end range shifted back by MonthBegin stopped at 2021-06-01, so
    # predictions for July 2021 were silently discarded.
    horizon = pd.date_range('2020-08-01', '2021-07-01', freq='MS')
    # union() keeps a date that is already in the history from being duplicated.
    test = test.reindex(test.index.union(horizon))

    test = test.where(test.notna(), other=share)
    test = test.fillna(0)
    test['Number'] = (test[column] * num_).round(0)
    test.to_csv(class_name + '.csv')
    

In [14]:
# First batch of drug classes: call merge() once per class.
for class_ in [
    'MULTIPLE SCLEROSIS',
    'NERVOUS SYSTEM DISORDERS',
    'OTHER CNS',
    'GI PRODUCTS',
    'HEMATOPOIETIC GROWTH FACTORS',
    'OTHER HAEMATOLOGICALS',
]:
    merge(class_)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [15]:
# Second batch of drug classes: call merge() once per class.
for class_ in [
    'ONCOLOGICS',
    'DERMATOLOGICS',
    'IMMUNOLOGY',
    'RESP ANTIVIRALS EX FLU',
    'OPHTHALMOLOGY, GENERAL',
    'ALLERGY, SYSTEMIC & NASAL',
]:
    merge(class_)