# Computing percent changes for each drug per month

In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import time
import datetime

# Plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(12,10)})
plt.style.use('fivethirtyeight')
from tqdm import tqdm

### Loading Data

In [2]:
# Loading the dataset
df = pd.read_csv('../Processed_Data/merge_with_iqvia_2017_01_2020_09_monthly.csv')
df = df[['NDC', 'Product','Major Class', 'Acute/Chronic', 'Prod Form', 'WAC', 'Month', 'Year', 'TRx']]
df = df.dropna(subset=['WAC'])
df['Date'] = df.Year.astype(str) + '_' + df.Month.astype(str).apply(lambda x: '0' + x if int(x)<10 else x)
df.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,WAC,Month,Year,TRx,Date
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,12,2017,240,2017_12
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,11,2017,258,2017_11
2,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,10,2017,175,2017_10
3,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,9,2017,190,2017_09
4,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,8,2017,176,2017_08


### 1. Creating percent changes per month for each drug

Need to check when dropping duplicates. Some drugs have same name, same form, but **different NDC and different prices!**

In [10]:
# Sorting values
df = df.sort_values(['NDC', 'Date']).reset_index(drop=True)

# List of unique NDC 
NDCs = df.NDC.unique()

# Empty Dataframe to store new data
df_pct = pd.DataFrame(columns=df.columns)

# For each drug, compute pct change and append to dataframe
for NDC in tqdm(NDCs):
    df_temp = df[df.NDC == NDC].copy()
    df_temp['Pct_change'] = df_temp.WAC.pct_change()
    df_pct = pd.concat((df_pct, df_temp))

print("Shape: ", df_pct.shape)
df_pct.head(10)

100%|██████████| 4632/4632 [02:23<00:00, 32.26it/s]

Shape:  (146180, 11)





Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,WAC,Month,Year,TRx,Date,Pct_change
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,1,2017,132,2017_01,
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,2,2017,144,2017_02,0.0
2,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,3,2017,152,2017_03,0.0
3,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,4,2017,145,2017_04,0.0
4,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,5,2017,176,2017_05,0.0
5,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,6,2017,197,2017_06,0.079872
6,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,7,2017,161,2017_07,0.0
7,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,8,2017,176,2017_08,0.0
8,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,9,2017,190,2017_09,0.0
9,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,10,2017,175,2017_10,0.0


### 2. Adding boolean to check if price has increased

In [13]:
df_pct = df_pct.dropna()
df_pct['changed'] = (df_pct.Pct_change != 0) * 1
df_pct.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,WAC,Month,Year,TRx,Date,Pct_change,changed
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,2,2017,144,2017_02,0.0,0
2,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,3,2017,152,2017_03,0.0,0
3,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,4,2017,145,2017_04,0.0,0
4,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,5,2017,176,2017_05,0.0,0
5,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,338.0,6,2017,197,2017_06,0.079872,1


### 3. Saving

In [14]:
# Saving dataset
df_pct.to_csv('drugs_pct_changes_TRX_monthly.csv', index=False)

## Another method

In [69]:
# Pivoting to get WAC per date for each drug in a row
df_pivot = df.pivot_table(values=['WAC'], index=['NDC', 'Product', 'Major Class', 'Acute/Chronic', 'Prod Form'], columns='Date')
df_pivot.columns = df_pivot.columns.droplevel(0)
df_pivot = df_pivot.rename_axis(None, axis=1)
df_pivot = df_pivot.reset_index()
df_pivot.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,2017_01,2017_02,2017_03,2017_04,2017_05,...,2019_10,2019_11,2019_12,2020_01,2020_02,2020_03,2020_04,2020_05,2020_06,2020_07
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,313.0,313.0,313.0,313.0,...,379.7,379.7,379.7,379.7,379.7,398.65,398.65,398.65,398.65,398.65
1,2143380,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,313.0,313.0,313.0,313.0,...,379.7,379.7,379.7,379.7,379.7,398.65,398.65,398.65,398.65,398.65
2,2143401,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,313.0,313.0,313.0,313.0,...,379.7,379.7,379.7,379.7,379.7,398.65,398.65,398.65,398.65,398.65
3,2143480,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,313.0,313.0,313.0,313.0,313.0,...,379.7,379.7,379.7,379.7,379.7,398.65,398.65,398.65,398.65,398.65
4,2143601,EMGALITY 10/2018 LLY,PAIN,ACUTE,INJECTABLES,,,,,,...,575.0,575.0,575.0,575.0,575.0,603.6,603.6,603.6,603.6,603.6


In [70]:
# Computing percent changes between two consecutive months  
percent_changes = df_pivot[['NDC', 'Product', 'Major Class', 'Acute/Chronic', 'Prod Form']].copy()
prices = df_pivot.drop(labels=percent_changes.columns, axis=1).copy()

# For each column compute percent change
for i in range(len(prices.columns) - 1):
    pct_change = (prices.iloc[:, i+1] - prices.iloc[:, i]) / prices.iloc[:, i]
    percent_changes[f'{prices.iloc[:, i+1].name}'] = pct_change

percent_changes.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,2017_02,2017_03,2017_04,2017_05,2017_06,...,2019_10,2019_11,2019_12,2020_01,2020_02,2020_03,2020_04,2020_05,2020_06,2020_07
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,0.0,0.0,0.0,0.0,0.079872,...,0.0,0.0,0.0,0.0,0.0,0.049908,0.0,0.0,0.0,0.0
1,2143380,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,0.0,0.0,0.0,0.0,0.079872,...,0.0,0.0,0.0,0.0,0.0,0.049908,0.0,0.0,0.0,0.0
2,2143401,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,0.0,0.0,0.0,0.0,0.079872,...,0.0,0.0,0.0,0.0,0.0,0.049908,0.0,0.0,0.0,0.0
3,2143480,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,0.0,0.0,0.0,0.0,0.079872,...,0.0,0.0,0.0,0.0,0.0,0.049908,0.0,0.0,0.0,0.0
4,2143601,EMGALITY 10/2018 LLY,PAIN,ACUTE,INJECTABLES,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.049739,0.0,0.0,0.0,0.0


Be careful when computing the average pct_change for the different drugs,  Some drugs have same name, same form, but **different NDC and different prices!**

In [76]:
# Averaging Percent change over all duplicated druges (multiple NDC for same drug)
percent_changes = pd.merge(percent_changes[['NDC', 'Product', 'Major Class', 'Acute/Chronic', 'Prod Form']].drop_duplicates(subset='Product'),
                           percent_changes.groupby(['Product']).mean().reset_index().drop('NDC', axis=1),
                           left_on='Product', right_on='Product', how='left')
percent_changes.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,2017_02,2017_03,2017_04,2017_05,2017_06,...,2019_10,2019_11,2019_12,2020_01,2020_02,2020_03,2020_04,2020_05,2020_06,2020_07
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,0.0,0.0,0.0,0.0,0.079872,...,0.0,0.0,0.0,0.0,0.0,0.049908,0.0,0.0,0.0,0.0
1,2143601,EMGALITY 10/2018 LLY,PAIN,ACUTE,INJECTABLES,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.049739,0.0,0.0,0.0,0.0
2,2144501,TALTZ 04/2016 LLY,IMMUNOLOGY,CHRONIC,INJECTABLES,0.0,0.0,0.0,0.0,0.068998,...,0.0,0.0,0.0,0.0,0.0,0.059985,0.0,0.0,0.0,0.0
3,2300475,PROZAC WEEKLY 03/2001 LLY,MENTAL HEALTH,CHRONIC,ORALS,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,2322730,STRATTERA 12/2002 LLY,ADHD,CHRONIC,ORALS,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# percent_changes.to_csv('drugs_pct_changes_monthly.csv', index=False)

### Creating boolean for positive pct changes

In [None]:
bool_changes = percent_changes
for column in range(5, percent_changes.shape[1]):
    bool_changes.iloc[:, column] = bool_changes.iloc[:, column].apply(lambda x: 1 if x > 0 else 0)
    
bool_changes.head()

In [None]:
# bool_changes.to_csv('drugs_bool_changes_monthly.csv', index=False)

# The issue

In [98]:
df[(df['Product'] == 'ZYVOX 04/2000 PFZ') & (df['Prod Form'] == 'ORALS') & (df['Date'] == '2017_02')]

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,WAC,Month,Year,TRx,Date
19987,9513601,ZYVOX 04/2000 PFZ,ANTIBACTERIALS,ACUTE,ORALS,7.43093,2,2017,24,2017_02
20030,9513802,ZYVOX 04/2000 PFZ,ANTIBACTERIALS,ACUTE,ORALS,222.92,2,2017,228,2017_02
20073,9513803,ZYVOX 04/2000 PFZ,ANTIBACTERIALS,ACUTE,ORALS,222.91866,2,2017,82,2017_02
