# Predict price changes using a classifier

We are here building a classifier that predict if the price of a drug will change based on:
- the month in the year (cos/sine form)
- the class of drug
- whether the drug is for acute or chronic disease
- the product form
- the total cumulated volume sold in the year so far
- The volume sold in the previous month
- Whether or not we are in the Covid period

In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import time
import datetime

# Plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(12,10)})
# sns.set_style('white')
plt.style.use('fivethirtyeight')
from tqdm import tqdm
tqdm.pandas()

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

  from pandas import Panel


In [2]:
dataframe = pd.read_csv("../Processed_Data/drugs_pct_changes_monthly.csv")
dataframe.head()

Unnamed: 0,NDC,Manufacturer,Product,Product Launch Date,Estimated LOE Date,Major Class,Acute/Chronic,Prod Form,Pack Size,Pack Quantity,Date,Year,Month,TRx,Previous_TRx,TRx_cumulative,WAC,Pct_change,Changed
0,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,2014_10,2014,10,0,,0.0,244.16,,0
1,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,2014_11,2014,11,6,0.0,6.0,244.16,0.0,0
2,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,2014_12,2014,12,22,6.0,28.0,244.16,0.0,0
3,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,2015_01,2015,1,21,22.0,21.0,244.16,0.0,0
4,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,2015_02,2015,2,15,21.0,36.0,244.16,0.0,0


In [3]:
# Skewed data 
print('The data in skewed. The percentage of records showing a price change is: ', dataframe.Changed.mean())

# Undersampling
samplesize = len(dataframe[dataframe.Changed == 1])
unchanged_indices = dataframe[dataframe.Changed == 0].index
random_indices = np.random.choice(unchanged_indices, samplesize, replace=False)
df = dataframe[dataframe.index.isin(random_indices) | dataframe.Changed==1].copy()

# Consequence
print(f"We went from {len(dataframe)} records to {len(df)} records to balance the data.")


The data in skewed. The percentage of records showing a price change is:  0.0699640650533673
We went from 242939 records to 33994 records to balance the data.


### Preprocessing

In [4]:
# We add a COVID feature which tells us if we are post or pre covid
df['Covid'] = (pd.to_datetime(df.Date, format='%Y_%m') > pd.Timestamp('2020-03-01')) * 1

# We encode the month in a cyclyc form to take into account the cyclyc aspect in months
df['Month_sin'] = round(np.sin((df.Month-1)*(2.*np.pi/12)), 2)
df['Month_cos'] = round(np.cos((df.Month-1)*(2.*np.pi/12)), 2)

# Also creating a decoder dict for later
mth_decoder = dict()
for mth in np.arange(0, 12):
    mth_decoder[mth + 1] = (round(np.sin((mth)*(2.*np.pi/12)), 2), round(np.cos((mth)*(2.*np.pi/12)), 2))
mth_decoder = {v: k for k, v in mth_decoder.items()} 

df.head()

Unnamed: 0,NDC,Manufacturer,Product,Product Launch Date,Estimated LOE Date,Major Class,Acute/Chronic,Prod Form,Pack Size,Pack Quantity,...,Month,TRx,Previous_TRx,TRx_cumulative,WAC,Pct_change,Changed,Covid,Month_sin,Month_cos
7,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,...,5,36,47.0,152.0,265.9,0.08904,1,0,0.87,-0.5
14,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,...,12,79,74.0,600.0,287.4,0.080857,1,0,-0.5,0.87
17,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,...,3,106,102.0,318.0,287.4,0.0,0,0,0.87,0.5
22,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,...,8,117,129.0,916.0,313.0,0.089074,1,0,-0.5,-0.87
32,2143301,Eli Lilly and Co,TRULICITY 10/2014 LLY,10/1/2014 0:00,Sep-26,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,...,6,197,176.0,946.0,338.0,0.079872,1,0,0.5,-0.87


In [5]:
# We have some NaN values from the pct_change and Previous_TRx column (due to the shift). We drop them
df = df.dropna()

# Getting dummies for Acute
acute = pd.get_dummies(df['Acute/Chronic'], drop_first=False)
# Getting dummies for drug Class
clas = pd.get_dummies(df['Major Class'])   
# Getting dummies for product form
form = pd.get_dummies(df['Prod Form'])  
# Combining all the features
features = pd.concat([acute, clas, form, df['Month_sin'], df['Month_cos']], axis=1).columns
X = pd.concat([acute, clas, form, df['Month_sin'], df['Month_cos'], df['Covid'], df['Previous_TRx'], df['TRx_cumulative']], axis=1)
# Getting the dependent variable
y = df.Changed

# Train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Building Logistic regression classifier

In [6]:
log_reg_classifier = LogisticRegression(C=1, max_iter=1000000)
log_reg_classifier.fit(X_train, y_train)

LogisticRegression(C=1, max_iter=1000000)

In [7]:
y_pred = 1 * (log_reg_classifier.predict_proba(X_test)[:, 1] > .5)
acc = np.average(y_pred == y_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {acc}')
print()
print(f"Confusion Matrix: \n{cm}")

Accuracy: 0.6337546468401487

Confusion Matrix: 
[[1697 1594]
 [ 869 2565]]


### Building Random Forest classifier

In [8]:
rf_classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0)
rf_classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=50, random_state=0)

In [9]:
y_pred = 1 * (rf_classifier.predict_proba(X_test)[:, 1] > .6)
acc = np.average(y_pred == y_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {acc}')
print()
print(f"Confusion Matrix: \n{cm}")

Accuracy: 0.7305576208178438

Confusion Matrix: 
[[2593  698]
 [1114 2320]]


### Checking results

In [10]:
# Getting back the dataframe
results = df.loc[X_test.index]
# Adding predictions
results = results[['NDC', 'Product', 'Product Launch Date',
       'Estimated LOE Date', 'Major Class', 'Acute/Chronic', 'Prod Form',
       'Date', 'TRx', 'Previous_TRx', 'TRx_cumulative', 'WAC', 'Pct_change', 'Covid', 'Changed']]
results['Changed_prediction'] = y_pred

In [11]:
results

Unnamed: 0,NDC,Product,Product Launch Date,Estimated LOE Date,Major Class,Acute/Chronic,Prod Form,Date,TRx,Previous_TRx,TRx_cumulative,WAC,Pct_change,Covid,Changed,Changed_prediction
48734,49275066,CARDURA 01/1991 PFZ,1/1/1991 0:00,Oct-00,"ANTIHYPERTENSIVES, PLAIN & COMBO",ACUTE,ORALS,2016_01,51,69.0,51.0,3.17220,0.120008,0,1,1
14731,6046106,EMEND 04/2003 MSD,4/1/2003 0:00,Apr-15,"CANCER DETOX AG, ANTI-NAUSEANTS",ACUTE,ORALS,2015_06,327,324.0,1958.0,152.44333,0.089997,0,1,1
238616,70515010910,BETAPACE 12/1992 CV1,12/1/1992 0:00,Oct-99,"ANTIHYPERTENSIVES, PLAIN & COMBO",CHRONIC,ORALS,2017_12,20,27.0,47.0,24.47440,0.089999,0,1,0
63180,71000740,DILANTIN 09/1976 PFZ,9/1/1976 0:00,Unspecified,NERVOUS SYSTEM DISORDERS,CHRONIC,ORALS,2016_10,35,37.0,502.0,1.25270,0.000000,0,0,0
137952,10631009671,HALOG 11/1975 SPI,11/1/1975 0:00,Unspecified,DERMATOLOGICS,ACUTE,DERMATOLOGICALS,2020_06,10,9.0,104.0,12.83433,0.000000,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105462,186198804,PULMICORT RESPULES 09/2000 AZN,9/1/2000 0:00,Feb-15,RESPIRATORY AGENTS,CHRONIC,INHALANTS,2016_05,8801,9546.0,51022.0,4.35766,0.000000,0,0,0
62788,69600121,CEREBYX 09/1996 PFZ,9/1/1996 0:00,Jul-07,NERVOUS SYSTEM DISORDERS,CHRONIC,INJECTABLES,2015_01,5,2.0,5.0,7.91340,0.100000,0,1,1
145391,27437020311,SUPRAX 03/2004 LU.,3/1/2004 0:00,Nov-02,ANTIBACTERIALS,ACUTE,ORALS,2016_02,466,326.0,792.0,19.52900,0.000000,0,0,1
11762,6003220,STROMECTOL 04/1997 MSD,4/1/1997 0:00,Apr-01,"ANTI-PARASITICS, ANTIMALARIALS, INSECTICIDES",ACUTE,ORALS,2015_02,7746,10723.0,18469.0,4.65250,0.000000,0,0,1


# To Do: Parameter tuning and feature selection