# Predict price changes using a classifier

We are here building a classifier that predict if the price of a drug will change based on:
- the month in the year
- the class of drug
- whether the drug is for acute or chronic disease
- the product form
- the volume sold in the year so far (to be done)
- Whether or not we are in the Covid period (to be done)

In [2]:
# Importing basic libraries
import pandas as pd
import numpy as np
import time
import datetime

# Plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(12,10)})
# sns.set_style('white')
plt.style.use('fivethirtyeight')
from tqdm import tqdm
tqdm.pandas()

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

  from pandas import Panel


In [34]:
dataframe = pd.read_csv("../Processed_Data/drugs_pct_changes_monthly_since2014_V3.csv")
dataframe.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,Pack Size,Pack Quantity,WAC,Month,Year,TRx,Date,Pct_change,Changed
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,10,2014,,2014_10,,0
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,11,2014,6.0,2014_11,0.0,0
2,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,12,2014,22.0,2014_12,0.0,0
3,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,1,2015,21.0,2015_01,0.0,0
4,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,2,2015,15.0,2015_02,0.0,0


In [35]:
# Skewed data 
print('The data in skewed. The percentage of records showing a price change is: ', dataframe.Changed.mean())

# Undersampling
samplesize = len(dataframe[dataframe.Changed == 1])
unchanged_indices = dataframe[dataframe.Changed == 0].index
random_indices = np.random.choice(unchanged_indices, samplesize, replace=False)
df = dataframe[dataframe.index.isin(random_indices) | dataframe.Changed==1].copy()

# Consequence
print(f"We went from {len(dataframe)} records to {len(df)} records to balance the data.")


The data in skewed. The percentage of records showing a price change is:  0.06354855296018805
We went from 272280 records to 34606 records to balance the data.


### Preprocessing

In [36]:
# We encode the month in a cyclyc form to take into account the cyclyc aspect in months
df['month_sin'] = round(np.sin((df.Month-1)*(2.*np.pi/12)), 2)
df['month_cos'] = round(np.cos((df.Month-1)*(2.*np.pi/12)), 2)

# Also creating a decoder dict for later
mth_decoder = dict()
for mth in np.arange(0, 12):
    mth_decoder[mth + 1] = (round(np.sin((mth)*(2.*np.pi/12)), 2), round(np.cos((mth)*(2.*np.pi/12)), 2))
mth_decoder = {v: k for k, v in mth_decoder.items()} 


df.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,Pack Size,Pack Quantity,WAC,Month,Year,TRx,Date,Pct_change,Changed,month_sin,month_cos
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,11,2014,6.0,2014_11,0.0,0,-0.87,0.5
6,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,4,2015,47.0,2015_04,0.0,0,1.0,0.0
7,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,265.9,5,2015,36.0,2015_05,0.08904,1,0.87,-0.5
14,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,287.4,12,2015,79.0,2015_12,0.080857,1,-0.5,0.87
22,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,313.0,8,2016,117.0,2016_08,0.089074,1,-0.5,-0.87


In [37]:
# Getting dummies for Acute
acute = pd.get_dummies(df['Acute/Chronic'], drop_first=False)
# Getting dummies for drug Class
clas = pd.get_dummies(df['Major Class'])   
# Getting dummies for product form
form = pd.get_dummies(df['Prod Form'])  
# Combining all the featuress
features = pd.concat([acute, clas, form, df['month_sin'], df['month_cos']], axis=1).columns
X = pd.concat([acute, clas, form, df['month_sin'], df['month_cos']], axis=1)
# Getting the dependent variable
y = df.Changed


# Train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Building Logistic regression classifier

In [48]:
classifier = LogisticRegression(C=1, max_iter=1000000)
classifier.fit(X_train, y_train)

LogisticRegression(C=1, max_iter=1000000)

In [49]:
y_pred = 1 * (classifier.predict_proba(X_test)[:, 1] > .5)
acc = np.average(y_pred == y_test)
cm = confusion_matrix(y_test, y_pred)


print(f'Accuracy: {acc}')
print()
print(f"Confusion Matrix: \n{cm}")

Accuracy: 0.63941057497833

Confusion Matrix: 
[[2099 1386]
 [1110 2327]]


### Building Naive Bayes classifier

In [50]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [51]:
y_pred = 1 * (classifier.predict_proba(X_test)[:, 1] > .5)
acc = np.average(y_pred == y_test)
cm = confusion_matrix(y_test, y_pred)


print(f'Accuracy: {acc}')
print()
print(f"Confusion Matrix: \n{cm}")

Accuracy: 0.5115573533660792

Confusion Matrix: 
[[ 235 3250]
 [ 131 3306]]


### Building Random Forest classifier

In [52]:
classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=50, random_state=0)

In [53]:
y_pred = 1 * (classifier.predict_proba(X_test)[:, 1] > .5)
acc = np.average(y_pred == y_test)
cm = confusion_matrix(y_test, y_pred)


print(f'Accuracy: {acc}')
print()
print(f"Confusion Matrix: \n{cm}")

Accuracy: 0.7455937590291823

Confusion Matrix: 
[[2598  887]
 [ 874 2563]]


## Checking results

In [54]:
# Helper function to get back the dataframe from training array
def dum_to_normal(X_test):
    X_test_1 = X_test.iloc[:, 0:2][X_test.iloc[:, 0:2]==1].stack().reset_index().drop(0,1).set_index('level_0').rename(columns={'level_1': 'Acute/Chronic'})
    X_test_1.index.name = None

    X_test_2 = X_test.iloc[:, 2:77][X_test.iloc[:, 1:76]==1].stack().reset_index().drop(0,1).set_index('level_0').rename(columns={'level_1': 'Major Class'})
    X_test_2.index.name = None

    X_test_3 = X_test.iloc[:, 77:-2][X_test.iloc[:, 77:-2]==1].stack().reset_index().drop(0,1).set_index('level_0').rename(columns={'level_1': 'Prod Form'})
    X_test_3.index.name = None

    X_test_4 = X_test.iloc[:, -2:].apply(lambda x: mth_decoder[(x.month_sin, x.month_cos)], axis=1).to_frame().rename(columns={0: 'Month'})

    return pd.concat([X_test_1, X_test_2, X_test_3, X_test_4], axis=1)

In [56]:
# Getting back the dataframe
results = pd.concat([dum_to_normal(X_test), y_test], axis=1)
# Adding predictions
results['Changed_prediction'] = y_pred
results

Unnamed: 0,Acute/Chronic,Major Class,Prod Form,Month,Changed,Changed_prediction
26,CHRONIC,ANTIDIABETICS,INJECTABLES,12,0,1
32,CHRONIC,ANTIDIABETICS,INJECTABLES,6,1,1
84,CHRONIC,ANTIDIABETICS,INJECTABLES,12,1,1
102,CHRONIC,ANTIDIABETICS,INJECTABLES,6,1,0
154,CHRONIC,ANTIDIABETICS,INJECTABLES,12,1,0
...,...,...,...,...,...,...
272057,ACUTE,DERMATOLOGICS,DERMATOLOGICALS,4,0,1
272152,ACUTE,DERMATOLOGICS,DERMATOLOGICALS,4,1,1
272175,ACUTE,DERMATOLOGICS,DERMATOLOGICALS,3,0,0
272230,ACUTE,DERMATOLOGICS,DERMATOLOGICALS,7,0,0
