# Predict price changes using a classifier

In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import time
import datetime

# Plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(12,10)})
# sns.set_style('white')
plt.style.use('fivethirtyeight')
from tqdm import tqdm
tqdm.pandas()

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

  from pandas import Panel


In [2]:
# Read the processed price-change CSV into `df` and preview its first 30 rows.
csv_path = "../Processed_Data/drugs_pct_changes_monthly_since2014_V3.csv"
df = pd.read_csv(csv_path)
df.head(30)

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,Pack Size,Pack Quantity,WAC,Month,Year,TRx,Date,Pct_change,changed
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,10,2014,,2014_10,,0
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,11,2014,6.0,2014_11,0.0,0
2,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,12,2014,22.0,2014_12,0.0,0
3,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,1,2015,21.0,2015_01,0.0,0
4,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,2,2015,15.0,2015_02,0.0,0
5,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,3,2015,33.0,2015_03,0.0,0
6,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,4,2015,47.0,2015_04,0.0,0
7,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,265.9,5,2015,36.0,2015_05,0.08904,1
8,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,265.9,6,2015,54.0,2015_06,0.0,0
9,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,265.9,7,2015,55.0,2015_07,0.0,0


In [3]:
# Mean of the `changed` flag, i.e. the share of rows marked as a price change.
df['changed'].mean()

0.06354855296018805

### Creating variables

In [4]:
# Encode the month cyclically (sine/cosine of its angle on a 12-step circle)
# so that month 12 and month 1 end up next to each other.
month_angle = (df['Month'] - 1) * (2. * np.pi / 12)
df['month_sin'] = np.sin(month_angle).round(2)
df['month_cos'] = np.cos(month_angle).round(2)

# Inverse lookup for later: (rounded sin, rounded cos) -> month number 1..12.
mth_decoder = {}
for mth in np.arange(12):
    angle = (mth) * (2. * np.pi / 12)
    key = (round(np.sin(angle), 2), round(np.cos(angle), 2))
    mth_decoder[key] = mth + 1


df.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,Pack Size,Pack Quantity,WAC,Month,Year,TRx,Date,Pct_change,changed,month_sin,month_cos
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,10,2014,,2014_10,,0,-1.0,-0.0
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,11,2014,6.0,2014_11,0.0,0,-0.87,0.5
2,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,12,2014,22.0,2014_12,0.0,0,-0.5,0.87
3,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,1,2015,21.0,2015_01,0.0,0,0.0,1.0
4,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,2,2015,15.0,2015_02,0.0,0,0.5,0.87


In [5]:
# One-hot encode each categorical descriptor. The resulting column order
# (acute, class, form, month_sin, month_cos) matters: `dum_to_normal` later
# slices these groups by position.
acute = pd.get_dummies(df['Acute/Chronic'], drop_first=False)
clas = pd.get_dummies(df['Major Class'])
form = pd.get_dummies(df['Prod Form'])

# Build the feature matrix once and read the feature names off it, instead of
# concatenating the full frame a second time just to get `.columns`.
X = pd.concat([acute, clas, form, df['month_sin'], df['month_cos']], axis=1)
features = X.columns

# Dependent variable: the `changed` flag.
y = df.changed

### Train/Test

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Random forest classifier

In [7]:
# Ten entropy-criterion trees with a fixed seed, fitted on the training split.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [8]:
# Predict 1 whenever column 1 of `predict_proba` exceeds 0.4.
positive_proba = classifier.predict_proba(X_test)[:, 1]
y_pred = 1 * (positive_proba > .4)

# Share of matching labels on the test split, plus the confusion matrix.
acc = np.average(y_pred == y_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {acc}', '', f"Confusion Matrix: \n{cm}", sep='\n')

Accuracy: 0.9325326869399148

Confusion Matrix: 
[[50331   549]
 [ 3125   451]]


### Checking results

In [76]:
def _one_hot_to_label(dummies, name):
    """Collapse a block of one-hot columns into a single Series of column labels.

    For every row, returns the label of the first column whose value equals 1,
    or NaN when no column in the block equals 1. The result keeps the index of
    ``dummies`` and is named ``name``.
    """
    hot = dummies.eq(1)
    if hot.shape[1] == 0:
        # Empty block: there is nothing to decode, so every row is missing.
        return pd.Series(index=dummies.index, name=name, dtype=object)
    # idxmax picks the first True per row; rows with no True at all are masked
    # out, because idxmax would otherwise report the first column for them.
    return hot.idxmax(axis=1).where(hot.any(axis=1)).rename(name)


def dum_to_normal(X_test, n_acute=2, n_class=75, decoder=None):
    """Turn the one-hot / cyclic feature matrix back into readable columns.

    The columns of ``X_test`` are expected in this positional order:
    ``n_acute`` Acute/Chronic dummies, ``n_class`` Major Class dummies, the
    Prod Form dummies, and finally ``month_sin`` and ``month_cos``.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature matrix laid out as described above.
    n_acute : int, default 2
        Number of leading Acute/Chronic dummy columns.
    n_class : int, default 75
        Number of Major Class dummy columns following the acute block.
    decoder : dict, optional
        Maps ``(month_sin, month_cos)`` to a month number. Defaults to the
        notebook-level ``mth_decoder``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Acute/Chronic``, ``Major Class``, ``Prod Form`` and ``Month``,
        indexed like ``X_test``.

    Raises
    ------
    KeyError
        If a ``(month_sin, month_cos)`` pair is not a key of ``decoder``.
    """
    if decoder is None:
        decoder = mth_decoder

    class_end = n_acute + n_class
    # Each block is decoded against its own columns, so the value slice and the
    # mask can never drift apart.
    acute_chronic = _one_hot_to_label(X_test.iloc[:, :n_acute], 'Acute/Chronic')
    major_class = _one_hot_to_label(X_test.iloc[:, n_acute:class_end], 'Major Class')
    prod_form = _one_hot_to_label(X_test.iloc[:, class_end:-2], 'Prod Form')

    # The last two columns carry the cyclic month encoding.
    month_part = X_test.iloc[:, -2:]
    month = pd.Series(
        [decoder[pair] for pair in zip(month_part['month_sin'], month_part['month_cos'])],
        index=X_test.index,
        name='Month',
    )

    return pd.concat([acute_chronic, major_class, prod_form, month], axis=1)

In [77]:
# Decode the test features back into readable labels, then put the true
# `changed` flag and the model's prediction next to them.
decoded_test = dum_to_normal(X_test)
res = pd.concat([decoded_test, y_test], axis=1).assign(changed_prediction=y_pred)
res

Unnamed: 0,Acute/Chronic,Major Class,Prod Form,Month,changed,changed_prediction
5,CHRONIC,ANTIDIABETICS,INJECTABLES,3,0,0
13,CHRONIC,ANTIDIABETICS,INJECTABLES,11,0,0
14,CHRONIC,ANTIDIABETICS,INJECTABLES,12,1,1
18,CHRONIC,ANTIDIABETICS,INJECTABLES,4,0,0
23,CHRONIC,ANTIDIABETICS,INJECTABLES,9,0,0
...,...,...,...,...,...,...
272247,ACUTE,DERMATOLOGICS,DERMATOLOGICALS,3,0,0
272254,ACUTE,DERMATOLOGICS,DERMATOLOGICALS,7,0,0
272262,ACUTE,DERMATOLOGICS,DERMATOLOGICALS,11,0,0
272264,ACUTE,DERMATOLOGICS,DERMATOLOGICALS,12,0,0


In [86]:
# Among rows whose true label is 1, how often does the prediction agree?
actual_changes = res[res.changed == 1]
hit_rate = (actual_changes.changed == actual_changes.changed_prediction).mean()

# Three blank lines of padding on either side of the figure.
print('\n\n')
print('Accuracy for actual change: ', hit_rate)
print('\n\n')




Accuracy for actual change:  0.08864653243847875



