# Predict price changes using a classifier

In [75]:
# Importing basic libraries
import pandas as pd
import numpy as np
import time
import datetime

# Plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(12,10)})
# sns.set_style('white')
plt.style.use('fivethirtyeight')
from tqdm import tqdm
tqdm.pandas()

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

  from pandas import Panel


In [51]:
# Load the pre-processed monthly drug table. The `changed` column is used as the
# classification target further down; `Month` feeds the cyclical features.
# NOTE(review): the path is relative to the notebook's working directory.
df = pd.read_csv("../Processed_Data/drugs_pct_changes_monthly_since2014_V3.csv")
df.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,Pack Size,Pack Quantity,WAC,Month,Year,TRx,Date,Pct_change,changed
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,10,2014,,2014_10,,0
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,11,2014,6.0,2014_11,0.0,0
2,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,12,2014,22.0,2014_12,0.0,0
3,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,1,2015,21.0,2015_01,0.0,0
4,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,2,2015,15.0,2015_02,0.0,0


### Creating variables

In [52]:
# Cyclical encoding of the calendar month: each month is mapped to a point on
# the unit circle with a step of 2*pi/12, so consecutive months (including the
# December -> January wrap) stay close together. Subtracting 1 places Month == 1
# at angle 0 (sin = 0, cos = 1).
# Assumes Month takes values 1..12 - TODO confirm against the source data.
# Note: this adds two columns to `df` in place.
df['month_sin'] = np.sin((df.Month-1)*(2.*np.pi/12))
df['month_cos'] = np.cos((df.Month-1)*(2.*np.pi/12))
df.head()

Unnamed: 0,NDC,Product,Major Class,Acute/Chronic,Prod Form,Pack Size,Pack Quantity,WAC,Month,Year,TRx,Date,Pct_change,changed,month_sin,month_cos
0,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,10,2014,,2014_10,,0,-1.0,-1.83697e-16
1,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,11,2014,6.0,2014_11,0.0,0,-0.866025,0.5
2,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,12,2014,22.0,2014_12,0.0,0,-0.5,0.8660254
3,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,1,2015,21.0,2015_01,0.0,0,0.0,1.0
4,2143301,TRULICITY 10/2014 LLY,ANTIDIABETICS,CHRONIC,INJECTABLES,1,0.5,244.16,2,2015,15.0,2015_02,0.0,0,0.5,0.8660254


In [141]:
# One-hot encode the three categorical descriptors. No level is dropped
# (drop_first=False is also the default for the other two calls), so each
# group keeps one indicator column per category.
acute = pd.get_dummies(df['Acute/Chronic'], drop_first=False)
clas = pd.get_dummies(df['Major Class'])
form = pd.get_dummies(df['Prod Form'])

# Build the design matrix once and read the feature names off it, instead of
# concatenating the same frames twice. Column order is
# [acute | clas | form | month_sin | month_cos]; the positional slicing in the
# "Checking results" section depends on this order.
X = pd.concat([acute, clas, form, df['month_sin'], df['month_cos']], axis=1)
features = X.columns

# Binary target taken from the `changed` column.
y = df.changed

### Train/Test

In [142]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Random forest classifier

In [143]:
# Ten-tree random forest using the entropy split criterion, seeded for repeatability.
classifier = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=0,
)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [144]:
# Score the held-out split: hard predictions, the confusion matrix, and accuracy
# computed as the fraction of positions where prediction and label agree.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = np.average(y_pred == y_test)

# The extra newline in `end` produces the blank separator line between the two reports.
print(f'Accuracy: {acc}', end='\n\n')
print(f"Confusion Matrix: \n{cm}")

Accuracy: 0.9346628470691934

Confusion Matrix: 
[[50727   153]
 [ 3405   171]]


### Checking results

In [101]:
# Test features, true label and predicted label side by side for inspection.
res = pd.concat([X_test, y_test], axis=1).assign(changed_prediction=y_pred)

In [177]:
def _decode_one_hot(dummies, name):
    """Collapse a block of one-hot indicator columns into a single label column.

    Parameters
    ----------
    dummies : pd.DataFrame
        Slice of the design matrix holding the indicator columns of ONE
        categorical variable.
    name : str
        Column name to give the recovered labels.

    Returns
    -------
    pd.DataFrame
        A single column called ``name`` holding, for each row, the label of the
        indicator column equal to 1. The (unnamed) index carries the row labels
        of ``dummies``; rows with no active indicator are omitted.
    """
    # Keep only the active cells (everything else becomes NaN), then pivot the
    # columns into a second index level so each survivor is (row label, column label).
    active = dummies[dummies == 1].stack()
    # The legacy stack() already drops NaN cells; the newer implementation does
    # not, so drop explicitly to get the same result on either.
    active = active.dropna()
    decoded = pd.DataFrame(
        {name: active.index.get_level_values(1)},
        index=active.index.get_level_values(0),
    )
    decoded.index.name = None
    return decoded


# X was assembled as [acute | clas | form | month_sin | month_cos], so the group
# boundaries follow from the widths of the dummy frames. Data and mask now always
# come from the same slice (the Major Class mask was previously shifted by one column).
n_acute = acute.shape[1]
n_class = clas.shape[1]

X_test_1 = _decode_one_hot(X_test.iloc[:, :n_acute], 'Acute/Chronic')
X_test_2 = _decode_one_hot(X_test.iloc[:, n_acute:n_acute + n_class], 'Major Class')
# The last two columns are the month features, hence the -2 upper bound.
X_test_3 = _decode_one_hot(X_test.iloc[:, n_acute + n_class:-2], 'Prod Form')

# TODO: Retrieve value of months from sin and cos

In [170]:
# Pull out the two trailing columns of the test matrix (the cyclical month features).
cos_sin_test = X_test.iloc[:, [-2, -1]]
cos_sin_test

Unnamed: 0,month_sin,month_cos
235894,-8.660254e-01,-5.000000e-01
119713,1.000000e+00,6.123234e-17
160095,5.000000e-01,8.660254e-01
240442,1.000000e+00,6.123234e-17
118808,-5.000000e-01,-8.660254e-01
...,...,...
152446,5.000000e-01,-8.660254e-01
32569,1.224647e-16,-1.000000e+00
166752,1.224647e-16,-1.000000e+00
193330,8.660254e-01,-5.000000e-01
