In [52]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

## S3 location of the data set: bucket name and object key of the csv file
bucket_name = 'data-448'
file_key = 'In_Class_Assignments/drug200.csv'

## Opening the bucket through the boto3 resource API
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Fetching the object and taking the 'Body' entry out of the response
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Loading the csv content into a data-frame and previewing the first rows
drug = pd.read_csv(file_content_stream)
drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [53]:
## Number of rows per drug label (most frequent label first)
drug.Drug.value_counts()

DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: Drug, dtype: int64

In [54]:
## Changing labels to numbers
## An explicit label -> code table replaces the nested np.where chain, whose
## final catch-all branch encoded *any* label other than drugA/B/C/X as 5.
## With the table, an unexpected or missing label fails loudly instead of
## being silently treated as DrugY.
## Note the labels are matched exactly as spelled here: 'DrugY' is capitalised, the others are not.
drug_codes = {'drugA': 1, 'drugB': 2, 'drugC': 3, 'drugX': 4, 'DrugY': 5}

## Validating before assigning so the data-frame is left untouched on failure
unknown_labels = set(drug['Drug'].unique()) - set(drug_codes)
if unknown_labels:
    raise ValueError('Unexpected values in Drug column: ' + repr(sorted(unknown_labels, key = str)))

drug['Drug_numb'] = drug['Drug'].map(drug_codes)
drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Drug_numb
0,23,F,HIGH,HIGH,25.355,DrugY,5
1,47,M,LOW,HIGH,13.093,drugC,3
2,47,M,LOW,HIGH,10.114,drugC,3
3,28,F,NORMAL,HIGH,7.798,drugX,4
4,61,F,LOW,HIGH,18.043,DrugY,5


In [55]:
## Dummies of Sex ('F' -> 0, any other value -> 1)
drug['Sex_numb'] = np.where(drug['Sex'] == 'F', 0, 1)

## Dummies of BP and Cholesterol
## `prefix` names the columns directly (BP_HIGH, BP_LOW, BP_NORMAL, Cho_HIGH,
## Cho_NORMAL), so there is no need to concat un-prefixed HIGH/LOW/NORMAL
## columns and rename them afterwards -- that only worked because the BP
## columns had already been renamed before the Cholesterol ones were added.
## dtype = int keeps the dummies as 0/1 on every pandas version.
bp_dummies = pd.get_dummies(drug['BP'], prefix = 'BP', dtype = int)
cho_dummies = pd.get_dummies(drug['Cholesterol'], prefix = 'Cho', dtype = int)
dummies = pd.concat([bp_dummies, cho_dummies], axis = 1)

## Dropping dummy columns left over from a previous run of this cell keeps it
## idempotent (re-running used to append duplicate BP_*/Cho_* columns)
drug = pd.concat([drug.drop(columns = dummies.columns, errors = 'ignore'), dummies], axis = 1)

drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Drug_numb,Sex_numb,BP_HIGH,BP_LOW,BP_NORMAL,Cho_HIGH,Cho_NORMAL
0,23,F,HIGH,HIGH,25.355,DrugY,5,0,1,0,0,1,0
1,47,M,LOW,HIGH,13.093,drugC,3,1,0,1,0,1,0
2,47,M,LOW,HIGH,10.114,drugC,3,1,0,1,0,1,0
3,28,F,NORMAL,HIGH,7.798,drugX,4,0,0,0,1,1,0
4,61,F,LOW,HIGH,18.043,DrugY,5,0,0,1,0,1,0


In [56]:
## Defining the input and target variables
## BP_NORMAL and Cho_NORMAL are not part of the inputs (presumably kept out as
## the reference levels of their dummy groups -- confirm)
X = drug[['Age', 'Sex_numb', 'BP_HIGH', 'BP_LOW', 'Cho_HIGH', 'Na_to_K']]
Y = drug['Drug_numb']

## Splitting the data (80% train / 20% test, stratified on the target)
## random_state is fixed so that the same rows land in train/test on every run;
## without it the reports below change each time the notebook is executed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

In [58]:
## Random forest wrapped in a one-vs-rest classifier
## random_state is fixed so that the fitted forests (and therefore the report
## below) are reproducible across runs
one_vs_rest_RF = OneVsRestClassifier(
    estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42)
).fit(X_train, Y_train)

## Predicting on the test data-frame
one_vs_rest_RF_pred = one_vs_rest_RF.predict(X_test)

## Computing the classification report
print(classification_report(Y_test, one_vs_rest_RF_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         3
           4       1.00      0.91      0.95        11
           5       0.95      1.00      0.97        18

    accuracy                           0.97        40
   macro avg       0.99      0.98      0.99        40
weighted avg       0.98      0.97      0.97        40



In [62]:
## Adaboost (depth-3 decision trees as weak learners) wrapped in a one-vs-rest classifier
## The weak learner is passed positionally on purpose: its keyword was renamed
## from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
## was removed in 1.4, while the first positional slot works on both.
## random_state is fixed so that the report below is reproducible across runs.
one_vs_rest_Ada = OneVsRestClassifier(
    estimator = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth = 3),
        n_estimators = 500,
        learning_rate = 0.01,
        random_state = 42
    )
).fit(X_train, Y_train)

## Predicting on the test data-frame
one_vs_rest_Ada_pred = one_vs_rest_Ada.predict(X_test)

## Computing the classification report
print(classification_report(Y_test, one_vs_rest_Ada_pred))

              precision    recall  f1-score   support

           1       1.00      0.80      0.89         5
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         3
           4       0.91      0.91      0.91        11
           5       0.95      1.00      0.97        18

    accuracy                           0.95        40
   macro avg       0.97      0.94      0.95        40
weighted avg       0.95      0.95      0.95        40

