In [39]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import confusion_matrix

## Where the data lives in S3
bucket_name = 'data-448'
file_key = 'In_Class_Assignments/drug200.csv'

## Opening the bucket and fetching the csv object
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()

## The 'Body' entry of the response is the stream handed to pandas
file_content_stream = file_object.get('Body')

## Parsing the csv into a data-frame and previewing the first rows
drug = pd.read_csv(file_content_stream)
drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [40]:
## Number of rows per drug label, most frequent first
drug_counts = drug['Drug'].value_counts()
drug_counts

DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: Drug, dtype: int64

In [41]:
## Encoding the drug labels as integer class codes:
## drugA -> 1, drugB -> 2, drugC -> 3, drugX -> 4, every other value -> 5
drug_label = drug['Drug']
label_checks = [drug_label == 'drugA', drug_label == 'drugB', drug_label == 'drugC', drug_label == 'drugX']
## np.select takes the first matching check per row and falls back to the default
drug['Drug_numb'] = np.select(label_checks, [1, 2, 3, 4], default = 5)
drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Drug_numb
0,23,F,HIGH,HIGH,25.355,DrugY,5
1,47,M,LOW,HIGH,13.093,drugC,3
2,47,M,LOW,HIGH,10.114,drugC,3
3,28,F,NORMAL,HIGH,7.798,drugX,4
4,61,F,LOW,HIGH,18.043,DrugY,5


In [42]:
## Dummies of Sex ('F' -> 0, any other value -> 1)
drug['Sex_numb'] = np.where(drug['Sex'] == 'F', 0, 1)

## Dummies of BP
## The dummy frame is renamed on its own (not the whole data-frame), and any
## copies left by an earlier run of this cell are dropped before concatenating,
## so re-running the cell does not append duplicate columns.
bp_dummies = pd.get_dummies(drug['BP']).rename(columns = {'HIGH': 'BP_HIGH', 'LOW': 'BP_LOW', 'NORMAL': 'BP_NORMAL'})
drug = pd.concat([drug.drop(columns = list(bp_dummies.columns), errors = 'ignore'), bp_dummies], axis = 1)

## Dummies of Cholesterol (same drop-then-concat pattern as BP)
## NOTE(review): 'Cho_' looks like an unfinished 'Cho_NORMAL'; the name is kept
## unchanged here in case later cells select the column by that label -- confirm.
cho_dummies = pd.get_dummies(drug['Cholesterol']).rename(columns = {'HIGH': 'Cho_HIGH', 'NORMAL': 'Cho_'})
drug = pd.concat([drug.drop(columns = list(cho_dummies.columns), errors = 'ignore'), cho_dummies], axis = 1)

drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Drug_numb,Sex_numb,BP_HIGH,BP_LOW,BP_NORMAL
0,23,F,HIGH,HIGH,25.355,DrugY,5,0,1,0,0
1,47,M,LOW,HIGH,13.093,drugC,3,1,0,1,0
2,47,M,LOW,HIGH,10.114,drugC,3,1,0,1,0
3,28,F,NORMAL,HIGH,7.798,drugX,4,0,0,0,1
4,61,F,LOW,HIGH,18.043,DrugY,5,0,0,1,0


In [30]:
## Labels of the columns at positions 8, 9 and 10 of the data-frame
drug.iloc[:, 8:11].columns

Index(['HIGH', 'LOW', 'NORMAL'], dtype='object')

In [31]:
## Number of columns currently in the data-frame
drug.shape[1]

11

In [21]:
## Indicator columns produced from Cholesterol (one column per distinct value)
pd.get_dummies(drug.loc[:, 'Cholesterol'])

Unnamed: 0,HIGH,NORMAL
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
195,1,0
196,1,0
197,1,0
198,0,1
