In [11]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'In_Class_Assignments/Iris.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
iris = pd.read_csv(file_content_stream)
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [12]:
## Creating frequency table of Species
iris['Species'].value_counts() 

Iris-virginica     50
Iris-versicolor    50
Iris-setosa        50
Name: Species, dtype: int64

In [13]:
## Creating Species_numb
iris['Species_numb'] = np.where(iris['Species'] == 'Iris-virginica', 1, np.where(iris['Species'] == 'Iris-versicolor', 2, 3))
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Species_numb
0,1,5.1,3.5,1.4,0.2,Iris-setosa,3
1,2,4.9,3.0,1.4,0.2,Iris-setosa,3
2,3,4.7,3.2,1.3,0.2,Iris-setosa,3
3,4,4.6,3.1,1.5,0.2,Iris-setosa,3
4,5,5.0,3.6,1.4,0.2,Iris-setosa,3


In [14]:
## Defining the input and target variables
X = iris[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
Y = iris['Species_numb']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [15]:
## Standardizing the data
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [16]:
## Building the multi-class classification model with RF
one_vs_all_RF = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

## Predicting on the test data-frame
one_vs_all_RF_pred = one_vs_all_RF.predict_proba(X_test)
one_vs_all_RF_pred

array([[9.57260938e-04, 9.85108524e-01, 1.39342150e-02],
       [5.26007059e-01, 4.73992941e-01, 0.00000000e+00],
       [4.26687249e-01, 5.73312751e-01, 0.00000000e+00],
       [0.00000000e+00, 2.90140338e-03, 9.97098597e-01],
       [6.48318659e-01, 3.45328771e-01, 6.35257031e-03],
       [9.66967498e-01, 3.30325024e-02, 0.00000000e+00],
       [9.76621515e-01, 2.33784847e-02, 0.00000000e+00],
       [0.00000000e+00, 2.90140338e-03, 9.97098597e-01],
       [0.00000000e+00, 1.65137890e-04, 9.99834862e-01],
       [3.13415333e-04, 2.35616056e-02, 9.76124979e-01],
       [1.37798672e-03, 9.96584878e-01, 2.03713538e-03],
       [4.91637516e-01, 5.08362484e-01, 0.00000000e+00],
       [5.45146121e-01, 4.54853879e-01, 0.00000000e+00],
       [9.78842918e-01, 2.11570824e-02, 0.00000000e+00],
       [9.88231511e-01, 1.17684886e-02, 0.00000000e+00],
       [7.03861043e-01, 2.96138957e-01, 0.00000000e+00],
       [9.43297736e-01, 5.67022640e-02, 0.00000000e+00],
       [9.90806233e-01, 9.19376