# Model

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown

# Notebook-wide presentation settings: seaborn's dark grid for plots and
# no limit on the number of DataFrame columns shown.
sns.set_theme(style="darkgrid")
pd.options.display.max_columns = None

import sys, os, yaml

# Dataset name; spaces are replaced by underscores to build the per-dataset
# folder on Google Drive when running on Colab.
DATASET = "Iris"
# True when the google.colab module is already loaded (taken to mean the
# notebook is running on Colab).
COLAB = 'google.colab' in sys.modules

# NOTE(review): DEBUG is not referenced in the visible cells - confirm it is still needed.
DEBUG = False
# Seed passed as random_state to train_test_split so the split is reproducible.
SEED = 666

ModuleNotFoundError: No module named 'seaborn'

In [27]:
# Resolve ROOT, the base folder (with trailing slash) for all notebook files.
#   * On Colab: a per-dataset folder on the mounted Google Drive.
#   * Elsewhere: the current working directory.
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  # Mount Drive only once; the mount point already exists on later runs.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # Creates the intermediate "datasets" folder as well. ROOT is assigned
  # before it is used, so this also works on a fresh kernel.
  os.makedirs(ROOT, exist_ok=True)
else:
  ROOT = "./"

def makedirs(d):
  """Create the sub-directory `d` underneath ROOT if it does not exist yet.

  Parameters
  ----------
  d : str
      Directory name, appended to ROOT (which ends with a path separator).

  Raises
  ------
  FileExistsError
      If the target path exists but is not a directory.
  """
  # exist_ok=True makes the call idempotent without a separate isdir() check,
  # so Colab and local runs share one code path.
  os.makedirs(ROOT + d, exist_ok=True)

# Make sure the folders this notebook reads from and writes to exist.
for d in ('orig', 'data', 'output'):
  makedirs(d)

## Load Dataset

In [28]:
# Load the Iris frame from the data folder. ROOT already ends with a path
# separator, so os.path.join avoids the doubled "//" in the file name.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
df = pd.read_pickle(os.path.join(ROOT, "data", "iris.pickle"))
df.head()

Unnamed: 0,Sepal_Length_(Cm),Sepal_Width_(Cm),Petal_Length_(Cm),Petal_Width_(Cm),Target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [29]:
# Feature matrix: every column except the label.
# `columns=` names the axis explicitly instead of relying on True == 1.
X = df.drop(columns='Target')
X.head()

Unnamed: 0,Sepal_Length_(Cm),Sepal_Width_(Cm),Petal_Length_(Cm),Petal_Width_(Cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [30]:
# Label vector: the 'Target' column of the loaded frame.
y = df['Target']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Target, dtype: int64

In [31]:
# Number of samples per class in the full dataset.
y.value_counts()

Target
0    50
1    50
2    50
Name: count, dtype: int64

### Split

In [32]:
from sklearn.model_selection import train_test_split 

In [33]:
# Hold out 20% of the rows for testing; SEED keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [34]:
# Number of samples per class in the training split (the split is not
# stratified, so the counts may differ between classes).
y_train.value_counts()

Target
0    42
2    40
1    38
Name: count, dtype: int64

### Model Selection

In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [36]:
# Candidate models, keyed by the label printed in the accuracy report.
# The tree-based estimators are randomized, so they receive SEED to make
# the reported scores reproducible across kernel restarts.
classifiers = {
    "KNN" : KNeighborsClassifier(),
    "KNN(3)" : KNeighborsClassifier(n_neighbors=3),
    "DT" : DecisionTreeClassifier(random_state=SEED),
    "DT(max_depth=5)" : DecisionTreeClassifier(max_depth=5, random_state=SEED),
    "RF" : RandomForestClassifier(random_state=SEED),
}

In [37]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

def evaluate_classifiers(classifiers, X_train, X_test, y_train, y_test):
    """Fit every classifier and print its accuracy on the train and test split.

    Parameters
    ----------
    classifiers : dict
        Maps a display name to an estimator. Each estimator is fitted in place.
    X_train, y_train
        Features and labels the estimators are fitted on.
    X_test, y_test
        Held-out features and labels used for the test accuracy.

    Returns
    -------
    dict
        Maps each display name to a ``(train_accuracy, test_accuracy)`` tuple.
    """
    scores = {}
    for name, model in classifiers.items():
        model.fit(X_train, y_train)

        # Accuracy on SEEN data - says little about generalization.
        train_accuracy = accuracy_score(y_train, model.predict(X_train))

        # Accuracy on UNSEEN data - the number that matters.
        test_accuracy = accuracy_score(y_test, model.predict(X_test))

        print(f"{name:20s} accuracy\ttrain = {train_accuracy:.2%} \ttest = {test_accuracy:.2%}")
        scores[name] = (train_accuracy, test_accuracy)
    return scores


# Baseline: evaluate on the raw (unscaled) features.
scores_raw = evaluate_classifiers(classifiers, X_train, X_test, y_train, y_test)

KNN                  accuracy	train = 95.83% 	test = 100.00%
KNN(3)               accuracy	train = 95.00% 	test = 100.00%
DT                   accuracy	train = 100.00% 	test = 100.00%
DT(max_depth=5)      accuracy	train = 100.00% 	test = 100.00%
RF                   accuracy	train = 100.00% 	test = 100.00%


In [38]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test statistics leak into training.
# NOTE: X_train / X_test are overwritten here, so this cell is order-sensitive;
# run it exactly once after the split.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
# Refit every candidate on the standardized features and report accuracy again.
for name, model in classifiers.items():
    model.fit(X_train, y_train)

    # Train accuracy (data the model has already seen).
    train_accuracy = accuracy_score(y_train, model.predict(X_train))

    # Test accuracy (held-out data); y_pred keeps the test predictions.
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"{name:20s} accuracy\ttrain = {train_accuracy:.2%} \ttest = {test_accuracy:.2%}")

KNN                  accuracy	train = 94.17% 	test = 100.00%
KNN(3)               accuracy	train = 95.00% 	test = 100.00%
DT                   accuracy	train = 100.00% 	test = 100.00%
DT(max_depth=5)      accuracy	train = 100.00% 	test = 100.00%
RF                   accuracy	train = 100.00% 	test = 100.00%
