### Classification Model Comparison

- Dataset: iris.csv
- Learning Date: 17-Oct-23
- Learning from: Prasert Kanawattanachai (CBS)
    - Github: https://github.com/prasertcbs/

In [1]:
# import libraries

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn import metrics

In [2]:
# read csv dat to a dataframe

df = pd.read_csv('https://github.com/prasertcbs/basic-dataset/raw/master/iris.csv')
df.sample(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
24,4.8,3.4,1.9,0.2,setosa
118,7.7,2.6,6.9,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
145,6.7,3.0,5.2,2.3,virginica
112,6.8,3.0,5.5,2.1,virginica


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
df.shape

(150, 5)

In [5]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [6]:
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [7]:
# split data to X and y

X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
y = df['species']
y.head()

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object

In [9]:
# plit train and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 7) # default test size is 25%
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((105, 4), (45, 4), (105,), (45,))

In [10]:
type(X_train), type(X_test), type(y_train), type(y_test)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 pandas.core.series.Series)

In [12]:
kn = KNeighborsClassifier()     # step 1: choose model/estimator
kn.fit(X_train, y_train)        # step 2: fit/train a model
y_pred_kn = kn.predict(X_test)  # step 3: predict y
kn.score(X_test, y_test)        # step 4: get a score

0.9111111111111111

In [13]:
lg = LogisticRegression()       # step 1: choose model/estimator
lg.fit(X_train, y_train)        # step 2: fit/train a model
y_pred_lg = lg.predict(X_test)  # step 3: predict y
lg.score(X_test, y_test)        # step 4: get a score

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9111111111111111

In [14]:
gauss = GaussianNB()                    # step 1: choose model/estimator
gauss.fit(X_train, y_train)             # step 2: fit/train a model
y_pred_gauss = gauss.predict(X_test)    # step 3: predict y
gauss.score(X_test, y_test)             # step 4: get a score

0.8888888888888888

In [15]:
# work with multiple models/estimators

# list of lists for models
algo = [
    [KNeighborsClassifier(n_neighbors = 10), 'KNeighborsClassifier'],
    [LogisticRegression(solver = 'lbfgs'), 'LogisticRegression'],
    [GaussianNB(), 'GaussianNB'],
    [GradientBoostingClassifier(), 'GradientBoostingClassifier'],
    [RandomForestClassifier(), 'RandomForestClassifier'],
    [AdaBoostClassifier(), 'AdaBoostClassifier']
]

algo

[[KNeighborsClassifier(n_neighbors=10), 'KNeighborsClassifier'],
 [LogisticRegression(), 'LogisticRegression'],
 [GaussianNB(), 'GaussianNB'],
 [GradientBoostingClassifier(), 'GradientBoostingClassifier'],
 [RandomForestClassifier(), 'RandomForestClassifier'],
 [AdaBoostClassifier(), 'AdaBoostClassifier']]

In [16]:
model_score = [] # empty list to store score

for m in algo:
    model = m[0]                            # step 1: choose model/estimator
    model.fit(X_train, y_train)             # step 2: fit/train a model
    y_pred = model.predict(X_test)          # step 3: predict y
    score = model.score(X_test, y_test)
    model_score.append([score, m[1]])

    print(f'{m[1]} score = {score}')        # step 4: get a score
    print(metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
    print('-' * 100)

print(model_score)

KNeighborsClassifier score = 0.9555555555555556
[[12  0  0]
 [ 0 16  0]
 [ 0  2 15]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.89      1.00      0.94        16
   virginica       1.00      0.88      0.94        17

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

----------------------------------------------------------------------------------------------------
LogisticRegression score = 0.9111111111111111
[[12  0  0]
 [ 0 14  2]
 [ 0  2 15]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.88      0.88      0.88        16
   virginica       0.88      0.88      0.88        17

    accuracy                           0.91        45
   macro avg       0.92      0.92      0.92        45
weighted avg       0.91 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
dscore = pd.DataFrame(model_score, columns = ['score', 'classifier'])
dscore

Unnamed: 0,score,classifier
0,0.955556,KNeighborsClassifier
1,0.911111,LogisticRegression
2,0.888889,GaussianNB
3,0.888889,GradientBoostingClassifier
4,0.911111,RandomForestClassifier
5,0.866667,AdaBoostClassifier


In [18]:
dscore.sort_values('score', ascending = False)

Unnamed: 0,score,classifier
0,0.955556,KNeighborsClassifier
1,0.911111,LogisticRegression
4,0.911111,RandomForestClassifier
2,0.888889,GaussianNB
3,0.888889,GradientBoostingClassifier
5,0.866667,AdaBoostClassifier
