In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the Iris dataset over HTTP. The URL contains a revision hash in its
# path; running this cell requires network access.
IRIS_CSV_URL = (
    "https://gist.githubusercontent.com/netj/8836201/raw/"
    "6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"
)
iris_df = pd.read_csv(IRIS_CSV_URL)

In [3]:
# Summary statistics of the numeric columns, transposed so that each feature
# is a row and each statistic a column.
iris_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal.length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal.width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal.length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal.width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


#### Training Pipeline

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### Training Algorithm libraries

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Preview the first five rows, including the `variety` label column.
iris_df.head(n=5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [7]:
# Separate the label column from the feature columns.
y = iris_df["variety"]
X = iris_df.drop(columns=["variety"])

In [8]:
# Distinct class labels present in the target column.
y.unique()

array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

In [9]:
# (rows, columns) of the full dataset, label column included.
iris_df.shape

(150, 5)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integer codes. The fitted encoder is kept
# in `label_encoder` so it remains available after this cell.
label_encoder = LabelEncoder().fit(y)
transformed_y = label_encoder.transform(y)


In [11]:
# Shuffled, stratified hold-out split with a fixed seed. No test_size is
# passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    transformed_y,
    stratify=transformed_y,
    shuffle=True,
    random_state=42,
)

In [12]:
# Show the training features followed by the encoded training labels.
separator = "----" * 20
print("Dataset for both X_train and y_train")
display(X_train)
print(separator)
display(y_train)

Dataset for both X_train and y_train


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
130,7.4,2.8,6.1,1.9
122,7.7,2.8,6.7,2.0
81,5.5,2.4,3.7,1.0
71,6.1,2.8,4.0,1.3
89,5.5,2.5,4.0,1.3
...,...,...,...,...
49,5.0,3.3,1.4,0.2
21,5.1,3.7,1.5,0.4
45,4.8,3.0,1.4,0.3
92,5.8,2.6,4.0,1.2


--------------------------------------------------------------------------------


array([2, 2, 1, 1, 1, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 1, 2, 0, 0, 1, 1, 1,
       0, 1, 2, 0, 2, 1, 2, 0, 0, 1, 0, 2, 0, 0, 1, 0, 1, 0, 0, 1, 2, 2,
       0, 2, 1, 0, 2, 0, 2, 2, 0, 1, 2, 2, 1, 1, 0, 1, 1, 2, 1, 2, 0, 1,
       0, 2, 1, 2, 1, 2, 2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 0, 2, 2, 0, 0, 2,
       2, 1, 2, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 0, 0, 0, 0,
       1, 0])

In [13]:
# Show the held-out test features followed by the encoded test labels.
# The heading names the test split, which is what this cell displays.
print("Dataset for both X_test and y_test")
display(X_test)
print("----" * 20)
display(y_test)

Dataset for both X_train and y_train


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
42,4.4,3.2,1.3,0.2
56,6.3,3.3,4.7,1.6
99,5.7,2.8,4.1,1.3
53,5.5,2.3,4.0,1.3
38,4.4,3.0,1.3,0.2
85,6.0,3.4,4.5,1.6
134,6.1,2.6,5.6,1.4
141,6.9,3.1,5.1,2.3
107,7.3,2.9,6.3,1.8
132,6.4,2.8,5.6,2.2


--------------------------------------------------------------------------------


array([0, 1, 1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 1, 0, 1, 2, 1,
       2, 1, 2, 1, 0, 2, 0, 1, 2, 2, 0, 0, 0, 0, 2, 1])

In [14]:
# Column labels of the training features; these are the names listed in
# `num_feats` in the next cell.
X_train.columns

Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width'], dtype='object')

In [15]:
# Columns that go through the numeric preprocessing steps.
num_feats = ["sepal.length", "sepal.width", "petal.length", "petal.width"]

# Steps applied, in order, to each of the columns above.
numeric_steps = [
    ("impute", SimpleImputer(strategy="mean")),  # fill missing values with the column mean
    ("scaler", StandardScaler()),  # rescale so that mean becomes 0 and std becomes 1
]
num_pipeline = Pipeline(steps=numeric_steps)

# The ColumnTransformer routes the listed columns through the pipeline.
# Each entry is a (name, transformer, columns) triple.
# NOTE(review): the transformer name is an empty string, which is why the
# generated feature names appear as "__<column>"; consider a descriptive name.
preprocessor = ColumnTransformer(
    transformers=[("", num_pipeline, num_feats)],
)

In [16]:
# Fit the preprocessing on the training split only, then apply the same fitted
# transform to the test split.
# NOTE: X_train / X_test are overwritten here, so this cell should be run once
# per fresh split.
X_train = preprocessor.fit(X_train, y_train).transform(X_train)
X_test = preprocessor.transform(X_test)

In [17]:
# Put the transformed values back into DataFrames, labelled with the
# preprocessor's output feature names. No index is passed, so both frames get
# a fresh default index rather than the original row labels.
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

X_train.head(2)

Unnamed: 0,__sepal.length,__sepal.width,__petal.length,__petal.width
0,1.792138,-0.60238,1.315323,0.920954
1,2.145311,-0.60238,1.653204,1.051355


In [18]:
# First two rows of the transformed test features.
X_test.iloc[:2]

Unnamed: 0,__sepal.length,__sepal.width,__petal.length,__petal.width
0,-1.739583,0.303212,-1.387726,-1.295856
1,0.497174,0.52961,0.526934,0.529752


In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [32]:
def model_train(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and report accuracy on both splits.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``. It is fitted in place on
        ``(X_train, y_train)``; any earlier fit is replaced.
    X_train, y_train
        Features and labels used for fitting and for the training accuracy.
    X_test, y_test
        Features and labels used for the second accuracy value.

    Returns
    -------
    tuple
        ``(train_acc, test_acc, model)``, where ``model`` is the same object
        that was passed in, now fitted.
    """
    model.fit(X_train, y_train)

    # Score the fitted model on each split: training first, then evaluation.
    train_acc, test_acc = [
        accuracy_score(labels, model.predict(features))
        for features, labels in ((X_train, y_train), (X_test, y_test))
    ]

    return train_acc, test_acc, model

# Base estimator for the grid search below. The seed is fixed so that the
# forest, and therefore the scores reported further down, are reproducible
# after a kernel restart. It uses the same seed value as the train/test split.
rf_model = RandomForestClassifier(random_state=42)


In [25]:
# Carve a validation set off the training split: the first 90 rows stay for
# training and everything after them becomes the validation set.
# NOTE: X_train / y_train are overwritten, so running this cell a second time
# would slice the already-reduced training set again.
train_rows = slice(None, 90)
val_rows = slice(90, None)
X_train, X_val = X_train[train_rows], X_train[val_rows]
y_train, y_val = y_train[train_rows], y_train[val_rows]

---
Doing some hyperparameter tuning

---

In [33]:
# Candidate hyperparameters: 2 x 2 x 2 x 2 = 16 combinations.
# range(1, 3) has an exclusive upper bound, so only depths 1 and 2 are tried.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=range(1, 3),
    criterion=["gini", "entropy"],
    min_samples_split=[2, 3],
)

# Exhaustive search with 5-fold cross-validation; n_jobs=-1 runs it in parallel.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [34]:
# Score the best estimator from the grid search on the training and validation
# splits. model_train calls fit() again, so the estimator is refitted on
# X_train before it is scored.
train_score, val_score, model = model_train(
    model=grid_search.best_estimator_,
    X_train=X_train,
    X_test=X_val,
    y_train=y_train,
    y_test=y_val,
)

In [35]:
# (training accuracy, validation accuracy) returned by model_train above.
train_score, val_score

(0.9888888888888889, 0.9545454545454546)

In [37]:
# Accuracy of the tuned model on the held-out test split.
accuracy_score(y_test, model.predict(X_test))

0.9210526315789473

In [38]:
import pickle

# Persist the fitted classifier to the current working directory.
# Only `model` is written; the `preprocessor` object is not part of this file.
with open("classifier.pkl", mode="wb") as out_file:
    pickle.dump(model, out_file)

In [41]:
# Reload the classifier written by the previous cell.
# pickle.load can execute arbitrary code, so only load files you created or trust.
with open("classifier.pkl", "rb") as pickled_model:
    classifier = pickle.load(pickled_model)

# Predict on the first preprocessed test row. A one-row DataFrame slice is used
# instead of hand-copied, rounded numbers in a bare list: it keeps the column
# names the model was fitted with and stays correct if the split or the
# scaling changes.
pred = classifier.predict(X_test.iloc[[0]])
print(pred[0])

0




In [40]:
# Encoded labels of the held-out test split, for comparison with the
# prediction printed above.
y_test

array([0, 1, 1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 1, 0, 1, 2, 1,
       2, 1, 2, 1, 0, 2, 0, 1, 2, 2, 0, 0, 0, 0, 2, 1])