<a href="https://www.kaggle.com/code/mnik55/iris-species-classification?scriptVersionId=202978995" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import os

# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Location of the Iris CSV on the Kaggle input mount.
csv_path = "/kaggle/input/iris/Iris.csv"

# Load the raw table and preview three rows; the seed keeps the preview
# identical across Restart & Run All.
df = pd.read_csv(csv_path)
df.sample(3, random_state=2)

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of rows for each distinct value of the Species column.
df['Species'].value_counts()

In [None]:
# Remove the 'Id' column before plotting and modelling.
# errors='ignore' keeps this cell re-runnable: a second execution, when 'Id'
# is already gone, is a no-op instead of raising KeyError.
df = df.drop(columns=['Id'], errors='ignore')

In [None]:
# Shorter aliases for the four measurement columns; later cells refer to
# the columns by these short names.
short_names = {
    'SepalLengthCm': 'SepalL',
    'SepalWidthCm': 'SepalW',
    'PetalLengthCm': 'PetalL',
    'PetalWidthCm': 'PetalW',
}
df = df.rename(columns=short_names)

In [None]:
# Names of every column except the last one.
# NOTE(review): assumes 'Species' is the last column of the CSV — confirm.
df.columns.values[:-1]

In [None]:
# One box plot per measurement column, split by species, on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
axes = axes.flatten()

# Every column except the last one; despite the name these are the numeric
# measurement columns, not categorical ones.
cat_cols = df.columns.values[:-1]

# zip pairs each axis with one column and stops at the shorter sequence, so a
# mismatch between the number of panels and columns cannot raise IndexError.
for ax, col in zip(axes, cat_cols):
    sns.boxplot(y=col, x='Species', data=df, ax=ax)
    ax.set_title(f'{col} by species')

plt.tight_layout()
plt.show()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Pairwise correlations, restricted to the numeric columns.
df.corr(numeric_only=True)

In [None]:
# Shuffle all rows with a fixed seed, then renumber the index from 0.
shuffled = df.sample(frac=1, random_state=2)
df = shuffled.reset_index(drop=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the species names as integer class codes (0, 1, 2).
# - LabelEncoder is scikit-learn's encoder for target labels and returns a
#   1-D integer array, so no DataFrame wrapping or index alignment is needed.
# - Plain column assignment replaces the column, giving it an integer dtype.
# - The encoder has its own name so the builtin `ord` is not shadowed.
species_encoder = LabelEncoder()
df['Species'] = species_encoder.fit_transform(df['Species'])

In [None]:
# Separate the feature matrix (all measurement columns) from the target.
target_col = 'Species'
x = df.drop(columns=[target_col])
y = df[target_col]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the seed fixes the partition.
splits = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=2,
)
x_train, x_test, y_train, y_test = splits

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline. The fixed seed makes the reported accuracy
# reproducible across Restart & Run All (same seed value as the shuffle and
# the train/test split).
rfc = RandomForestClassifier(random_state=2)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

# Accuracy on the held-out test split.
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting baseline. It has its own name rather than reusing `rfc`,
# which would suggest a random forest. The seed keeps the result reproducible.
gbc = GradientBoostingClassifier(random_state=2)
gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_test)

# Accuracy on the held-out test split.
accuracy_score(y_test, y_pred)

In [None]:
import xgboost as xgb

# XGBoost baseline. It has its own name rather than reusing `rfc`; the seed is
# passed explicitly for consistency with the other seeded cells.
xgb_clf = xgb.XGBClassifier(random_state=2)
xgb_clf.fit(x_train, y_train)
y_pred = xgb_clf.predict(x_test)

# Accuracy on the held-out test split.
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.svm import SVC

# Support-vector baseline with default settings. It has its own name rather
# than reusing `rfc`, which would suggest a random forest.
svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)

# Accuracy on the held-out test split.
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline, under its own name rather than `rfc`.
# The iteration cap is raised above the default because the features are
# unscaled and the solver may otherwise stop before converging.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)

# Accuracy on the held-out test split.
accuracy_score(y_test, y_pred)

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
from sklearn.model_selection import cross_val_score

# Seeded so both the cross-validated score below and the later fit of `model`
# are reproducible across Restart & Run All.
model = GradientBoostingClassifier(random_state=2)

# Mean score over 10 cross-validation folds using all four features.
cross_val_score(model, x, y, cv=10).mean()

In [None]:
# Fit `model` on the training split; later cells call model.predict on a grid.
model.fit(x_train, y_train)

In [None]:
# Plot window: petal width on the x-axis, petal length on the y-axis,
# each padded by 1 on both sides.
x_min, x_max = df['PetalW'].min() - 1, df['PetalW'].max() + 1
y_min, y_max = df['PetalL'].min() - 1, df['PetalL'].max() + 1

# The sepal features are not plotted, so they are held at their column means.
sepal_l_mean = df['SepalL'].mean()
sepal_w_mean = df['SepalW'].mean()

# Regular grid with step 0.1 covering the plot window.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

# Build the grid as a DataFrame keyed by feature name and reorder it to the
# training column order. Each value is then matched to its feature by name
# rather than by position, and the model receives the same column labels it
# was fitted with. Scalars are broadcast to the grid length.
grid = pd.DataFrame({
    'SepalL': sepal_l_mean,
    'SepalW': sepal_w_mean,
    'PetalL': yy.ravel(),
    'PetalW': xx.ravel(),
})[list(x.columns)]

# Predicted class for every grid point, reshaped back to the grid.
region = model.predict(grid).reshape(xx.shape)

In [None]:
# Scratch check: builds a 2x3 array filled with the value 4.
np.full((2,3), 4)

In [None]:
# Shape of the predicted-class grid; it was reshaped to xx.shape above.
region.shape

In [None]:
# Shape of the meshgrid built above.
xx.shape

In [None]:
# Shade the predicted class regions on the current axes, then overlay the
# observed flowers on the same axes.
region_ax = plt.gca()
region_ax.contourf(xx, yy, region, cmap='viridis', alpha=0.5)
sns.scatterplot(x='PetalW', y='PetalL', hue='Species', data=df, ax=region_ax)

In [None]:
# Side by side: density of petal length on the raw scale and after log1p.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
raw_ax, log_ax = axes
sns.kdeplot(data=df, x='PetalL', ax=raw_ax)
petal_l_log = np.log1p(df['PetalL'])
sns.kdeplot(x=petal_l_log, ax=log_ax)

In [None]:
# Side by side: density of petal width on the raw scale and after log1p.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
raw_ax, log_ax = axes
sns.kdeplot(data=df, x='PetalW', ax=raw_ax)
petal_w_log = np.log1p(df['PetalW'])
sns.kdeplot(x=petal_w_log, ax=log_ax)

In [None]:
# Kernel density estimate of the SepalL column.
sns.kdeplot(x='SepalL', data=df)

In [None]:
# Kernel density estimate of the SepalW column.
sns.kdeplot(x='SepalW', data=df)

In [None]:
# Plot window: sepal width on the x-axis, sepal length on the y-axis,
# each padded by 1 on both sides.
x_min, x_max = df['SepalW'].min() - 1, df['SepalW'].max() + 1
y_min, y_max = df['SepalL'].min() - 1, df['SepalL'].max() + 1

# The mean is not a good representative here, as seen from the distribution
# (bimodal), so the petal features are held at hand-picked constants instead.
# The names keep their `_mean` suffix even though the values are not means.
petal_l_mean = 5.5
petal_w_mean = 1.75

# Regular grid with step 0.1 covering the plot window.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

# Build the grid as a DataFrame keyed by feature name and reorder it to the
# training column order. Each value is then matched to its feature by name
# rather than by position, and the model receives the same column labels it
# was fitted with. Scalars are broadcast to the grid length.
grid = pd.DataFrame({
    'SepalL': yy.ravel(),
    'SepalW': xx.ravel(),
    'PetalL': petal_l_mean,
    'PetalW': petal_w_mean,
})[list(x.columns)]

# Predicted class for every grid point, reshaped back to the grid.
region = model.predict(grid).reshape(xx.shape)

In [None]:
# Means of the two petal measurements, printed one per line.
for petal_col in ('PetalL', 'PetalW'):
    print(df[petal_col].mean())

In [None]:
# Medians of the two petal measurements, printed one per line.
for petal_col in ('PetalL', 'PetalW'):
    print(df[petal_col].median())

In [None]:
# Shape of the sepal meshgrid built above.
xx.shape

In [None]:
# Shape of the predicted-class grid; it was reshaped to xx.shape above.
region.shape

In [None]:
# Sum of the predicted class codes over the whole grid.
# NOTE(review): presumably compared with the grid size to see whether one
# class fills the entire region — confirm.
region.sum()

In [None]:
# Scratch arithmetic (2464).
# NOTE(review): presumably rows x columns of the grid, for comparison with
# region.sum() — confirm against xx.shape.
56*44

In [None]:
# Draw the class-boundary contour lines on the current axes, then overlay
# the observed flowers on the same axes.
region_ax = plt.gca()
region_ax.contour(xx, yy, region, cmap='viridis', alpha=0.5)
sns.scatterplot(x='SepalW', y='SepalL', hue='Species', data=df, ax=region_ax)

In [None]:
from sklearn.model_selection import cross_val_score

# Seeded so the cross-validated score is reproducible across re-runs.
# Note that `model` is rebound here to a fresh, unfitted estimator.
model = GradientBoostingClassifier(random_state=2)

# Mean score over 10 folds using only the petal features
# (both sepal columns are dropped).
cross_val_score(model, x.drop(columns=['SepalL', 'SepalW']), y, cv=10).mean()

In [None]:
# Pairwise plots of the frame's columns, colored by the Species column.
sns.pairplot(df, hue='Species', palette='plasma')

In [None]:
# Correlation matrix of the frame, drawn as an annotated heat map.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
from sklearn.model_selection import cross_val_score

# Seeded so the cross-validated score is reproducible across re-runs.
# Note that `model` is rebound here to a fresh, unfitted estimator.
model = GradientBoostingClassifier(random_state=2)

# Mean score over 10 folds using only the two width features
# (both length columns are dropped).
cross_val_score(model, x.drop(columns=['PetalL', 'SepalL']), y, cv=10).mean()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts, defined once so the sweep and the x-axis of the
# plot cannot drift apart.
k_values = range(1, 15)

# Mean 10-fold cross-validation score for each k. The estimator is created
# inline so the notebook-wide `model` variable is not overwritten.
scores = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k), x, y, cv=10).mean()
    for k in k_values
]

ax = sns.lineplot(x=list(k_values), y=scores)
ax.set(xlabel='n_neighbors (k)', ylabel='mean 10-fold CV score');