In [None]:
# for data analysis
import numpy as np
import pandas as pd
# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
# machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

<h1>Exploring the Dataset</h1>

In [None]:
# Read the Iris CSV from the Kaggle input directory and preview the first 10 rows.
IRIS_CSV_PATH = '/kaggle/input/iris-flower-dataset/IRIS.csv'
iris_df = pd.read_csv(IRIS_CSV_PATH)
iris_df.head(10)

In [None]:
# Show the last 10 rows of the dataset.
iris_df.tail(10)

In [None]:
# Show 5 randomly chosen rows. The seed is fixed so that the same rows are displayed
# on every Restart & Run All (it matches the random_state used for the train/test split).
iris_df.sample(5, random_state=3)

In [None]:
# statistical summary: describe() reports count, mean, std, min, quartiles and max
# for the numeric columns
iris_df.describe()

In [None]:
# data description: info() prints the column names, non-null counts, dtypes and memory usage
iris_df.info()

<h1>Data Visualisation</h1>

In [None]:
# Line plot of sepal length and sepal width against the row index, drawn on one shared axes.
fig, ax = plt.subplots(figsize=(10, 8))
sepal_series = (
    ('sepal_length', '*', 'green'),  # sepal length
    ('sepal_width', 'o', 'blue'),    # sepal width
)
for column, marker, color in sepal_series:
    iris_df[column].plot(kind='line', ax=ax, legend=True, marker=marker, color=color, grid=True)
ax.set_title('Sepal Length and Width')
ax.set_xlabel('Count')
ax.set_ylabel('Length/Width')

The graph shows that sepal length is greater than sepal width

In [None]:
# Line plot of petal length and petal width against the row index, drawn on one shared axes.
fig, ax = plt.subplots(figsize=(10, 8))
petal_series = (
    ('petal_length', '*', 'black'),  # petal length
    ('petal_width', 'o', 'red'),     # petal width
)
for column, marker, color in petal_series:
    iris_df[column].plot(kind='line', ax=ax, legend=True, marker=marker, color=color, grid=True)
ax.set_title('Petal Length and Width')
ax.set_xlabel('Count')
ax.set_ylabel('Length/Width')

The graph shows that petal length is greater than petal width

In [None]:
# All four measurements on a single axes: the sepal and petal line plots merged into one figure.
fig, ax = plt.subplots(figsize=(10, 8))
all_series = (
    ('sepal_length', '*', 'green'),  # sepal length
    ('sepal_width', 'o', 'blue'),    # sepal width
    ('petal_length', '*', 'black'),  # petal length
    ('petal_width', 'o', 'red'),     # petal width
)
for column, marker, color in all_series:
    iris_df[column].plot(kind='line', ax=ax, legend=True, marker=marker, color=color, grid=True)
ax.set_title('Sepal and Petal Length/Width')
ax.set_xlabel('Count')
ax.set_ylabel('Length/Width')

Now we will determine the impact of features on the 'Species' variable.

In [None]:
# creating a pairplot (2-D histograms for every pair of features, coloured by species)
# sns.pairplot is a figure-level function that builds its own figure, so calling
# plt.figure(figsize=...) afterwards does not resize it and only adds an empty figure
# to the output. Use pairplot's `height` / `aspect` arguments to change the size.
sns.pairplot(iris_df, hue='species', kind='hist')
plt.show()

In [None]:
# same pairwise view, using kernel density estimates instead of histograms
# sns.pairplot is a figure-level function that builds its own figure, so calling
# plt.figure(figsize=...) afterwards does not resize it and only adds an empty figure
# to the output. Use pairplot's `height` / `aspect` arguments to change the size.
sns.pairplot(iris_df, hue='species', kind='kde')
plt.show()

In [None]:
# using regression plots to visualise the correlation between each pair of features
# sns.pairplot is a figure-level function that builds its own figure, so calling
# plt.figure(figsize=...) afterwards does not resize it and only adds an empty figure
# to the output. Use pairplot's `height` / `aspect` arguments to change the size.
sns.pairplot(iris_df, hue='species', kind='reg')
plt.show()

In [None]:
# scatter plots to visualise the distribution of the data for each pair of features
# NOTE: set_style changes the global seaborn style, so it also affects every plot drawn
# after this cell.
sns.set_style("whitegrid")
# sns.pairplot is a figure-level function that builds its own figure, so calling
# plt.figure(figsize=...) afterwards does not resize it and only adds an empty figure
# to the output. Use pairplot's `height` / `aspect` arguments to change the size.
sns.pairplot(iris_df, hue='species', kind='scatter')
plt.show()

In [None]:
# Pie chart showing each species' share of the rows, labelled as a percentage.
fig, ax = plt.subplots(figsize=(10, 8))
species_counts = iris_df['species'].value_counts()
species_counts.plot(kind='pie', ax=ax, autopct='%0.2f%%', ylabel='', title='Species')

In [None]:
# Scatter plot of sepal length against sepal width, with one colour per species.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=iris_df, x='sepal_length', y='sepal_width', hue='species', ax=ax)
ax.set_title('Sepal Length and Width')
ax.set_xlabel('Length')
ax.set_ylabel('Width')
plt.show()

In [None]:
# Scatter plot of petal length against petal width, with one colour per species.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=iris_df, x='petal_length', y='petal_width', hue='species', ax=ax)
ax.set_title('Petal Length and Width')
ax.set_xlabel('Length')
ax.set_ylabel('Width')
plt.show()

In [None]:
# Side-by-side box plots of the sepal measurements, grouped by species.
fig, axes = plt.subplots(1, 2, figsize=(10, 8))
sepal_panels = (
    ('sepal_width', 'Sepal Width By Species', 'Sepal Width'),     # left panel
    ('sepal_length', 'Sepal Length By Species', 'Sepal Length'),  # right panel
)
for ax, (column, title, y_label) in zip(axes, sepal_panels):
    sns.boxplot(data=iris_df, x='species', y=column, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Species')
    ax.set_ylabel(y_label)
plt.show()

In [None]:
# Side-by-side box plots of the petal measurements, grouped by species.
fig, axes = plt.subplots(1, 2, figsize=(10, 8))
petal_panels = (
    ('petal_width', 'Petal Width By Species', 'Petal Width'),     # left panel
    ('petal_length', 'Petal Length By Species', 'Petal Length'),  # right panel
)
for ax, (column, title, y_label) in zip(axes, petal_panels):
    sns.boxplot(data=iris_df, x='species', y=column, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Species')
    ax.set_ylabel(y_label)
plt.show()

Box plots make it easier to spot outliers and to see how the data is distributed.
In this case, the box plots tell us that Sepal Length and Petal Length have more outliers (extreme values) than the respective widths.

In [None]:
# distribution plots (histogram + KDE curve) for each of the four features
# sns.displot is a figure-level function: every call creates its own figure, so no
# plt.figure(...) is needed up front (it would only leave an empty figure in the output).
feature_labels = {
    'sepal_length': 'Sepal Length',
    'sepal_width': 'Sepal Width',
    'petal_length': 'Petal Length',
    'petal_width': 'Petal Width',
}
for column, label in feature_labels.items():
    sns.displot(data=iris_df, x=column, label=label, kde=True)
    # The pyplot calls below act on the figure that displot has just created, so each
    # of the four figures gets its own title, axis labels and legend.
    plt.title(f'Distribution of {label}')
    plt.xlabel('Values')
    plt.ylabel('Distribution')
    plt.legend()
plt.show()

The 'Sepal Width' histogram is roughly bell-shaped, which suggests that this feature is approximately normally distributed.

<h1>Machine Learning Model</h1>

In [None]:
# Convert the categorical 'species' column into integer class labels with a LabelEncoder.
# The column is overwritten in place, so from here on iris_df['species'] holds the codes.
le_ = LabelEncoder()
encoded_species = le_.fit_transform(iris_df['species'])  # fit the encoder and transform the data
iris_df['species'] = encoded_species
iris_df

In [None]:
# Select the first four columns by position as the model inputs.
# NOTE(review): assumes the CSV column order puts the four measurements first and
# 'species' last - confirm against the file.
species_features= iris_df.iloc[:,:4] # 4 features(petal and sepal length/width)
species_features

Splitting dataset into training and testing sets.

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is reproducible.
species_target = iris_df['species']
x_train, x_test, y_train, y_test = train_test_split(
    species_features,
    species_target,
    test_size=0.2,
    random_state=3,
)

<h2>Logistic Regression</h2>

In [None]:
# Fit logistic regression on the unscaled training features and report test accuracy in percent.
lr_ = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_ = lr_.fit(x_train, y_train)  # fit() returns the fitted estimator itself
y_predictor = lr_.predict(x_test)
accuracy_score(y_test, y_predictor) * 100

<h3>Scaling</h3>

In [None]:
# Rescale the features with MinMaxScaler. The scaler is fitted on the training split only
# and then applied to the test split, so no test-set statistics are used during fitting.
min_max_model = MinMaxScaler()
X_train = min_max_model.fit_transform(x_train)
X_test = min_max_model.transform(x_test)
# Refit the existing logistic regression estimator on the scaled features.
# (The stray `LogisticRegression(max_iter= 1000)` expression that used to follow this
# call was removed: it only built and discarded an unused estimator.)
lr_.fit(X_train, y_train)
y_predictor = lr_.predict(X_test)
lr_.score(X_test, y_test) * 100

Before feature scaling, Logistic Regression had a 100% accuracy score.

<h3>KNeighbors Classifier</h3>

In [None]:
# Fit k-nearest-neighbours with default settings on the scaled features; test accuracy in percent.
knn = KNeighborsClassifier().fit(X_train, y_train)  # fit() returns the fitted estimator
y_predictor = knn.predict(X_test)
accuracy_score(y_test, y_predictor) * 100

<h3>Support Vector Machine</h3>

In [None]:
# Fit a support vector classifier with default settings on the scaled features; test accuracy in percent.
svc = SVC().fit(X_train, y_train)  # fit() returns the fitted estimator
y_predictor = svc.predict(X_test)
accuracy_score(y_test, y_predictor) * 100

SVC has the highest accuracy score.