# Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor


import matplotlib.pyplot as plt
import seaborn as sns

# Problem Statement

In [None]:
To predict SepalLengthCm values by using various independent variables:

Dependent Variable :  SepalLengthCm
Independent Varaible : 
    

# 2. Data Gathering

In [2]:
df = pd.read_csv("Iris.csv")
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


# 3. Exploratory Data Analysis

## 3.1 Id

In [4]:
df["Id"]

0        1
1        2
2        3
3        4
4        5
      ... 
145    146
146    147
147    148
148    149
149    150
Name: Id, Length: 150, dtype: int64

In [5]:
df["Id"].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150], dtype=int64)

In [6]:
df["Id"].nunique()

150

In [7]:
df["Id"].isnull().sum()

0

## 3.2 SepalWidthCm

In [8]:
df["SepalWidthCm"]

0      3.5
1      3.0
2      3.2
3      3.1
4      3.6
      ... 
145    3.0
146    2.5
147    3.0
148    3.4
149    3.0
Name: SepalWidthCm, Length: 150, dtype: float64

In [9]:
df["SepalWidthCm"].nunique()

23

In [10]:
df["SepalWidthCm"].isnull().sum()

0

## 3.3 PetalLengthCm

In [11]:
df["PetalLengthCm"]

0      1.4
1      1.4
2      1.3
3      1.5
4      1.4
      ... 
145    5.2
146    5.0
147    5.2
148    5.4
149    5.1
Name: PetalLengthCm, Length: 150, dtype: float64

In [12]:
df["PetalLengthCm"].nunique()

43

##  3.4 PetalWidthCm

In [13]:
df["PetalWidthCm"].nunique()

22

## 3.5 Species

In [14]:
df["Species"]

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object

In [15]:
df["Species"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [16]:
df["Species"].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [16]:
df["Species"].isnull().sum()

0

High   Low

20     80   >> Imbalanced Data
50     50   >> Balanced Data

# 4. Feature Engineering

In [17]:
df.drop("Id", axis=1, inplace = True)

In [18]:
df["Species"].value_counts().to_dict()

{'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50}

In [19]:
# df["Species"].replace({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}, inplace = True)

In [20]:
df = pd.get_dummies(df, columns = ["Species"], prefix = "", prefix_sep = "")

In [21]:
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Iris-setosa,Iris-versicolor,Iris-virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SepalLengthCm    150 non-null    float64
 1   SepalWidthCm     150 non-null    float64
 2   PetalLengthCm    150 non-null    float64
 3   PetalWidthCm     150 non-null    float64
 4   Iris-setosa      150 non-null    uint8  
 5   Iris-versicolor  150 non-null    uint8  
 6   Iris-virginica   150 non-null    uint8  
dtypes: float64(4), uint8(3)
memory usage: 5.3 KB


In [23]:
x = df.drop("SepalLengthCm", axis = 1)
y = df["SepalLengthCm"]

In [24]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=10)

In [25]:
norm_scaler = MinMaxScaler()
norm_scaler.fit(x_train)
scaled_array = norm_scaler.transform(x_train)

In [46]:
file = open("normal_scaler.obj","wb")
pickle.dump(norm_scaler, file)
file.close()

In [26]:
linear_model = LinearRegression()
linear_model.fit(x_train,y_train)

In [27]:
import pickle

In [28]:
with open("linear_model.pkl", "wb") as file:
    pickle.dump(linear_model, file)

In [29]:
# 'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2

In [32]:
scaled_x_test = norm_scaler.transform(x_test)
scaled_x_test = pd.DataFrame(scaled_x_test)

In [33]:
scaled_x_test.head()

Unnamed: 0,0,1,2,3,4,5
0,0.045455,0.576271,0.5,0.0,1.0,0.0
1,0.227273,0.728814,0.75,0.0,0.0,1.0
2,0.681818,0.084746,0.041667,1.0,0.0,0.0
3,0.363636,0.610169,0.541667,0.0,1.0,0.0
4,0.5,0.067797,0.041667,1.0,0.0,0.0


In [34]:
linear_model.predict(scaled_x_test.head())



array([1.6212492 , 1.40014758, 2.38655709, 1.81156658, 2.27108293])

In [35]:
SepalWidthCm =2.3
PetalLengthCm = 4.4
PetalWidthCm = 1.3
Species = 'Iris-versicolor'

In [36]:
columns_list = x.columns
columns_list

Index(['SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica'],
      dtype='object')

In [38]:
file = open("columns_list.obj", "wb")
pickle.dump(columns_list, file)
file.close()

In [39]:
array = np.zeros(len(columns_list))
array

array([0., 0., 0., 0., 0., 0.])

In [40]:
array[0] = SepalWidthCm
array[1] = PetalLengthCm
array[2] = PetalWidthCm
index = np.where(columns_list == Species)[0][0]
print(index)
array[index] = 1

4


In [41]:
array

array([2.3, 4.4, 1.3, 0. , 1. , 0. ])

In [43]:
scaled_array = norm_scaler.transform([array])



In [45]:
prediction = linear_model.predict(scaled_array)
prediction[0]



1.6212491977811783

In [None]:
['SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', "Pune", "Mumbai", "Banglore"]

In [None]:
city = "Pune"

In [39]:
city_index = np.where(columns_list == city)[0][0]
array[city_index] = 1