# Logistic Regression: Classification Problem

# Data Gathering

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd

In [3]:

path = r"https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/iris.csv"

df = pd.read_csv(path)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
df.shape

(150, 5)

In [6]:
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [7]:
# check for missing data
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [8]:
# check for duplicated rows
df.duplicated().sum()

np.int64(1)

# Separate X and Y features
    Y : Species X : all remaining features, sepal length/width, petal length/width

In [9]:
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [10]:
X = df.drop(columns=['species'])
Y = df[['species']]

In [11]:
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [12]:
Y.head()

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa


# Data preprocessing and data cleaning 

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [14]:
pre = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
).set_output(transform='pandas')

In [15]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


# Train Test Split
    rate at which you can divide:

    100%

    60 % training 40% testing

    70% training 30% testing

    80% training 20% testing

In [16]:
from sklearn.model_selection import train_test_split

random_state = this is generally used for reproducing model results

In [17]:
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,train_size=0.7,test_size=0.3,random_state=21)

In [18]:
xtrain.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
36,-0.41601,1.019004,-1.397064,-1.315444
37,-1.143017,1.249201,-1.340227,-1.447076
13,-1.870024,-0.131979,-1.510739,-1.447076
68,0.432165,-1.973554,0.421734,0.395774
11,-1.264185,0.788808,-1.226552,-1.315444


In [19]:
xtrain.index

Index([ 36,  37,  13,  68,  11,  52,  49,  65, 118,  55,
       ...
       122,  61, 110,  72,  98, 120, 112,  48,   4,  56],
      dtype='int64', length=105)

In [20]:
xtest.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
92,-0.052506,-1.052767,0.137547,0.000878
44,-0.900681,1.709595,-1.056039,-1.05218
7,-1.021849,0.788808,-1.283389,-1.315444
21,-0.900681,1.479398,-1.283389,-1.05218
95,-0.173674,-0.131979,0.251221,0.000878


In [21]:
ytrain.head()

Unnamed: 0,species
36,setosa
37,setosa
13,setosa
68,versicolor
11,setosa


In [22]:
ytest.head()

Unnamed: 0,species
92,versicolor
44,setosa
7,setosa
21,setosa
95,versicolor


# build the model

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
model = LogisticRegression()
model.fit(xtrain,ytrain)

In [25]:
model.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [26]:
ypred_train = model.predict(xtrain)

In [27]:
yprob_train = model.predict_proba(xtrain)

In [28]:
yprob_train[0]

array([9.66462322e-01, 3.35371230e-02, 5.54627639e-07])

In [29]:
model.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [30]:
ypred_train[:5]

array(['setosa', 'setosa', 'setosa', 'versicolor', 'setosa'], dtype=object)

In [31]:
ytrain.head()

Unnamed: 0,species
36,setosa
37,setosa
13,setosa
68,versicolor
11,setosa


# Model Evaluation

In [32]:
model.score(xtrain,ytrain)

0.9809523809523809

In [33]:
model.score(xtest,ytest)

0.9333333333333333