# Iris dataset - Logistic Regression (binary classifier)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
ird=pd.read_csv("ird.csv")

In [3]:
ird

Unnamed: 0,slno,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,2
146,147,6.3,2.5,5.0,1.9,2
147,148,6.5,3.0,5.2,2.0,2
148,149,6.2,3.4,5.4,2.3,2


In [4]:
ird.Species.unique()

array([0, 1, 2], dtype=int64)

In [5]:
# define features and outcomes

x = ird.iloc[:,1:-1]  # colon means all rows, in columns 1 (0 index) to second to last
y = ird.Species

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state = 0)

In [7]:
x_test

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
114,5.8,2.8,5.1,2.4
62,6.0,2.2,4.0,1.0
33,5.5,4.2,1.4,0.2
107,7.3,2.9,6.3,1.8
7,5.0,3.4,1.5,0.2
100,6.3,3.3,6.0,2.5
40,5.0,3.5,1.3,0.3
86,6.7,3.1,4.7,1.5
76,6.8,2.8,4.8,1.4
71,6.1,2.8,4.0,1.3


In [8]:
# Do we need to standardise the data? - no, it is all in the same scale of measurement

# can help to do even if all in same scale 

from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
stdsc.fit(x_train)
x_train_std = stdsc.transform(x_train)
x_test_std = stdsc.transform(x_test)

In [9]:
# Build the model

from sklearn.linear_model import LogisticRegression

In [10]:
model = LogisticRegression()

In [11]:
model.fit(x_train_std, y_train)

LogisticRegression()

In [12]:
# check score
print("Train score: ", model. score(x_train_std,y_train))

print("Test score: ", model. score(x_test_std,y_test))

Train score:  0.9583333333333334
Test score:  1.0


In [13]:
y_train_pred = model.predict(x_train_std)
y_test_pred = model.predict(x_test_std)


In [14]:
y_train_pred

array([2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0, 0, 1, 2, 2, 2, 2, 1, 2,
       1, 1, 2, 1, 1, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 2, 0, 0, 2, 1, 0, 0,
       1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2,
       0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0, 0, 1, 0, 2, 1, 2, 1,
       0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1, 2, 0, 1, 2, 2, 0, 1,
       1, 2, 1, 0, 0, 0, 2, 1, 2, 0], dtype=int64)

In [15]:
pd.crosstab(y_train, y_train_pred)

col_0,0,1,2
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,39,0,0
1,0,34,3
2,0,2,42


In [16]:
x_train.shape[0]  # shape of the series, how many rows (120) in the train data (out of 150)

120

In [17]:
(39+34+42)/x_train.shape[0]  # model score of train data set

0.9583333333333334

# New dataset - practice exercise, banking investment customer data

In [18]:
bank = pd.read_csv("banking investment case study.csv")  

In [137]:
# Tutors code to confirm Gold_Fund is binary

import numpy as np
bank['Gold_Fund']=np.where(bank.Gold_Fund=="No",0,1) # change Gold_Fund to binary data if not already

In [138]:
bank

Unnamed: 0,ID,Gender,AMB,AccountSince,RelationshipSize,AvgMonthlyTxn,DematAccounts,FD_Nos,MF_nos,Equity_mfs,Debt_mfs,Balanced_Funds,ELSS,PMS_accounts,Trading_Accounts,Speciality_funds,Gold_Fund
0,25,1,297,22,2,14,0,1,1,0,0,0,0,0,0,0,0
1,46,1,138,56,7,22,2,1,2,0,1,0,1,1,0,0,0
2,47,1,228,2,1,2,0,0,0,0,0,0,0,0,0,0,0
3,61,1,190,16,1,16,0,0,0,0,0,0,1,0,0,0,0
4,79,1,187,14,1,14,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869,12394,1,147,14,1,14,0,0,0,0,0,1,0,0,0,0,1
870,12407,0,188,30,2,28,0,1,1,0,0,0,0,0,0,0,0
871,12424,1,23,10,1,10,0,0,0,0,0,0,0,0,0,0,0
872,12427,1,43,22,2,16,1,0,0,0,1,0,0,0,0,0,0


In [139]:
x = bank.iloc[:, 1: -1]   # observing patterns in all data to predict gold fund owners
y = bank.Gold_Fund   # predicting if a customer will have a gold fund?

In [140]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state = 0)

In [141]:
x_train

Unnamed: 0,Gender,AMB,AccountSince,RelationshipSize,AvgMonthlyTxn,DematAccounts,FD_Nos,MF_nos,Equity_mfs,Debt_mfs,Balanced_Funds,ELSS,PMS_accounts,Trading_Accounts,Speciality_funds
54,1,302,20,5,6,0,0,1,0,0,1,0,0,0,0
45,1,135,60,8,16,3,1,0,1,0,2,1,0,0,2
172,0,135,26,4,16,0,0,2,0,1,0,1,0,0,0
538,1,259,8,2,4,0,0,0,0,0,0,0,0,0,0
819,1,302,8,1,8,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,0,293,14,2,12,1,0,0,0,0,0,0,0,0,0
192,0,192,12,1,12,0,0,0,0,0,0,0,0,0,0
629,1,393,26,7,2,1,0,0,0,0,2,0,0,0,0
559,1,299,38,7,12,1,2,1,0,1,0,0,0,0,0


In [142]:
# Do we need to standardise the data? - no, it is all in the same scale of measurement

# can help to do even if all in same scale 

stdsc = StandardScaler()
stdsc.fit(x_train)
x_train_std = stdsc.transform(x_train)
x_test_std = stdsc.transform(x_test)

In [143]:
model1 = LogisticRegression()

In [144]:
model1.fit(x_train_std, y_train)

LogisticRegression()

In [145]:
# check score
print("Train score: ", model1. score(x_train_std,y_train))

print("Test score: ", model1. score(x_test_std,y_test))

Train score:  0.932761087267525
Test score:  0.92


In [146]:
y_train_pred = model1.predict(x_train_std)
y_test_pred = model1.predict(x_test_std)


In [147]:
pd.crosstab(y_train, y_train_pred)

col_0,0,1
Gold_Fund,Unnamed: 1_level_1,Unnamed: 2_level_1
0,637,4
1,43,15


# y_train_pred = bank --- Look at his sheet for recall

In [148]:
(637 + 15)/x_train.shape[0]   # score for training model (correct predictions of model)

0.932761087267525

In [149]:
y_train_pred = model1.predict(x_train_std)

In [150]:
# Test precision and recall

from sklearn.metrics import precision_score, recall_score

In [151]:
precision_score(y_train,y_train_pred)

0.7894736842105263

In [152]:
recall_score(y_train, y_train_pred)

0.25862068965517243

In [153]:
from sklearn.metrics import classification_report
print(classification_report(y_train, y_train_pred))


              precision    recall  f1-score   support

           0       0.94      0.99      0.96       641
           1       0.79      0.26      0.39        58

    accuracy                           0.93       699
   macro avg       0.86      0.63      0.68       699
weighted avg       0.92      0.93      0.92       699



# Working with imbalanced data

In [154]:
# Oversampling - need extra data, taking extra datapoints from the same data

# multiple instances of the same datapoint.

# Need less data, undersampling - taking random samples from a dataset to make it equal 
# to the size of another

In [155]:
# Oversampling Technique

# Separate library for this - imblearn 



from imblearn.over_sampling import RandomOverSampler 



In [156]:
oversampler = RandomOverSampler()

In [157]:
x_over, y_over = oversampler.fit_resample(x, y)

In [158]:
y.value_counts()

0    803
1     71
Name: Gold_Fund, dtype: int64

In [159]:
y_over.value_counts()

0    803
1    803
Name: Gold_Fund, dtype: int64

In [160]:
# build a model with the oversampled data, see if precision improves

In [161]:
x_over_train, x_over_test, y_over_train, y_over_test = train_test_split(x_over, y_over, test_size=0.20, random_state=0)

In [162]:
# Do we need to standardise the data? - no, it is all in the same scale of measurement

# can help to do even if all in same scale 

from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
stdsc.fit(x_train)
x_over_train_std = stdsc.transform(x_over_train)
x_over_test_std = stdsc.transform(x_over_test)

In [163]:
model2 = LogisticRegression()

In [164]:
model2.fit(x_over_train_std, y_over_train)

LogisticRegression()

In [169]:
# check score
print("Train score: ", model2.score(x_over_train_std,y_over_train))

print("Test score: ", model2.score(x_over_test_std,y_over_test))

Train score:  0.7616822429906542
Test score:  0.7484472049689441


In [170]:
y_over_train_pred = model2.predict(x_over_train_std)
y_over_test_pred = model2.predict(x_over_test_std)

In [171]:
pd.crosstab(y_over_train, y_over_train_pred)

col_0,0,1
Gold_Fund,Unnamed: 1_level_1,Unnamed: 2_level_1
0,504,135
1,171,474


In [172]:
from sklearn.metrics import classification_report
print(classification_report(y_over_train, y_over_train_pred))

              precision    recall  f1-score   support

           0       0.75      0.79      0.77       639
           1       0.78      0.73      0.76       645

    accuracy                           0.76      1284
   macro avg       0.76      0.76      0.76      1284
weighted avg       0.76      0.76      0.76      1284



In [173]:
# Crosstab shows we have more recall - lose accuracy as a result though

# Over and under sampling could wander into changing the model to much to fit the data for results
# rather than usig a less accurate model (precise and recall) for the data we have 
# overfitting the model to the data 

In [174]:
# Underfitting

In [175]:
from imblearn.under_sampling import RandomUnderSampler 

In [176]:
# Do under fitting example 

# Smote

# Another dataset, new example

In [177]:
# new sheert