In [47]:
import seaborn as sn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [4]:
# data
data = sn.load_dataset('iris')

## Inspection

In [5]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
data.shape
# 5 columns, 150 rows

(150, 5)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [9]:
# copy for target
df = data.copy()

In [10]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [11]:
# view all columns & rows
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [12]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [13]:
# revert to reasonable viewing size
pd.options.display.max_rows = 20

In [14]:
# assign species categories integers
df['species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [16]:
recode1 = {'setosa' : 0, 'versicolor' : 1, 'virginica' : 2} 
df['speciesR'] = df['species'].map(recode1) 

In [18]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,speciesR
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2
146,6.3,2.5,5.0,1.9,virginica,2
147,6.5,3.0,5.2,2.0,virginica,2
148,6.2,3.4,5.4,2.3,virginica,2


## task is to predict above median sepal length

In [19]:
list(df.columns)

['sepal_length',
 'sepal_width',
 'petal_length',
 'petal_width',
 'species',
 'speciesR']

In [20]:
# drop species since it has been recoded
df = df.drop(['species'], axis = 1)

In [21]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,speciesR
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [22]:
df['sepal_length'].min()

4.3

In [23]:
df['sepal_length'].max()

7.9

In [29]:
df['sepal_length'].median()

5.8

In [26]:
targets = np.where(df['sepal_length'] >
                   df['sepal_length'].median(), 1,0)

In [27]:
df['above_median_sepal_length'] = targets

In [28]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,speciesR,above_median_sepal_length
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0


In [30]:
targets.sum() / targets.shape[0]

0.4666666666666667

In [31]:
data_with_targets = df

In [32]:
data_with_targets

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,speciesR,above_median_sepal_length
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,1
146,6.3,2.5,5.0,1.9,2,1
147,6.5,3.0,5.2,2.0,2,1
148,6.2,3.4,5.4,2.3,2,1


In [33]:
data_with_targets.shape

(150, 6)

In [34]:
# select inputs for regression
data_with_targets.iloc[:, :-1]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,speciesR
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [35]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

In [36]:
list(unscaled_inputs.columns)

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'speciesR']

## Standardize the data

In [38]:
sepal_length_scaler = StandardScaler()

In [39]:
sepal_length_scaler.fit(unscaled_inputs)

StandardScaler()

In [40]:
scaled_inputs = sepal_length_scaler.transform(unscaled_inputs)

In [41]:
scaled_inputs

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00, -1.22474487e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00, -1.22474487e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00, -1.22474487e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00, -1.22474487e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00, -1.22474487e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00, -1.22474487e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00, -1.22474487e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00, -1.22474487e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00, -1.22474487e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00

## Split data into train & test, then shuffle

In [44]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.6, random_state = 20)

In [45]:
print(x_train.shape, y_train.shape)

(90, 5) (90,)


In [46]:
print(x_test.shape, y_test.shape)

(60, 5) (60,)


## Logistic Regression with SkLearn

In [48]:
# train the model
reg = LogisticRegression()

In [49]:
reg.fit(x_train, y_train)

LogisticRegression()

In [50]:
# evaluate the model accuracy
reg.score(x_train, y_train)

0.9777777777777777

In [51]:
# In conclusion: utilizing logistic regression to predict above median sepal length has an accuracy of above 97%
# i.e.: the model learned to classify ~97% of the observations correctly

## Manually inspect accuracy of the model

In [54]:
model_outputs = reg.predict(x_train)
model_outputs

array([1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0])

In [55]:
# to view entire array
# np.set_printoptions(threshold=np.inf)
# not needed here

In [56]:
y_train

array([1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0])

In [57]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [58]:
## finding the intercept & coefficients

In [59]:
reg.intercept_

array([-0.55688311])

In [60]:
reg.coef_

array([[ 2.69563441,  0.25639782,  1.09128681,  0.87163306, -0.14022958]])

In [62]:
unscaled_inputs.columns.values

array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'speciesR'], dtype=object)

In [63]:
feature_name = unscaled_inputs.columns.values

In [64]:
summary_table = pd.DataFrame(columns = ['Feature Name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)