In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Model LogisticRegression (MLE)

In [2]:
class LogisticRegression:
    """Binary logistic regression trained by batch gradient descent (MLE).

    A column of ones is prepended to the features, so ``theta[0]`` is the
    intercept and ``theta[1:]`` are the feature weights.

    Parameters
    ----------
    lr : float
        Gradient-descent step size.
    eps : float
        Used in two places: as the offset inside the logarithms of ``loss``
        (so ``log(0)`` is never evaluated) and as the convergence tolerance
        on the L1 norm of a parameter update in ``fit``.
    num_iter : int
        Maximum number of gradient steps.
    verbose : bool
        When truthy, the progress-bar description shows the current loss.

    Attributes
    ----------
    theta : ndarray of shape (n_features + 1, 1) or None
        Learned parameters; ``None`` until ``fit`` has been called.
    """

    def __init__(self, lr = 0.1, eps = 1e-6, num_iter = 1000, verbose = False):
        self.lr = lr
        self.eps = eps
        self.num_iter = num_iter
        self.verbose = verbose
        self.theta = None  # populated by fit()

    def add_intercept(self, X):
        """Return ``X`` as a 2-D float array with a leading column of ones.

        A 1-D input is treated as a single feature (one column).
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def sigmoid(self, x):
        """Element-wise logistic function ``1 / (1 + exp(-x))``.

        Evaluated through ``exp(-|x|)``, which lies in (0, 1], so the
        exponential cannot overflow for logits of large magnitude, and
        ``sigmoid(0)`` is exactly 0.5.
        """
        z = np.asarray(x, dtype=float)
        e = np.exp(-np.abs(z))
        # For z >= 0: 1 / (1 + exp(-z)); for z < 0: exp(z) / (1 + exp(z)).
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def loss(self, m, y, y_hat):
        """Mean binary cross-entropy over ``m`` samples.

        ``y`` and ``y_hat`` are column vectors of shape (m, 1); the result is
        a (1, 1) array. ``eps`` is added inside each logarithm so a predicted
        probability of exactly 0 or 1 does not produce ``log(0)``.
        """
        return -(1/m)*(y.T @ np.log(y_hat + self.eps) + (1-y).T @ np.log(1-y_hat + self.eps))

    def grad_desc(self, X, y):
        """Return the parameters after one gradient step from ``self.theta``.

        ``X`` must already contain the intercept column and ``y`` must be a
        column vector. ``self.theta`` itself is not modified.
        """
        m = len(y)
        y_hat = self.sigmoid(np.dot(X, self.theta))
        grad = (1/m) * X.T @ (y_hat - y)
        return self.theta - self.lr * grad

    def fit(self, X, y):
        """Estimate ``theta`` from features ``X`` and 0/1 labels ``y``.

        ``X`` may be 1-D (single feature) or 2-D; ``y`` may be 1-D or a
        column. Array-likes such as pandas objects are accepted. Iteration
        stops after ``num_iter`` steps, or earlier once the L1 norm of an
        update falls below ``eps``.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` disagree on the sample count.
        """
        # Add the intercept first: this also promotes a 1-D X to a column,
        # so the shape can be read safely afterwards.
        X = self.add_intercept(X)
        y = np.asarray(y, dtype=float).reshape(-1, 1)

        samples = X.shape[0]
        if samples == 0:
            raise ValueError('cannot fit on an empty dataset')
        if y.shape[0] != samples:
            raise ValueError(
                f'X has {samples} samples but y has {y.shape[0]} labels'
            )

        self.theta = np.zeros((X.shape[1], 1))

        pbar = tqdm(range(self.num_iter))

        for epoch in pbar:
            new_theta = self.grad_desc(X, y)
            converged = np.linalg.norm(new_theta - self.theta, 1) < self.eps

            if self.verbose and not converged:
                # Reported loss belongs to the parameters this step started
                # from; it is only computed when it is going to be shown.
                loss = self.loss(samples, y, self.sigmoid(X @ self.theta))
                pbar.set_description(f'epoch {epoch}: loss = {loss.item():.2f}')

            self.theta = new_theta

            if converged:
                break

    def predict(self, X):
        """Return 0/1 class predictions as an int array of shape (n_samples, 1).

        A sample is assigned class 1 when its predicted probability is at
        least 0.5.

        Raises
        ------
        AttributeError
            If the model has not been fitted.
        """
        if getattr(self, 'theta', None) is None:
            raise AttributeError('model is not fitted yet; call fit(X, y) first')
        X = self.add_intercept(X)
        y_pred = self.sigmoid(X @ self.theta)
        return (y_pred >= 0.5).astype(int)

    def accuracy(self, X, y):
        """Print and return the fraction of samples in ``X`` predicted as ``y``.

        Raises
        ------
        ValueError
            If the number of labels differs from the number of predictions.
        """
        y_pred = self.predict(X)
        # Force a column so the comparison is element-wise, never broadcast.
        y = np.asarray(y).reshape(-1, 1)
        if y.shape != y_pred.shape:
            raise ValueError(
                f'expected {y_pred.shape[0]} labels, got {y.shape[0]}'
            )
        accuracy = np.mean(y_pred == y)
        print(f'Accuracy: {accuracy:.3}')
        return float(accuracy)

# Using the built model to solve problems

## Iris

### Exploring data

In [3]:
# Load the iris data; the relative path means 'iris.csv' must sit in the working directory.
df_iris = pd.read_csv('iris.csv')

In [4]:
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
df_iris.shape

(150, 5)

In [6]:
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


The `species` column has dtype `object` (it holds strings, not numbers), so we need to encode it as integers before it can be used as the target.

In [7]:
# Count the distinct values in the species column
df_iris.species.nunique()

3

### Processing data

In [8]:
# Keep only two of the species so the task becomes binary classification.
mask = df_iris['species'].isin(['setosa', 'versicolor'])
df_iris = df_iris.loc[mask].copy()

In [9]:
df_iris.shape

(100, 5)

In [10]:
# Encode the two remaining species names as the 0/1 target.
mapping = {'setosa': 0, 'versicolor': 1}
df_iris['species'] = df_iris['species'].map(mapping)
df_iris['species']

0     0
1     0
2     0
3     0
4     0
     ..
95    1
96    1
97    1
98    1
99    1
Name: species, Length: 100, dtype: int64

In [11]:
# Select features and target by column name and use every remaining row,
# rather than relying on a hard-coded row count and on column position.
X_iris = df_iris.drop(columns='species')
y_iris = df_iris['species']

# 60/40 split; the fixed seed makes the shuffle reproducible.
# Each part is converted to a NumPy array for the model.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = (
    part.to_numpy()
    for part in train_test_split(
        X_iris, y_iris,
        test_size = 0.4,
        random_state = 42,
        shuffle = True
    )
)

# Sanity checks: no rows lost, features and labels stay aligned.
assert len(X_train_iris) + len(X_test_iris) == len(X_iris)
assert len(X_train_iris) == len(y_train_iris)
assert len(X_test_iris) == len(y_test_iris)

### Using model

In [12]:
# Same hyperparameters as the class defaults, except that the loss is shown while training.
model = LogisticRegression(lr=0.1, eps=1e-6, num_iter=1000, verbose=True)

In [13]:
# Train on the training split; because verbose=True the progress bar description shows the loss.
model.fit(X_train_iris, y_train_iris)

epoch 999: loss = 0.01: 100%|██████████| 1000/1000 [00:01<00:00, 946.30it/s]


### Testing

In [14]:
# Decode the integer classes back to species names for a side-by-side comparison.
# A separate name is used so the encoder dict `mapping` is not overwritten with its inverse.
label_names = {0:'setosa', 1:'versicolor'}

y_pred_sr = pd.Series(model.predict(X_test_iris).ravel(), name ='Predict').map(label_names)
y_test_sr = pd.Series(y_test_iris.ravel(), name = 'Result').map(label_names)

# End the cell with the frame itself so the notebook renders it as a table.
df_cmpr = pd.concat([y_pred_sr, y_test_sr], axis = 1)
df_cmpr

       Predict      Result
0   versicolor  versicolor
1   versicolor  versicolor
2   versicolor  versicolor
3       setosa      setosa
4       setosa      setosa
5       setosa      setosa
6       setosa      setosa
7   versicolor  versicolor
8       setosa      setosa
9       setosa      setosa
10      setosa      setosa
11      setosa      setosa
12  versicolor  versicolor
13      setosa      setosa
14  versicolor  versicolor
15      setosa      setosa
16  versicolor  versicolor
17  versicolor  versicolor
18      setosa      setosa
19      setosa      setosa
20  versicolor  versicolor
21  versicolor  versicolor
22      setosa      setosa
23      setosa      setosa
24  versicolor  versicolor
25      setosa      setosa
26      setosa      setosa
27  versicolor  versicolor
28      setosa      setosa
29  versicolor  versicolor
30      setosa      setosa
31      setosa      setosa
32  versicolor  versicolor
33      setosa      setosa
34  versicolor  versicolor
35      setosa      setosa
3

In [15]:
# Report the accuracy on the held-out test split.
model.accuracy(X_test_iris, y_test_iris)

Accuracy: 1.0
