In [48]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.metrics import r2_score, f1_score, accuracy_score

In [56]:
# Load the census-income table from the project-relative Data/ folder.
data = pd.read_csv("Data/adult.csv")

# Binary label: 1 where the income column equals ">50K", 0 for every other value.
# NOTE(review): '?' shows up in workclass/occupation in the preview below and is
# presumably this file's missing-value marker; it is read as an ordinary string
# here (not NaN) -- confirm that treating it as its own category is intended.
data["target"] = (data["income"] == ">50K").astype(int)

data.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,target
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,0


In [57]:
# Features are every column except the raw label and its binary encoding.
X_raw = data.drop(columns = ["income", "target"])
y = np.asarray(data["target"])

num_cols = X_raw.select_dtypes(include = "number").columns
cat_cols = X_raw.select_dtypes(exclude = "number").columns

pre = ColumnTransformer([
    # Numeric columns: median-impute, then standardise.
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler())
    ]), num_cols),

    # Non-numeric columns: mode-impute, then one-hot encode. Categories that were
    # not seen while fitting are encoded as all-zero rows (handle_unknown = "ignore").
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy = "most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown = "ignore", sparse_output = False))
    ]), cat_cols)
])

# Split the *raw* rows first so the preprocessing statistics (medians, means/stds,
# category vocabularies) are learned from the training rows only. Fitting on the
# full table before splitting leaks test-set information into the features.
# The split depends only on the row count and random_state, so the same rows end
# up in train/test as when the already-transformed matrix was split.
X_raw_train, X_raw_test, y_train_all, y_test = train_test_split(
    X_raw, y, test_size = 0.20, random_state = 42
)

# Both sub-pipelines produce dense output (sparse_output = False), so the
# ColumnTransformer returns a plain ndarray and no .toarray() conversion is needed.
X_train_all = pre.fit_transform(X_raw_train)
X_test = pre.transform(X_raw_test)

# Full design matrix, kept for any later cell that refers to `X`; it is produced
# with the train-fitted preprocessor rather than one fitted on every row.
X = pre.transform(X_raw)

In [58]:
class LogisticRegression2:
    """Binary logistic regression trained from scratch with per-sample SGD.

    Training runs for at most ``iterations`` passes over the training rows and
    uses a validation set for early stopping; the weights with the lowest
    validation log-loss are the ones kept after ``fit`` returns.

    Attributes:
        learning_rate: Step size applied to every per-sample gradient.
        iterations: Maximum number of passes (epochs) over the training data.
        threshold: Probability cut-off used to turn probabilities into labels.
        betas: Feature weights (``None`` until ``initialize``/``fit`` runs).
        intercept: Bias term.
        train_loss / val_loss: Per-epoch mean log-loss from the latest ``fit``.
        all_train_loss / val_scores: Reserved lists; not written by this class.
    """

    def __init__(self, learning_rate : float = 0.001, iterations : int = 500, threshold : float = 0.5):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.threshold = threshold
        self.betas = None
        self.intercept = 0.0
        self.train_loss = []
        self.all_train_loss = []
        self.val_loss = []
        self.val_scores = []

    def initialize(self, n_features):
        """Reset the weights to zeros for ``n_features`` inputs and the intercept to 0."""
        self.betas = np.zeros(n_features, dtype = float)
        self.intercept = 0.0

    @staticmethod
    def _sigmoid(logits):
        """Logistic function that never exponentiates a positive number.

        ``1 / (1 + exp(-z))`` overflows for large negative ``z``; using
        ``exp(-|z|)`` in both branches keeps the intermediate value in (0, 1].
        """
        logits = np.asarray(logits, dtype = float)
        decay = np.exp(-np.abs(logits))
        return np.where(logits >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def predict_probs(self, x : np.ndarray):
        """Return the predicted probability of class 1 for every row of ``x``."""
        logits = x @ self.betas + self.intercept
        return self._sigmoid(logits)

    def track_loss(self, x : np.ndarray, y : np.ndarray):
        """Return the mean binary cross-entropy of the current weights on ``(x, y)``."""
        probs = self.predict_probs(x)
        # Clip so log(0) can never occur.
        eps = 1e-15
        probs = np.clip(probs, eps, 1 - eps)
        loss = -(y * np.log(probs) + (1 - y) * (np.log(1 - probs)))
        return np.mean(loss)

    def compute_gradients_update_weights(self, x : np.ndarray, y : np.ndarray):
        """Run one SGD epoch: update the weights after every single row, in order.

        Returns:
            ``self``, so calls can be chained.
        """
        for row in range(x.shape[0]):
            logit = x[row] @ self.betas + self.intercept
            prob = self._sigmoid(logit)

            # d(log-loss)/d(logit) for one sample is (p - y).
            der_loss_wrt_logits = prob - y[row]
            beta_gradients = x[row] * der_loss_wrt_logits
            intercept_gradient = der_loss_wrt_logits

            self.betas -= self.learning_rate * beta_gradients
            self.intercept -= self.learning_rate * intercept_gradient

        return self

    def find_threshold(self, x : np.ndarray, y : np.ndarray):
        """Pick the probability cut-off that maximises F1 on ``(x, y)``.

        Every distinct predicted probability is tried as a cut-off, in ascending
        order; ties in F1 keep the smallest cut-off. The winner is stored in
        ``self.threshold`` and returned. If no cut-off achieves F1 > 0 (for
        example ``y`` has no positive rows) the current ``self.threshold`` is
        left untouched and returned, instead of collapsing to 0.0.
        """
        probs = self.predict_probs(x)
        y = np.asarray(y)
        is_pos = y == 1
        is_neg = y == 0

        best_thresh = None
        best_f1 = 0.0
        for thresh in np.unique(probs):
            flagged = probs >= thresh

            tp = np.sum(flagged & is_pos)
            fp = np.sum(flagged & is_neg)
            fn = np.sum(~flagged & is_pos)

            precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
            recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0

            f1 = 2 * ((precision * recall) / (precision + recall)) if (precision + recall) != 0 else 0.0
            if f1 > best_f1:
                best_f1, best_thresh = f1, thresh

        if best_thresh is not None:
            self.threshold = best_thresh

        return self.threshold

    def fit(self, x_train : np.ndarray, y_train : np.ndarray, x_val : np.ndarray, y_val : np.ndarray, patience = 5, min_delta = 0.0):
        """Train with SGD and early stopping on the validation log-loss.

        Args:
            x_train, y_train: Training features and 0/1 labels.
            x_val, y_val: Validation features and labels used for early stopping.
            patience: Stop after this many consecutive epochs without improvement.
            min_delta: Minimum drop in validation loss that counts as improvement.

        Returns:
            ``self``, holding the weights from the epoch with the best
            validation loss -- whether training stopped early or simply ran
            out of iterations.
        """
        x_train = np.asarray(x_train, dtype = float)
        y_train = np.asarray(y_train, dtype = float)
        x_val = np.asarray(x_val, dtype = float)
        y_val = np.asarray(y_val, dtype = float)

        self.initialize(x_train.shape[1])
        # Start the loss curves afresh so a second fit() does not append to the first.
        self.train_loss = []
        self.val_loss = []

        best_betas = None
        best_intercept = None
        epochs_wo_improvement = 0
        best_val_loss = np.inf

        for _ in range(self.iterations):
            self.compute_gradients_update_weights(x_train, y_train)
            self.train_loss.append(self.track_loss(x_train, y_train))

            val_loss = self.track_loss(x_val, y_val)
            self.val_loss.append(val_loss)

            if best_val_loss - val_loss > min_delta:
                best_val_loss = val_loss
                best_betas = self.betas.copy()
                best_intercept = self.intercept
                epochs_wo_improvement = 0
            else:
                epochs_wo_improvement += 1

            if epochs_wo_improvement >= patience:
                break

        # Restore the best snapshot on every exit path, not only on early stop:
        # the last epochs before the iteration budget ran out may have been worse.
        if best_betas is not None:
            self.betas = best_betas
            self.intercept = best_intercept
        return self

    def cross_validate(self, x, y):
        """Evaluate this configuration with 5-fold stratified cross-validation.

        A fresh model with the same hyper-parameters is trained per fold; the
        instance this is called on is not fitted.

        Returns:
            A list with one dict per fold containing ``Fold``, ``Train_Loss``
            (mean of the per-epoch training losses), ``Threshold``, ``Val_F1``
            and ``Accuracy``, each rounded to two decimals.

        Note:
            Early stopping and the F1-optimal threshold are both chosen on the
            same validation fold that is then scored, so ``Val_F1`` and
            ``Accuracy`` are optimistic estimates.
        """
        x = np.asarray(x, dtype = float)
        y = np.asarray(y, dtype = float)

        results = []

        cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
        for fold_idx, (train_idx, val_idx) in enumerate(cv.split(x, y)):
            x_train = x[train_idx]
            y_train = y[train_idx]
            x_val = x[val_idx]
            y_val = y[val_idx]

            model = LogisticRegression2(
                learning_rate = self.learning_rate,
                iterations = self.iterations,
                threshold = self.threshold
            )

            model.fit(x_train, y_train, x_val, y_val)
            avg_train_loss = np.mean(model.train_loss)

            probs = model.predict_probs(x_val)
            thresh = model.find_threshold(x_val, y_val)
            preds = (probs >= thresh).astype(int)

            val_score = accuracy_score(y_val, preds)
            val_f1 = f1_score(y_val, preds)

            results.append({
                "Fold" : fold_idx,
                "Train_Loss" : float(round(avg_train_loss, 2)),
                "Threshold" : float(round(thresh, 2)),
                "Val_F1" : round(val_f1, 2),
                "Accuracy" : round(val_score, 2)
            })

        return results

In [59]:
# Stratified 5-fold cross-validation on the training split only (X_test / y_test
# are not touched here). Each fold trains a fresh model for at most 10 epochs;
# the returned per-fold summary is displayed as the cell output.
class_instance = LogisticRegression2(iterations = 10, learning_rate = 0.001)
class_instance.cross_validate(x = X_train_all, y = y_train_all)

[{'Fold': 0,
  'Train_Loss': 0.33,
  'Threshold': 0.34,
  'Val_F1': 0.7,
  'Accuracy': 0.85},
 {'Fold': 1,
  'Train_Loss': 0.33,
  'Threshold': 0.33,
  'Val_F1': 0.7,
  'Accuracy': 0.84},
 {'Fold': 2,
  'Train_Loss': 0.33,
  'Threshold': 0.31,
  'Val_F1': 0.69,
  'Accuracy': 0.84},
 {'Fold': 3,
  'Train_Loss': 0.33,
  'Threshold': 0.38,
  'Val_F1': 0.7,
  'Accuracy': 0.85},
 {'Fold': 4,
  'Train_Loss': 0.33,
  'Threshold': 0.29,
  'Val_F1': 0.67,
  'Accuracy': 0.82}]