In [297]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [359]:
# Scratch check: multiplying two equal-length NumPy arrays is element-wise.
ololo1 = np.arange(1, 4)
ololo2 = np.arange(3, 6)
np.multiply(ololo1, ololo2)

array([ 3,  8, 15])

In [416]:
class LogRegression():
    """Binary logistic regression trained with mini-batch gradient steps.

    Targets must be encoded as -1 / +1. Training minimises the mean logistic
    loss ``log(1 + exp(-y * (w . x)))`` and stops early when the loss on the
    validation set stops improving.

    Parameters
    ----------
    lr : float
        Learning rate applied to the (summed, not averaged) mini-batch gradient.
    batch_size : int
        Number of rows drawn (with replacement) for every gradient step.
    max_iterations : int
        Upper bound on the number of gradient steps.
    min_profit_step : float
        Minimal relative decrease of the validation loss that still counts as
        progress; a smaller (or negative) decrease stops training.
    random_state : int or None
        Seed for mini-batch sampling. ``None`` (default) draws from NumPy's
        global random state, exactly as before this parameter existed.
    """

    def __init__(self, lr=0.01, batch_size=10, max_iterations=1000, min_profit_step=0.001,
                 random_state=None):
        self.lr = lr
        self.batch_size = batch_size
        self.max_iterations = max_iterations
        # Stored on the instance so fit() does not depend on a notebook global.
        self.min_profit_step = min_profit_step
        self.random_state = random_state
        self.W = None

    @staticmethod
    def _add_bias(X):
        """Return X with a leading column of ones (the intercept feature)."""
        return np.concatenate((np.ones([X.shape[0], 1]), X), axis=1)

    def get_cost(self, X, y):
        """Mean logistic loss of the current weights.

        X must already contain the leading bias column; y holds -1 / +1 labels.
        """
        margins = y * np.dot(X, self.W).flatten()
        # logaddexp(0, -m) == log(1 + exp(-m)) but does not overflow for large |m|.
        return np.mean(np.logaddexp(0, -margins))

    def fit(self, X_train, y_train, X_val, y_val):
        """Train the weights and return the validation accuracy after each step.

        Each iteration samples ``batch_size`` training rows with replacement,
        takes one gradient step and evaluates the loss on the validation set.
        If the loss got worse, or improved by less than
        ``min_profit_step * previous_loss``, the last step is rolled back and
        training stops.

        Returns
        -------
        list of float
            Validation accuracy recorded after every accepted step.
        """
        X_ = self._add_bias(X_train)
        X_val_ = self._add_bias(X_val)
        # Plain arrays so that integer-array indexing below is positional.
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)

        # One weight per feature plus the bias, sized from the training data itself.
        self.W = np.ones([X_.shape[1], 1]) * 0.0001
        # Both the np.random module and a RandomState instance expose randint().
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        n = X_.shape[0]
        accuracy_v = []
        previous_cost = None
        iterations = 0

        while iterations < self.max_iterations:
            sample_ids = rng.randint(0, n, self.batch_size)
            sample_X = X_[sample_ids, :]
            sample_y = y_train[sample_ids]

            # Descent direction of the logistic loss summed over the batch:
            #   sum_i  y_i * x_i / (1 + exp(y_i * w.x_i))
            margins = sample_y * np.dot(sample_X, self.W).flatten()
            # exp(-logaddexp(0, m)) == 1 / (1 + exp(m)) without overflow.
            coefficients = sample_y * np.exp(-np.logaddexp(0, margins))
            step = np.dot(sample_X.T, coefficients).reshape(self.W.shape)

            previous_W = self.W
            self.W = self.W + self.lr * step

            cost = self.get_cost(X_val_, y_val)
            # `is not None` so that a previous cost of exactly 0.0 is still compared.
            if previous_cost is not None:
                cost_diff = previous_cost - cost
                if (cost_diff < 0) or (cost_diff < self.min_profit_step * previous_cost):
                    # Reject the step that failed to improve the validation loss.
                    self.W = previous_W
                    break
            previous_cost = cost
            accuracy_v.append(accuracy_score(y_val, self.predict(X_val)))
            iterations += 1
        return accuracy_v

    def predict(self, X):
        """Predict labels for X (without bias column) as a list of 1 / -1."""
        scores = np.dot(self._add_bias(X), self.W).flatten()
        # sigmoid(score) > 0.5 is equivalent to score > 0; comparing the score
        # directly avoids overflow in exp() for large negative scores.
        return np.where(scores > 0, 1, -1).tolist()

### Task 1 (Logistic regression)

In [291]:
# Read the spam dataset; every label other than 1 is recoded to -1.
df = pd.read_csv('../data/spam.csv')
df['label'] = df['label'].map(lambda value: value if value == 1 else -1)

# Features are all columns except the target.
X = df[df.columns.difference(['label'])].values
y = df['label'].values

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling statistics are estimated on the training part only and reused for validation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

Library algorithm

In [293]:
# Baseline: scikit-learn's LogisticRegression fitted on the scaled training data.
lib_logr = LogisticRegression(solver='liblinear', random_state=42)
lib_logr.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred) in that order.
print(f"Accuracy: {accuracy_score(y_val, lib_logr.predict(X_val))}")

Accuracy: 0.9185667752442996


Custom implementation (mini-batch gradient descent)

In [431]:
# Hyperparameters of the hand-written model.
lr = 0.001
batch_size = 16
min_profit_step = 0.0001

# Mini-batches are sampled from NumPy's global RNG during fit(); seeding it here
# makes the reported accuracy and the curve below reproducible across runs.
np.random.seed(42)

logr = LogRegression(lr=lr, batch_size=batch_size, max_iterations=100, min_profit_step=min_profit_step)
accuracy_v = logr.fit(X_train, y_train, X_val, y_val)
# accuracy_score expects (y_true, y_pred) in that order.
print(f"Accuracy: {accuracy_score(y_val, logr.predict(X_val))}")

# Validation accuracy recorded after every accepted training step.
trace = go.Scatter(x=np.arange(len(accuracy_v)), y=accuracy_v)
fig = go.Figure(data=[trace], layout=go.Layout(title=f'Accuracy (lr={lr}, batch_size={batch_size}, min_profit_step={min_profit_step})'))
fig.update_xaxes(title_text="Iteration")
fig.update_yaxes(title_text="Accuracy")

Accuracy: 0.9044516829533116
