In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
                            f1_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.base import ClassifierMixin, BaseEstimator
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including NumPy RuntimeWarnings (e.g. overflow inside np.exp) that the
# hand-written sigmoid further down can raise. Consider narrowing the filter
# to a specific category once the noisy warnings have been identified.
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
def generate_data(n_samples=100, n_features=2, random_state=42):
    """Create a synthetic two-class dataset with ``make_classification``.

    Both classes form a single cluster each; two features are informative and
    none are redundant.

    Parameters
    ----------
    n_samples : int, default=100
        Number of points to generate.
    n_features : int, default=2
        Number of feature columns; must be at least 2 because two informative
        features are always requested. The plotting helpers in this notebook
        only draw the first two columns.
    random_state : int, default=42
        Seed forwarded to ``make_classification`` so the data is reproducible.

    Returns
    -------
    tuple
        ``(X, y)`` exactly as returned by ``make_classification``.

    Raises
    ------
    ValueError
        If ``n_features`` is smaller than 2.
    """
    # The arguments used to be accepted but never forwarded, so every call
    # silently produced make_classification's default of 100 samples with the
    # hard-coded 2 features. The defaults above match that effective behaviour,
    # which keeps ``generate_data()`` returning the same dataset as before.
    if n_features < 2:
        raise ValueError(
            f'n_features must be >= 2 (two informative features are required), got {n_features}'
        )
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_redundant=0,
        n_informative=2,
        n_clusters_per_class=1,
        n_classes=2,
        random_state=random_state,
    )
    return X, y

In [None]:
def plot_dataset(X, y):
    """Show a 10x6 scatter plot of the first two columns of X, coloured by y."""
    horizontal = X[:, 0]
    vertical = X[:, 1]
    canvas_size = (10, 6)
    plt.figure(figsize=canvas_size)
    # One colour per label value, taken from the 'Spectral' colormap.
    plt.scatter(horizontal, vertical, c=y, cmap='Spectral')
    plt.show()

In [None]:
# Build the synthetic dataset with the default settings and scatter-plot it,
# colouring each point by its class label.
X, y = generate_data()
plot_dataset(X, y)

In [None]:
# Hold out 25% of the samples as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 42
)

In [None]:
# Scatter-plot the training split.
plot_dataset(X_train, y_train)

In [None]:
# Scatter-plot the held-out test split.
plot_dataset(X_test, y_test)

In [None]:
#from sklearn.inspection import DecisionBoundaryDisplay
def get_grid(data, step=0.01, margin=1):
    """Build a dense 2-D mesh covering the first two columns of ``data``.

    Parameters
    ----------
    data : array-like of shape (n_samples, >=2)
        Points whose first two columns define the area to cover.
    step : float, default=0.01
        Spacing between neighbouring grid nodes along both axes. Smaller
        values give smoother decision-boundary plots but a larger grid.
    margin : float, default=1
        Padding added on every side of the data's bounding box.

    Returns
    -------
    The result of ``np.meshgrid``: two 2-D arrays ``(xx, yy)`` holding the
    x and y coordinate of every grid node.

    Raises
    ------
    ValueError
        If ``step`` is not strictly positive.
    """
    if step <= 0:
        raise ValueError(f'step must be positive, got {step}')
    data = np.asarray(data)
    x_min, x_max = data[:, 0].min() - margin, data[:, 0].max() + margin
    y_min, y_max = data[:, 1].min() - margin, data[:, 1].max() + margin
    return np.meshgrid(np.arange(x_min, x_max, step),
                       np.arange(y_min, y_max, step))

In [None]:
def plot_prediction(X, y, model):
    """Draw the model's predicted class over a dense grid and overlay the samples.

    The grid comes from ``get_grid(X)``; every node is classified with
    ``model.predict`` and painted with the 'Spectral' colormap, then the points
    of ``X`` are drawn on top, coloured by ``y`` with black marker edges.
    """
    grid_x, grid_y = get_grid(X)
    # One row per grid node: (x coordinate, y coordinate).
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    region = model.predict(grid_points).reshape(grid_x.shape)

    plt.figure(figsize=(10, 6))
    plt.pcolormesh(grid_x, grid_y, region, cmap='Spectral')
    marker_style = dict(c=y, s=100, cmap='Spectral', edgecolors='k')
    plt.scatter(X[:, 0], X[:, 1], **marker_style)
    plt.show()

In [None]:
def print_metrics(true, predict):
    """Print accuracy, precision, recall and F1 plus the detailed reports.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predict : array-like
        Predicted labels, aligned with ``true``.

    Prints the four summary scores (three decimals), followed by
    ``classification_report`` and ``confusion_matrix`` for the same pair.
    """
    # Every metric is computed against the ``true`` argument. Previously most
    # of them read the notebook-level ``y_test`` instead, so the function only
    # gave correct numbers when it happened to be called with that variable.
    acc = accuracy_score(true, predict)
    prec = precision_score(true, predict)
    recall = recall_score(true, predict)
    f1 = f1_score(true, predict)
    print(f'Results:\naccuracy:   {acc:.3f}\nprecision: {prec:.3f}\nrecall:  {recall:.3f}\nf1:  {f1:.3f}')
    print(classification_report(true, predict))
    print(confusion_matrix(true, predict))

In [None]:
# Mixins go first and BaseEstimator last, as the scikit-learn developer guide
# recommends, so the mixin's behaviour takes precedence in the MRO.
class MyClassifier(ClassifierMixin, BaseEstimator):
    """Toy binary classifier with two interchangeable strategies.

    Parameters
    ----------
    strategy : {'most_frequent', 'gradient_descent'}
        * ``'most_frequent'`` - always predict the most common training label.
        * ``'gradient_descent'`` - logistic regression trained with full-batch
          gradient descent; labels are expected to be 0/1 because ``predict``
          emits 0/1.

        The default value ``'mean'`` is kept for backward compatibility of the
        signature but is not implemented: ``fit`` and ``predict`` raise
        ``ValueError`` for it (previously ``fit`` silently did nothing and
        ``predict`` returned ``None``).
    iterations : int, default=1000
        Number of gradient-descent steps.
    alpha : float, default=0.0001
        Learning rate of the gradient-descent update.
    verbose : bool, default=False
        When true, print progress every 100 gradient-descent steps.

    Attributes set by ``fit``
    -------------------------
    y_predict : label
        Most frequent training label (``'most_frequent'`` only).
    weights : ndarray of shape (n_features + 1, 1)
        Bias followed by one weight per feature (``'gradient_descent'`` only).
    losses : list of float
        Mean binary cross-entropy before each update (``'gradient_descent'`` only).
    """

    # Strategies that fit/predict actually implement.
    _STRATEGIES = ('most_frequent', 'gradient_descent')

    def __init__(self, strategy = 'mean', iterations = 1000, alpha = 0.0001, verbose = False):
        # Parameters are stored untouched (validation happens in fit) so that
        # BaseEstimator.get_params / clone keep working.
        self.strategy = strategy
        self.iterations = iterations
        self.alpha = alpha
        self.verbose = verbose

    def _check_strategy(self):
        """Raise ValueError when ``self.strategy`` is not implemented."""
        if self.strategy not in self._STRATEGIES:
            raise ValueError(
                f'Unsupported strategy {self.strategy!r}; expected one of {self._STRATEGIES}'
            )

    def fit(self, X, y):
        """Fit the classifier on ``X`` (n_samples, n_features) and labels ``y``.

        Returns ``self`` so calls can be chained, following the scikit-learn
        estimator convention.

        Raises
        ------
        ValueError
            If the strategy is unsupported, ``X`` is empty, or ``X`` and ``y``
            have a different number of samples.
        """
        self._check_strategy()
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] == 0:
            raise ValueError('Cannot fit on an empty dataset')
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f'X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}'
            )

        if self.strategy == 'most_frequent':
            # np.unique sorts the labels, so ties resolve to the smallest label,
            # the same choice argmax(bincount) made, while also accepting
            # labels that are not non-negative integers.
            labels, counts = np.unique(y, return_counts=True)
            self.y_predict = labels[np.argmax(counts)]
        else:
            self._fit_gradient_descent(X, y)
        return self

    def _fit_gradient_descent(self, X, y):
        """Train logistic-regression weights with full-batch gradient descent."""
        len_data = X.shape[0]
        X_b = self.add_bias(X)
        target = y.reshape(len_data, 1)
        # One weight per column of X_b (bias + every feature) instead of a
        # hard-coded 3, so any number of features is supported.
        self.weights = np.zeros((X_b.shape[1], 1))
        # Scalar loss per iteration; storing the per-sample array every step
        # would need iterations * n_samples floats.
        self.losses = []
        eps = np.finfo(float).eps
        for i in range(self.iterations):
            predict = self.sigmoid(np.dot(X_b, self.weights))
            # Clip only for the log so saturated predictions do not yield
            # -inf/nan in the monitored loss; the gradient uses raw predictions.
            clipped = np.clip(predict, eps, 1 - eps)
            loss = -np.mean(target * np.log(clipped) + (1 - target) * np.log(1 - clipped))
            error = predict - target
            self.weights = self.weights - (self.alpha/len_data) * np.dot(X_b.T, error)
            self.losses.append(float(loss))
            if self.verbose:
                if i%100==0:
                    print(f'step: {i},error: {error.shape} loss:{loss}, weights: {self.weights}')
        print(f'Weigths: {self.weights}')

    @staticmethod
    def add_bias(X):
        """Return ``X`` with a leading column of ones (the bias/intercept term)."""
        X = np.asarray(X)
        len_data = X.shape[0]
        ones = np.ones((len_data, 1))
        return np.concatenate((ones, X), 1)

    @staticmethod
    def sigmoid(x):
        """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
        return 1/(1 + np.exp(-x))

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns a 1-D array of length ``X.shape[0]``: the stored majority label
        for ``'most_frequent'``, or 0/1 integers for ``'gradient_descent'``
        (1 where the linear score is positive, i.e. sigmoid > 0.5).

        Raises
        ------
        ValueError
            If the strategy is unsupported.
        """
        self._check_strategy()
        X = np.asarray(X)
        if self.strategy == 'most_frequent':
            return np.full(X.shape[0], self.y_predict)
        X_b = self.add_bias(X)
        return (np.dot(X_b, self.weights).flatten() > 0).astype(int)

In [None]:
# Baseline: the custom classifier with the 'most_frequent' strategy, fitted on
# the training split and evaluated on the test split.
model = MyClassifier(strategy = 'most_frequent')
model.fit(X_train, y_train)
predict = model.predict(X_test)
print_metrics(y_test, predict)

In [None]:
# Decision regions of the classifier currently bound to `model`, with the test points on top.
plot_prediction(X_test, y_test, model)

In [None]:
# Same baseline via scikit-learn's DummyClassifier, for comparison with the
# custom implementation above.
model = DummyClassifier(strategy='most_frequent')
model.fit(X_train, y_train)
predict = model.predict(X_test)
print_metrics(y_test, predict)

In [None]:
# Decision regions of the classifier currently bound to `model`, with the test points on top.
plot_prediction(X_test, y_test, model)

In [None]:
# Number of training samples per class.
pd.Series(y_train).value_counts()

In [None]:
# Number of test samples per class.
pd.Series(y_test).value_counts()

In [None]:
# Train the hand-written logistic regression (gradient-descent strategy) and
# evaluate it on the test split.
# The class is named MyClassifier; the former `MyClassificator` spelling is not
# defined anywhere in the notebook and raised NameError.
model = MyClassifier(strategy = 'gradient_descent', iterations = 100000, alpha = 0.001)
model.fit(X_train, y_train)
predict = model.predict(X_test)
print_metrics(y_test, predict)

In [None]:
# Decision regions of the classifier currently bound to `model`, with the test points on top.
plot_prediction(X_test, y_test, model)

In [None]:
# Reference model: scikit-learn's LogisticRegression with default settings,
# fitted and evaluated on the same split.
model = LogisticRegression()
model.fit(X_train, y_train)
predict = model.predict(X_test)
print_metrics(y_test, predict)

In [None]:
# Decision regions of the classifier currently bound to `model`, with the test points on top.
plot_prediction(X_test, y_test, model)