In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
X = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
              [1, 1, 2, 1, 3, 0, 5, 10, 1, 2],
              [500, 700, 750, 600, 1450, 800, 1500, 2000, 450, 1000],
              [1, 1, 2, 1, 2, 1, 3, 3, 1, 2]], dtype = np.float64)

y = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1], dtype = np.float64)

In [3]:
model = LinearRegression()
X = MinMaxScaler().fit_transform(X.T)
model.fit(X, y)

LinearRegression()

In [4]:
y_pred = model.predict(X)
y_pred

array([0.2617132 , 0.17509907, 1.02794228, 0.21840613, 0.53795688,
       0.31862798, 1.20396413, 0.05324898, 0.28336673, 0.91967463])

In [5]:
y_pred = model.predict(X)
mean_squared_error(y, y_pred)

0.1027095355172261

In [6]:
def calc_std_feat(x):
    res = (x - x.mean()) / x.std()
    return res

In [7]:
X_st = X.copy()
X_st[1, :] = calc_std_feat(X[1, :])
X_st[2, :] = calc_std_feat(X[2, :])
X_st[3, :] = calc_std_feat(X[3, :])

In [50]:
#примитивный способ избавится от 0 под логарифмом
def calc_logloss(y, y_pred):
    for i in range(len(y_pred)):
        if y_pred[i] < 0.018:
            y_pred[i] = 0.009
    err = - np.mean(y * np.log(y_pred) + (1.0 - y) * np.log(1.0 - y_pred))
    return err

In [51]:
def sigmoid(z):
    res = 1 / (1 + np.exp(-z))
    return res

In [52]:
### Logistic Regression

In [53]:
def eval_model(X, y, iterations, alpha=1e-4):
    np.random.seed(42)
    W = np.random.randn(X.shape[0])
    n = X.shape[1]
    for i in range(1, iterations+1):
        z = np.dot(W, X)
        y_pred = sigmoid(z)
        err = calc_logloss(y, y_pred)
        print()
        W -= alpha * (1/n * np.dot((y_pred - y), X.T))
    if i % (iterations / 10) == 0:
        print(i, W, err)
    return W

In [54]:
X_st.shape (4,10)

(10, 4)

In [55]:
y.shape

(10,)

In [56]:
#W = eval_model(X_st.T, y, iterations=1000, alpha=1e-5)

1000 [ 0.49692884 -0.13928575  0.64646023  1.52320309] 0.5722993196391095


In [58]:
#1) в функции потерь заменил околонулевые значения на оптимальные

In [57]:
#2) подбор гиперпараметров для логистической регрессии

In [67]:
W = eval_model(X_st.T, y, iterations=10000, alpha=1e-1)

10000 [ 1.96482517 -8.84585302 -3.69692174 11.1524396 ] 0.26972481031406786


In [None]:
#3) создание ункции calc_pred_proba

In [71]:
X_st.shape, W.shape

((10, 4), (4,))

In [82]:
def calc_pred_proba(X, W):
    y_pred_proba = sigmoid(np.dot(X, W))
    return y_pred_proba

In [83]:
y_pred_proba = calc_pred_proba(X_st, W)
y_pred_proba

array([2.68186937e-01, 2.62110296e-10, 9.99999937e-01, 6.10496730e-12,
       6.31196466e-01, 3.02633461e-01, 9.85587567e-01, 1.99354256e-01,
       2.92228494e-01, 9.23807329e-01])

In [84]:
#4) создание ункции calc_pred

In [86]:
def calc_pred(X, W):
    y_pred = []
    for probability in calc_pred_proba(X, W):
        if probability >= 0.5:
            y_pred.append(1)
        else:
            y_pred.append(0)
    return y_pred

In [87]:
y_pred = calc_pred(X_st, W)
y_pred

[0, 0, 1, 0, 1, 0, 1, 0, 0, 1]

In [None]:
#5) считаем метрики

In [94]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [89]:
accuracy_score(y, y_pred)

0.9

In [90]:
prec = precision_score(y, y_pred)
prec

1.0

In [91]:
recall = recall_score(y, y_pred)
recall

0.8

In [93]:
f1 = 2*(prec*recall)/(prec+recall)
f1

0.888888888888889

In [95]:
confusion_matrix(y, y_pred)

array([[5, 0],
       [1, 4]])

In [1]:
#6) модель не могла переобучится так как данных слишком мало

In [97]:
#7) L1

In [100]:
def eval_model_L1(X, y, iterations, alpha=1e-4, lambda_=1e-8):
    np.random.seed(42)
    W = np.random.randn(X.shape[0])
    n = X.shape[1]
    for i in range(1, iterations+1):
        z = np.dot(W, X)
        y_pred = sigmoid(z)
        err = calc_logloss(y, y_pred)
        W -= alpha * (1/n * (np.dot((y_pred - y), X.T)+ 1/(W*lambda_)))
    if i % (iterations / 10) == 0:
        print(i, W, err)
    return W

In [101]:
#7) L2

In [102]:
def eval_model_L1(X, y, iterations, alpha=1e-4, lambda_=1e-8):
    np.random.seed(42)
    W = np.random.randn(X.shape[0])
    n = X.shape[1]
    for i in range(1, iterations+1):
        z = np.dot(W, X)
        y_pred = sigmoid(z)
        err = calc_logloss(y, y_pred)
        W -= alpha * (1/n * (np.dot((y_pred - y), X.T)+ 2*W*lambda_))
    if i % (iterations / 10) == 0:
        print(i, W, err)
    return W