ручная имплементация линейной регрессии

In [2]:
import pandas as pd

def load_boston_data():
    """Download the Boston Housing CSV and split it into features and target.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the ``.values`` of every column
        except ``medv`` and ``y`` is the ``.values`` of the ``medv`` column.
    """
    target_column = 'medv'
    frame = pd.read_csv(
        "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
    )
    target = frame[target_column].values
    features = frame.drop(columns=[target_column]).values
    return features, target


In [3]:
import numpy as np

def normalize(X):
    """Standardise each column of ``X`` to zero mean and unit variance.

    Args:
        X: array whose columns (axis 0 statistics) are standardised.

    Returns:
        ``(X - mean) / std`` computed per column. Columns whose standard
        deviation is exactly 0 are only centred (divided by 1), so a constant
        feature becomes all zeros instead of NaN.
    """
    X_mean = X.mean(axis=0)
    X_std = X.std(axis=0)
    # A constant column has std == 0; dividing by it would give 0/0 = NaN and
    # poison every later dot product. Use 1 as the divisor for such columns,
    # the same convention scikit-learn's StandardScaler uses.
    safe_std = np.where(X_std == 0, 1.0, X_std)
    return (X - X_mean) / safe_std

def linear_regression(X, y, learning_rate=0.01, epochs=1000):
    """Fit ``y ~ X.dot(w) + b`` by full-batch gradient descent on the MSE.

    Args:
        X: 2-D feature array; ``X.shape[1]`` sets the number of weights.
        y: target values, combined element-wise with ``X.dot(w) + b``.
        learning_rate: multiplier applied to each gradient step.
        epochs: number of gradient steps to perform.

    Returns:
        tuple: ``(w, b)`` with the weight vector and the scalar intercept.
        Both start at zero.
    """
    weights = np.zeros(X.shape[1])
    bias = 0.0
    n_samples = len(X)

    for _epoch in range(epochs):
        # Residuals of the current fit: target minus prediction.
        residual = y - (X.dot(weights) + bias)
        scale = -2 / n_samples

        # Gradients of mean((y - X.w - b)^2) with respect to w and b.
        weight_grad = scale * X.T.dot(residual)
        bias_grad = scale * np.sum(residual)

        weights -= learning_rate * weight_grad
        bias -= learning_rate * bias_grad

    return weights, bias

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared element-wise differences.

    Args:
        y_true: reference values.
        y_pred: predicted values, subtracted element-wise from ``y_true``.
    """
    errors = y_true - y_pred
    return np.mean(np.square(errors))

In [4]:
# Load the data and standardise every feature column.
# NOTE(review): normalize() is applied to all rows before the split, so the
# test rows contribute to the mean/std used to scale the training rows.
X, y = load_boston_data()
X = normalize(X)

# Ordered (unshuffled) 80/20 split: the first 80% of rows train the model,
# the remaining rows are held out.
split_index = int(0.8 * len(X))
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

# Fit with the default learning rate / epoch count, then score the held-out rows.
w, b = linear_regression(X_train, y_train)
y_pred = np.dot(X_test, w) + b
mse = mean_squared_error(y_test, y_pred)

print(f"Полученные коэффициенты: {w}")
print(f"Свободный член (интерсепт): {b}")
print(f"MSE для линейной регрессии на тестовой выборке: {mse}")

Полученные коэффициенты: [-1.51232973  0.95605998  0.06557457  0.54470492 -1.39601121  3.33007286
  0.03152965 -2.70040524  2.67456512 -1.51577457 -1.66990732  0.39905218
 -3.87486478]
Свободный член (интерсепт): 22.912762348175903
MSE для линейной регрессии на тестовой выборке: 24.122953138113157


имплементация линейной регрессии с помощью библиотек

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

def load_boston_data():
    """Fetch the Boston Housing CSV and return ``(X, y)``.

    ``X`` is the ``.values`` of all columns except ``medv``; ``y`` is the
    ``.values`` of the ``medv`` column.

    NOTE(review): this redefines the identically named loader from the
    earlier manual linear-regression cell.
    """
    url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
    table = pd.read_csv(url)
    feature_table = table.drop(columns=['medv'])
    return feature_table.values, table['medv'].values


X, y = load_boston_data()

# Shuffled 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# mean_squared_error here is the sklearn.metrics function imported at the top
# of this cell.
mse = mean_squared_error(y_test, y_pred)

print(f"Коэффициенты модели: {model.coef_}")
print(f"Свободный член (интерсепт): {model.intercept_}")
print(f"MSE на тестовой выборке: {mse}")


Коэффициенты модели: [-1.00213533  0.69626862  0.27806485  0.7187384  -2.0223194   3.14523956
 -0.17604788 -3.0819076   2.25140666 -1.76701378 -2.03775151  1.12956831
 -3.61165842]
Свободный член (интерсепт): 22.796534653465375
MSE на тестовой выборке: 24.29111947497351


ручная имплементация логистической регрессии

In [6]:
from sklearn.datasets import load_breast_cancer

# Data loading
def load_breast_cancer_data():
    """Return the breast-cancer dataset as an ``(X, y)`` pair.

    ``X`` is the ``data`` attribute and ``y`` the ``target`` attribute of the
    object returned by ``load_breast_cancer()``.
    """
    dataset = load_breast_cancer()
    return dataset.data, dataset.target

In [7]:
def normalize(X):
    """Standardise each column of ``X`` to zero mean and unit variance.

    Args:
        X: array whose per-column mean and standard deviation (axis 0) are
            used for scaling.

    Returns:
        ``(X - means) / stds`` per column. A column with standard deviation
        exactly 0 is only centred (divided by 1), so a constant feature maps
        to zeros rather than NaN.

    NOTE(review): a function with the same name and purpose is already
    defined in the earlier linear-regression cell; this definition shadows it.
    """
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    # Guard against 0/0 for constant columns: use 1 as the divisor there.
    stds = np.where(stds == 0, 1.0, stds)
    return (X - means) / stds

def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Args:
        z: scalar or NumPy array of logits.

    Returns:
        The element-wise sigmoid of ``z``.
    """
    # 1 / (1 + e^-z) == exp(-log(1 + e^-z)) == exp(-logaddexp(0, -z)).
    # logaddexp never evaluates exp() of a large positive number, so very
    # negative logits no longer overflow (the naive form computes exp(-z)).
    return np.exp(-np.logaddexp(0, -z))

def logistic_regression(X, y, learning_rate=0.01, epochs=1000, random_state=None):
    """Fit a logistic-regression model with full-batch gradient descent.

    Args:
        X: 2-D feature array of shape ``(m, n)``.
        y: labels, subtracted element-wise from the predicted probabilities.
        learning_rate: multiplier applied to each gradient step.
        epochs: number of gradient steps to perform.
        random_state: optional seed for the initial weights. ``None`` (the
            default) keeps the previous behaviour of drawing from NumPy's
            global random state; any other value seeds a private generator so
            repeated calls return identical results.

    Returns:
        tuple: ``(w, b)`` with the weight vector of length ``n`` and the
        scalar intercept.
    """
    m, n = X.shape

    # Weights start from standard-normal noise; the intercept starts at zero.
    if random_state is None:
        w = np.random.randn(n)
    else:
        w = np.random.default_rng(random_state).standard_normal(n)
    b = 0.0

    for _ in range(epochs):
        y_pred = sigmoid(np.dot(X, w) + b)

        # Gradient of the mean log-loss: the prediction error drives both the
        # weight and intercept updates.
        dz = y_pred - y
        dw = np.dot(X.T, dz) / m
        db = np.sum(dz) / m

        w -= learning_rate * dw
        b -= learning_rate * db

    return w, b

def accuracy(y_true, y_pred):
    """Return the fraction of labels matched after thresholding at 0.5.

    Args:
        y_true: reference class labels.
        y_pred: predicted scores; values ``>= 0.5`` count as class 1,
            everything else as class 0.
    """
    above_threshold = y_pred >= 0.5
    predicted_labels = above_threshold.astype(int)
    matches = y_true == predicted_labels
    return np.mean(matches)

def train_test_split_custom(X, y, test_size=0.2, random_state=None):
    """Randomly split ``X`` and ``y`` into train and test partitions.

    Args:
        X: array of samples, indexed along axis 0.
        y: array of targets aligned with ``X``.
        test_size: fraction of rows (between 0 and 1) placed in the test
            partition; the row count is ``int(len(X) * test_size)``.
        random_state: optional seed for the shuffle. ``None`` (the default)
            draws from NumPy's global random state as before; any other value
            seeds a private generator for a repeatable split.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: if ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    m = len(X)
    n_test = int(m * test_size)

    if random_state is None:
        indices = np.random.permutation(m)
    else:
        indices = np.random.default_rng(random_state).permutation(m)

    # Split at an explicit positive index. Slicing with ``[:-n_test]`` and
    # ``[-n_test:]`` breaks when n_test == 0, because ``-0 == 0`` makes the
    # train slice empty and the test slice the whole data set.
    split = m - n_test
    train_idx, test_idx = indices[:split], indices[split:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

In [8]:
# NOTE(review): no random seed is set in this cell, while both the custom
# split and the initial weights draw from np.random, so the printed numbers
# differ between runs.
X, y = load_breast_cancer_data()

# NOTE(review): normalize() is applied to all rows before the split, so the
# test rows contribute to the scaling statistics.
X = normalize(X)

X_train, X_test, y_train, y_test = train_test_split_custom(X, y)

w, b = logistic_regression(X_train, y_train, epochs=5000, learning_rate=0.001)

# Predicted probabilities for the held-out rows; accuracy() thresholds them at 0.5.
y_pred = sigmoid(X_test.dot(w) + b)
acc = accuracy(y_test, y_pred)

print(f"Полученные коэффициенты: {w}")
print(f"Свободный член (интерсепт): {b}")
print(f"Точность логистической регрессии на тестовой выборке: {acc}")

Полученные коэффициенты: [ 0.53111525 -0.1325188   0.21408415 -0.10103908  0.910712   -0.69100881
 -1.16963646 -0.75544521 -0.40252279 -0.1308108  -0.88464128 -0.21608411
 -1.24318484 -0.00889182 -0.16934682  0.38043196 -1.09199572  1.35727347
  0.51263137  0.14878691  0.13198244  0.23925218 -0.9425043  -0.38025793
 -1.11692391 -0.22128254 -0.36256427 -0.58075158 -0.61197204  1.62416415]
Свободный член (интерсепт): 0.20122678113456788
Точность логистической регрессии на тестовой выборке: 0.911504424778761


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# train_test_split and StandardScaler are not imported in this cell; they come
# from the imports of the earlier sklearn linear-regression cell.
X, y = load_breast_cancer_data()

# Shuffled 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling statistics are learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# penalty=None requests an unregularised fit.
model = LogisticRegression(max_iter=1000, penalty=None).fit(X_train, y_train)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"Коэффициенты модели: {model.coef_}")
print(f"Свободный член (интерсепт): {model.intercept_}")
print(f"Точность логистической регрессии на тестовой выборке: {acc}")


Коэффициенты модели: [[   9.43309636  -16.98638813   40.02092582   10.89014962    5.03344322
   274.94009818 -134.8381584  -266.18473606   40.99650503 -168.01794604
  -259.63946217   35.91792789  115.35011389 -167.38940131  -79.54006031
   -97.5741303   246.60809325  -70.05165414   95.95418597  131.93923412
  -105.87135969  -91.92127648   73.01874488 -103.73668988   93.97496151
    29.28537604 -244.05971135   -8.90816016 -169.6548548    59.68383553]]
Свободный член (интерсепт): [-63.53260019]
Точность логистической регрессии на тестовой выборке: 0.9385964912280702
