# Задача 1

**Загрузите датасет для регрессии, выполните предварительную обработку (удалите пропуски, приведите все признаки к числам), выделите целевой признак и предикторы, разбейте данные на обучающую и тестовую выборку.**

In [93]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from math import sqrt, pow
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from datetime import datetime as dt
import folium

- **Загрузка датасета**

In [2]:
df = pd.read_csv('../data/trip_duration_task.csv')

df

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
0,id1080784,2.0,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,400
1,id0889885,1.0,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,1100
2,id0857912,2.0,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,1635
3,id3744273,2.0,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.961670,40.759720,-73.956779,40.780628,1141
4,id0232939,1.0,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.017120,40.708469,-73.988182,40.740631,848
...,...,...,...,...,...,...,...,...,...,...
729317,id3905982,2.0,2016-05-21 13:29:38,2016-05-21 13:34:34,2,-73.965919,40.789780,-73.952637,40.789181,296
729318,id0102861,1.0,2016-02-22 00:43:11,2016-02-22 00:48:26,1,-73.996666,40.737434,-74.001320,40.731911,315
729319,id0439699,1.0,2016-04-15 18:56:48,2016-04-15 19:08:01,1,-73.997849,40.761696,-74.001488,40.741207,673
729320,id2078912,1.0,2016-06-19 09:50:47,2016-06-19 09:58:14,1,-74.006706,40.708244,-74.013550,40.713814,447


## Предобработка датасета

- **Удаление столбцов id, dropoff_datetime(т.к. является искомым)**

In [3]:
df1 = df.drop(["id", "dropoff_datetime"], axis=1)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729322 entries, 0 to 729321
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   vendor_id          727135 non-null  float64
 1   pickup_datetime    729322 non-null  object 
 2   passenger_count    729322 non-null  int64  
 3   pickup_longitude   729322 non-null  float64
 4   pickup_latitude    727475 non-null  float64
 5   dropoff_longitude  729322 non-null  float64
 6   dropoff_latitude   729322 non-null  float64
 7   trip_duration      729322 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 44.5+ MB


- **Исключение пустых строк**

In [4]:
df2 = df1.dropna()

- **Отбор основных путей перездок для улучшения результата прогноза**

In [5]:
dfl = df.head(500)
m = folium.Map(location=[40.753093719482415, -73.97315979003906], zoom_start=12)
result = [folium.Marker([x1, y1]).add_to(m) and folium.Marker([x2, y2]).add_to(m) for x1, y1, x2, y2 in zip(dfl['pickup_latitude'], dfl['pickup_longitude'], dfl['dropoff_latitude'], dfl['dropoff_longitude'])]
m.save("name1.html")
#40.691510, -74.020491
#40.820779, -73.919535

df3 = df2[(40.691510 < df2.pickup_latitude) & (40.691510 < df2.dropoff_latitude) & (df2.pickup_latitude < 40.820779) & (df2.dropoff_latitude < 40.820779)]
df3 = df3[(-74.020491 < df3.pickup_longitude) & (-74.020491 < df3.dropoff_longitude) & (df3.pickup_longitude < -73.919535) & (df3.dropoff_longitude < -73.919535)]

![plot](./loc.png)

- **Выделение прзнака из даты отправки - часовой период, из кол-ва пассажиров - 2 типа вместительности; удаление столбца vendor_idt**

In [6]:
df4 = df3
df_hours = pd.get_dummies(df4["pickup_datetime"].apply(lambda date: int(date[11:13])), prefix="hour")
df4["hour_4_9"] = df_hours.hour_4 + df_hours.hour_5 + df_hours.hour_6 + df_hours.hour_7 + df_hours.hour_8 + df_hours.hour_9
df4["hour_10_15"] = df_hours.hour_10 + df_hours.hour_11 + df_hours.hour_12 + df_hours.hour_13 + df_hours.hour_14 + df_hours.hour_15
df4["hour_16_15"] = df_hours.hour_16 + df_hours.hour_17 + df_hours.hour_18 + df_hours.hour_19 + df_hours.hour_20 + df_hours.hour_21
df4["hour_night"] = df_hours.hour_22 + df_hours.hour_23 + df_hours.hour_0 + df_hours.hour_1 + df_hours.hour_2 + df_hours.hour_3

print(df4["passenger_count"].unique())
df4["pas0-3"] = df4["passenger_count"].apply(lambda count: 1 if count < 4 else 0)
df4["pas4-6"] = df4["passenger_count"].apply(lambda count: 1 if count > 3 else 0)

df4 = df4.drop(["pickup_datetime", "vendor_id", "passenger_count"], axis=1)
df4.info()

[1 2 6 3 4 5 0]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 629577 entries, 0 to 729321
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_longitude   629577 non-null  float64
 1   pickup_latitude    629577 non-null  float64
 2   dropoff_longitude  629577 non-null  float64
 3   dropoff_latitude   629577 non-null  float64
 4   trip_duration      629577 non-null  int64  
 5   hour_4_9           629577 non-null  uint8  
 6   hour_10_15         629577 non-null  uint8  
 7   hour_16_15         629577 non-null  uint8  
 8   hour_night         629577 non-null  uint8  
 9   pas0-3             629577 non-null  int64  
 10  pas4-6             629577 non-null  int64  
dtypes: float64(4), int64(3), uint8(4)
memory usage: 40.8 MB


- **Избавляемся от слишком завышенных параметров времени поездки**

In [7]:
df5 = df4
print(df5[df5["trip_duration"]>8000].shape[0])
df5 = df5[df5["trip_duration"]<8000]
df5.info()

838
<class 'pandas.core.frame.DataFrame'>
Int64Index: 628739 entries, 0 to 729321
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_longitude   628739 non-null  float64
 1   pickup_latitude    628739 non-null  float64
 2   dropoff_longitude  628739 non-null  float64
 3   dropoff_latitude   628739 non-null  float64
 4   trip_duration      628739 non-null  int64  
 5   hour_4_9           628739 non-null  uint8  
 6   hour_10_15         628739 non-null  uint8  
 7   hour_16_15         628739 non-null  uint8  
 8   hour_night         628739 non-null  uint8  
 9   pas0-3             628739 non-null  int64  
 10  pas4-6             628739 non-null  int64  
dtypes: float64(4), int64(3), uint8(4)
memory usage: 40.8 MB


- **Выделение целевого признака и предикатов.**

In [8]:
y = df5['trip_duration']
X = df5.drop(['trip_duration'], axis = 1)

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7)

In [10]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((188621, 10), (188621,), (440118, 10), (440118,))

# Задача 2

**Решите задачу регрессии на ваших данных с использованием моделей sklearn (линейная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр.**

- **!Подготовка нескольких гиперпараметров**

In [11]:
parameters = {'alpha': np.arange(0.1, 1, 0.05)}

- **Линейная регрессия.**

In [113]:
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [114]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr = r2_score(y_pred, y_test)
print(f'R^2: {score_lr}')

MAE: 341.11165187517895
MSE: 203634.64584582997
RMSE: 451.25895652699234
MAPE: 1.0196725110633162
R^2: -31.324020524673877


In [115]:
lr.coef_

array([-1.99293250e+03, -2.17956037e+02,  2.75678045e+03, -2.77367576e+03,
       -9.88144914e+14, -9.88144914e+14, -9.88144914e+14, -9.88144914e+14,
       -1.15919226e+15, -1.15919226e+15])

- **Линейная регрессия. Регуляризация L1 / Ridge.**

In [116]:
# L1(RandomizedSearchCV)
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge = Ridge(alpha=ridge_optimal.best_params_['alpha']).fit(X_train, y_train)
y_pred = ridge.predict(X_test)


In [117]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr_L1_RandomSearchCV = r2_score(y_pred, y_test)
print(f'R^2: {score_lr_L1_RandomSearchCV}')

MAE: 341.07249114756905
MSE: 203629.64862191404
RMSE: 451.2534195127102
MAPE: 1.0195370815279488
R^2: -31.768557646892674


In [118]:
ridge.coef_

array([-1964.34156997,  -222.93839293,  2725.31972878, -2757.36071385,
         -31.28912331,    80.55209789,    15.49758785,   -64.76056251,
          -4.85354084,     4.85354083])

In [119]:
# L1(GridSearchCV)
ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge = Ridge(alpha=ridge_optimal.best_params_['alpha']).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

In [120]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr_L1_GridSearchCV = r2_score(y_pred, y_test)
print(f'R^2: {score_lr_L1_GridSearchCV}')

MAE: 341.07249114756905
MSE: 203629.64862191404
RMSE: 451.2534195127102
MAPE: 1.0195370815279488
R^2: -31.768557646892674


In [121]:
ridge.coef_

array([-1964.34156997,  -222.93839293,  2725.31972878, -2757.36071385,
         -31.28912331,    80.55209789,    15.49758785,   -64.76056251,
          -4.85354084,     4.85354083])

- **Линейная регрессия. Регуляризация L2 / Lasso.**

In [122]:
# L2(RandomizedSearchCV)
lasso_optimal = RandomizedSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso = Lasso(alpha=lasso_optimal.best_params_['alpha']).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [123]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr_L2_RandomSearchCV = r2_score(y_pred, y_test)
print(f'R^2: {score_lr_L2_RandomSearchCV}')

MAE: 341.2959699712704
MSE: 203859.9602937503
RMSE: 451.5085384505484
MAPE: 1.020180086831387
R^2: -40.87055544168539


In [124]:
lasso.coef_

array([-1.25114887e+03, -4.17112169e+02,  1.75349077e+03, -2.19594381e+03,
       -2.50487021e+01,  8.56661051e+01,  2.22177022e+01, -5.28396309e+01,
       -8.50368737e+00,  5.75595419e-15])

In [125]:
# L2(GridSearchCV)
lasso_optimal = GridSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso = Lasso(alpha=lasso_optimal.best_params_['alpha']).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [126]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr_L2_GridSearchCV = r2_score(y_pred, y_test)
print(f'R^2: {score_lr_L2_GridSearchCV}')

MAE: 341.2959699712704
MSE: 203859.9602937503
RMSE: 451.5085384505484
MAPE: 1.020180086831387
R^2: -40.87055544168539


In [127]:
lasso.coef_

array([-1.25114887e+03, -4.17112169e+02,  1.75349077e+03, -2.19594381e+03,
       -2.50487021e+01,  8.56661051e+01,  2.22177022e+01, -5.28396309e+01,
       -8.50368737e+00,  5.75595419e-15])

# Задача 3

**Решите задачу регрессии на ваших данных с использованием моделей sklearn (полиномиальная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр.**

- **Добавление предиката, степень полинома 2.**

In [136]:
pf = PolynomialFeatures(4)
X_train_p = pf.fit_transform(X_train)
X_test_p = pf.fit_transform(X_test)


- **Полиномиальная регрессия.**

In [137]:
pr = LinearRegression().fit(X_train_p, y_train)
y_pred = pr.predict(X_test_p)


In [138]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_pr = r2_score(y_pred, y_test)
print(f'R^2: {score_pr}')

MAE: 217.96742324288599
MSE: 99205.3526712109
RMSE: 314.96881221989406
MAPE: 0.7532952690348965
R^2: 0.12677161467913023


In [30]:
pr.coef_

array([ 9.62274759e+02, -8.54688002e+08, -1.03356501e+09, ...,
        0.00000000e+00,  0.00000000e+00, -4.38519146e+06])

- **Полиномиальная регрессия. Регуляризация L1 / Ridge.**

In [139]:
ridge = Ridge().fit(X_train_p, y_train)
y_pred = ridge.predict(X_test_p)

In [140]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_pr_L1 = r2_score(y_pred, y_test)
print(f'R^2: {score_pr_L1}')

MAE: 241.29299822364837
MSE: 117277.6125851629
RMSE: 342.4581910031689
MAPE: 0.8186678954252306
R^2: -0.2725200597986355


# Задача 4

**Вычислите значения метрик $R^2$, MAE, MSE, RMSE, MAPE для всех обученных моделей; выберите лучшую модель.**

- **Сравнение всех моделей по наибольшему коэффициенту детерминации.**

In [143]:
print(score_lr)
print(score_lr_L1_RandomSearchCV)
print(score_lr_L1_GridSearchCV)
print(score_lr_L2_RandomSearchCV)
print(score_lr_L2_GridSearchCV)
print("----")
print(score_pr)
print(score_pr_L1)

-31.324020524673877
-31.768557646892674
-31.768557646892674
-40.87055544168539
-40.87055544168539
----
0.12677161467913023
-0.2725200597986355


**Наилучшей моделью оказалась: Полиномиальная регрессия 4 степени**

# Задача 5
- **Самостоятельно реализуйте (желательно в виде класса) модель линейной регрессии с регуляризацией (можете выбрать L1 или L2).**
- **Самостоятельно реализуйте вычисление всех используемых метрик (в виде функций, принимающих два аргумента).**
- **Обучите вашу модель линейной регрессии на ваших данных; оцените качество с помощью реализованных вами метрик.**

In [142]:
class Metrics:
    @staticmethod
    def  mean_absolute_error(y_test, y_pred):
        real, pred = np.array(y_test), np.array(y_pred)
        return float(np.mean(np.abs(real - pred)))
    @staticmethod
    def mean_squared_error(y_test, y_pred):
        real, pred = np.array(y_test), np.array(y_pred)
        return float(np.mean((real - pred) ** 2))
    @staticmethod
    def root_mean_squared_error(y_test, y_pred):
        return float(np.sqrt(Metrics.mean_squared_error(y_test, y_pred)))
    @staticmethod
    def mean_absolute_percentage_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((real - pred) / np.abs(real)))
    @staticmethod
    def r_2_score(y_test, y_pred):
        real, pred = np.array(y_test), np.array(y_pred)
        return float(1 - np.sum((real - pred)**2) / np.sum((real - np.mean(real))**2))

In [407]:
class MyLinearRegression:
    def __init__(self, iterations = 1000, learning_rate = 1e-4):
        self.lr = learning_rate  
        self.i = iterations        

    def loss_func(self, x, y, w):
        return sum((np.dot(x, w) - y) ** 2)/x.shape[1] #функция потерь sum((<w, x> - y)^2) * 1/len(x)
    
    def gradient(self, X, y, XT, l, w):
        return w - self.lr * 2 * np.dot(XT, np.dot(X, w) - y) / l #функция для вычисления градиента
                                                                  #w -n∇Q(w, X) = w - 2n/l * X.T(Xw - y)

    def fit(self, x, y):
        loss_abs = np.inf #переменная, отвечающая за прекращение работы алгоритма при минимальном изменении знач. функции потерь 
        eps = 1e-4
        
        X = x#self.transform(x)
        w = np.zeros(X.shape[1]) #изначально веса = 0
        iter = 0
        XT = X.T
        l = X.shape[0]
        while iter <= self.i:
            loss = self.loss_func(X, y, w)
            if (iter % 10 == 0):
                print(iter, self.lr, loss_abs)
            w_save = w
            w = self.gradient(X, y, XT, l, w)
            loss_abs_save = loss_abs
            loss_abs = np.abs(loss - self.loss_func(X, y,w))
            if (loss_abs > loss_abs_save):
                loss_abs = loss_abs_save
                w = w_save
                self.lr *= 0.95
            #else:
            #    self.lr *= 1.01 
            if (loss_abs <= eps):
                break
            #else:
            #    w = self.gradient(X, y, XT, l, w)
            iter += 1
            
        self.w = w
        
        
    def predict(self, x):
        return np.dot(x, self.w) #ожидаемые y

In [359]:
(X1_train, 
 X2_train, 
 y1_train, y2_train) = train_test_split(X_train_p, y_train, 
                                     test_size=0.95, 
                                     random_state=0)
len(X1_train)

9431

In [404]:
from sklearn.datasets import make_regression
Xmr, ymr = make_regression(n_samples=1000, n_features=1, noise=30, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(Xmr, ymr, test_size=0.3)
len(X3_train)
print(X3_train.shape)

(700, 4)


In [408]:
#print(pr.coef_)

mlr = MyLinearRegression(iterations =10000, learning_rate = 1e-4)
mlr.fit(X3_train, y3_train)

0 0.0001 inf
10 0.0001 1037.5145412846468
20 0.0001 1033.295898557175
30 0.0001 1029.0944729568437
40 0.0001 1024.9101939559914
50 0.0001 1020.7429913482629
60 0.0001 1016.5927952206694
70 0.0001 1012.4595359060913
80 0.0001 1008.3431440489367
90 0.0001 1004.243550603278
100 0.0001 1000.1606867564842
110 0.0001 996.0944840167649
120 0.0001 992.0448741605505
130 0.0001 988.0117892394774
140 0.0001 983.9951615841128
150 0.0001 979.9949238272384
160 0.0001 976.0110088526271
170 0.0001 972.0433498118073
180 0.0001 968.0918801641092
190 0.0001 964.1565336175263
200 0.0001 960.2372441585176
210 0.0001 956.3339460338466
220 0.0001 952.4465737938881
230 0.0001 948.5750622195192
240 0.0001 944.719346374739
250 0.0001 940.8793615996838
260 0.0001 937.0550434854813
270 0.0001 933.2463279031217
280 0.0001 929.4531509508379
290 0.0001 925.675449045375
300 0.0001 921.9131588186137
310 0.0001 918.1662171892822
320 0.0001 914.4345613159239
330 0.0001 910.7181286225095
340 0.0001 907.0168568058871
350 

3470 0.0001 254.3341480854433
3480 0.0001 253.3056697390275
3490 0.0001 252.2813673838973
3500 0.0001 251.26122400455642
3510 0.0001 250.24522264499683
3520 0.0001 249.2333464260446
3530 0.0001 248.2255785331363
3540 0.0001 247.221902222489
3550 0.0001 246.22230082086753
3560 0.0001 245.22675771953072
3570 0.0001 244.23525637947023
3580 0.0001 243.2477803286165
3590 0.0001 242.26431316137314
3600 0.0001 241.28483854455408
3610 0.0001 240.30934020353016
3620 0.0001 239.33780193713028
3630 0.0001 238.37020760553423
3640 0.0001 237.40654113935307
3650 0.0001 236.44678653334267
3660 0.0001 235.49092784291133
3670 0.0001 234.53894919378217
3680 0.0001 233.59083478152752
3690 0.0001 232.64656885049772
3700 0.0001 231.70613573049195
3710 0.0001 230.76951979706064
3720 0.0001 229.83670549758244
3730 0.0001 228.90767734486144
3740 0.0001 227.98241990956012
3750 0.0001 227.06091783207376
3760 0.0001 226.1431558073964
3770 0.0001 225.22911860235035
3780 0.0001 224.3187910390552
3790 0.0001 223.41

6800 0.0001 66.23133009974845
6810 0.0001 65.96504029649077
6820 0.0001 65.6998258936801
6830 0.0001 65.4356825290597
6840 0.0001 65.1726058596978
6850 0.0001 64.91059156070696
6860 0.0001 64.6496353217517
6870 0.0001 64.38973285414977
6880 0.0001 64.13087988394545
6890 0.0001 63.87307215668261
6900 0.0001 63.61630543222418
6910 0.0001 63.36057549121324
6920 0.0001 63.105878128728364
6930 0.0001 62.85220915824175
6940 0.0001 62.59956441092072
6950 0.0001 62.34793973300839
6960 0.0001 62.0973309885012
6970 0.0001 61.847734058857895
6980 0.0001 61.5991448413115
6990 0.0001 61.35155925102299
7000 0.0001 61.1049732195097
7010 0.0001 60.859382692025974
7020 0.0001 60.61478363460628
7030 0.0001 60.37117202853551
7040 0.0001 60.128543869417626
7050 0.0001 59.88689517060993
7060 0.0001 59.646221962117124
7070 0.0001 59.406520287913736
7080 0.0001 59.167786211008206
7090 0.0001 58.93001580925193
7100 0.0001 58.69320517528104
7110 0.0001 58.457350420590956
7120 0.0001 58.22244766930817
7130 0.00

In [409]:
pred1 = mlr.predict(X3_test)
#pred2 = lr.predict(X3_test)
#print(pred2)
print(Metrics.r_2_score(pred1, y3_test))
#print(Metrics.r_2_score(pred2, y3_test))
#-1523081.8375414188
#-1496222.53845041
#-0.008642139528650672

0.8846343308078523


In [59]:
class MyRidge:
    def __init__(self, iterations =1000, learning_rate = 1.2*1e-16, alpha = 2):
        self.lr = learning_rate  
        self.i = iterations        
        self.alpha = alpha
    def transform(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)  #добавление к матрице столбца с единицами

    def loss_func(self, x, y, w):
        return sum((np.dot(x, w) - y) ** 2)/x.shape[1] #функция потерь sum((<w, x> - y)^2) * 1/len(x)
    
    def gradient(self, X, y, XT, l, w):
        return w - self.lr * ( 2 *np.dot(XT, np.dot(X, w) - y) + self.alpha * sum(np.abs(w))) / l #функция для вычисления градиента
                                                                                               #w -n(∇Q(w, X) + a * sum(|w|)) = w - 2n/l * (X.T(Xw - y) + a * sum(w^2))

    def fit(self, x, y):
        loss_abs = np.inf #переменная, отвечающая за прекращение работы алгоритма при минимальном изменении знач. функции потерь 
        eps = 1e-20
        
        X = self.transform(x)
        w = np.zeros(X.shape[1]) #изначально веса = 0

        iter = 0
        XT = X.T
        l = X.shape[1]
        while iter <= self.i:
            if (iter % 100 == 0):
                loss = self.loss_func(X, y, w)
                w = self.gradient(X, y, XT, l, w)
                loss_abs = np.abs(loss - self.loss_func(X, y,w))
                if(loss_abs <= eps):
                    break
            else:
                w = self.gradient(X, y, XT, l, w)
            iter += 1
            
        self.w = w
        
        
    def coef(self):
        return self.w
    
    
    def predict(self, x):
        return np.dot(self.transform(x), self.w) #ожидаемые y

In [60]:
print(pr.coef_)
mlrr = MyRidge(iterations = 1000, learning_rate = 7.02*1e-18, alpha = 1)
mlrr.fit(X1_train, y1_train)


[ 9.62274759e+02 -8.54688002e+08 -1.03356501e+09 ...  0.00000000e+00
  0.00000000e+00 -4.38519146e+06]


In [57]:
pred1 = mlrr.predict(X_test_p)
print(pred1)
pred2 = pr.predict(X_test_p)
print(pred2)
print(Metrics.r_2_score(pred1, y_test))
print(Metrics.r_2_score(pred2, y_test))

[1.50633668e+12 1.50578610e+12 1.50486380e+12 ... 1.50635190e+12
 1.50503150e+12 1.50437524e+12]
[ 954.35742188  732.35742188 1388.35742188 ...  870.35742188  547.35742188
  318.35742188]
-3598968.166666122
0.12196524674561215


In [61]:
class LinerReg:
    def __init__(self, lr = 1e-4, iters = 20000):
        self.lr = lr
        self.iters = iters

    def transform_(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)
    
    def loss_func(self, x, y, w):
        return np.mean((y - np.dot(x, w)) ** 2)
    
    def fit(self, x, y):
        dist = np.inf
        eps = 1e-4
        X = self.transform_(x)
        
        w = np.zeros(X.shape[1])
        iter = 0
        
        while dist > eps and iter <= self.iters:
            loss = self.loss_func(X, y, w)
            w = w - self.lr * 2 * np.dot(X.T, np.dot(X, w) - y) / X.shape[0]
            dist = np.abs(loss - self.loss_func(X, y, w))
            iter += 1
        print(iter)
        self.w = w

    def predict(self, x):
        return np.dot(self.transform_(x), self.w)

In [68]:
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=230, n_features=1, noise=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
mlr = LinerReg()
mlr.fit(X_train, y_train)

20001


In [70]:
pred1 = mlr.predict(X_test)
print(pred1)

print(Metrics.r_2_score(pred1, y_test))


[  47.13140226  -86.91333037  -43.95908027   25.14757948   27.9340972
   63.46813822    4.60706389   43.13809128  150.05276805  -11.80805981
   31.82621891  -22.37874104    4.6760138   -30.34299429   18.2489082
   11.0025241   -58.91037751   67.18972922   49.28905834  -64.39842012
  -33.09543936   12.83100884   55.5279751     2.31736202  -28.21800902
 -114.43853927  -37.37942745   10.80152953   -7.0037157    66.43672345
   78.51671337  -81.33808939  -21.85189228  -80.56803675   54.31205995
   11.06655467   96.76722113   24.95898435  -13.10490939  101.99615308
   55.20238424  -24.22429407   53.78822249   65.8566468   -43.2048544
   44.67387896  -54.46948121  -49.44806794   17.54021185   36.14426526
  -59.44184314   57.16359955   25.20106748    2.36802801   40.39703285
   77.09164312  -21.99496036 -111.56797861   98.27459226    2.0529655
  -40.99772917  -25.95710242   64.45065352  -99.91229552   29.1867534
   20.89894419  -17.43932724   70.62119333   16.33630462]
0.5516111674658459


In [71]:
class Ridge:
    def __init__(self, lr = 1e-4, iters = 20000):
        self.lr = lr
        self.iters = iters

    def transform_(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)
    
    def loss_func(self, x, y, w):
        return (np.mean((y - np.dot(x, w)) ** 2) + (self.alpha*sum(w**2)))
    
    def fit(self, x, y, alpha):
        self.alpha = alpha
        dist = np.inf
        eps = 1e-4
        X = self.transform_(x)
        
        w = np.zeros(X.shape[1])
        iter = 0
        
        while dist > eps and iter <= self.iters:
            loss = self.loss_func(X, y, w)
            w = w - self.lr * (2 * ((np.dot(X.T, np.dot(X, w) - y)) / X.shape[0]) + (self.alpha * sum(2 * w)))
            dist = np.abs(loss - self.loss_func(X, y, w))
            iter += 1
        print(iter)
        self.w = w

    
    def predict(self, x):
        return np.dot(self.transform_(x), self.w)

In [78]:
mlr = Ridge()
mlr.fit(X_train, y_train, 1)

4080


In [79]:
pred1 = mlr.predict(X_test)
print(pred1)

print(Metrics.r_2_score(pred1, y_test))

[ 11.94679192 -44.92342649 -26.69952812   2.61986965   3.80208597
  18.87786429  -6.09471082  10.25257789  55.61251782 -13.0590402
   5.45336915 -17.54378897  -6.06545792 -20.92272683  -0.30698123
  -3.38135378 -33.04281012  20.4567976   12.86220553 -35.37118346
 -22.0904874   -2.60559538  15.50914698  -7.06614657 -20.02117424
 -56.60135379 -23.90802497  -3.46662833 -11.02073483  20.13732512
  25.26241771 -42.55805811 -17.32026655 -42.23135328  14.99327919
  -3.35418798  33.00543261   2.53985571 -13.60924547  35.22387489
  15.37101083 -18.32678881  14.77103435  19.89122007 -26.379538
  10.90415575 -31.1587022  -29.02830241  -0.60765485   7.28535638
 -33.26829131  16.2030819    2.64256262  -7.04465086   9.08964825
  24.65781315 -17.38096504 -55.38348115  33.64495439  -7.17832022
 -25.44313646 -19.06195525  19.29470916 -50.43840625   4.33354163
   0.81733095 -15.44817853  21.91264081  -1.11842813]
-3.5476191852524295
