# Задача 1

**Загрузите датасет для регрессии, выполните предварительную обработку (удалите пропуски, приведите все признаки к числам), выделите целевой признак и предикторы, разбейте данные на обучающую и тестовую выборку.**

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt, pow
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from datetime import datetime as dt
import folium

- **Загрузка датасета**

In [2]:
df = pd.read_csv('../data/trip_duration_task.csv')

df

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
0,id1080784,2.0,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,400
1,id0889885,1.0,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,1100
2,id0857912,2.0,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,1635
3,id3744273,2.0,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.961670,40.759720,-73.956779,40.780628,1141
4,id0232939,1.0,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.017120,40.708469,-73.988182,40.740631,848
...,...,...,...,...,...,...,...,...,...,...
729317,id3905982,2.0,2016-05-21 13:29:38,2016-05-21 13:34:34,2,-73.965919,40.789780,-73.952637,40.789181,296
729318,id0102861,1.0,2016-02-22 00:43:11,2016-02-22 00:48:26,1,-73.996666,40.737434,-74.001320,40.731911,315
729319,id0439699,1.0,2016-04-15 18:56:48,2016-04-15 19:08:01,1,-73.997849,40.761696,-74.001488,40.741207,673
729320,id2078912,1.0,2016-06-19 09:50:47,2016-06-19 09:58:14,1,-74.006706,40.708244,-74.013550,40.713814,447


## Предобработка датасета

- **Удаление столбцов id, dropoff_datetime(т.к. является искомым)**

In [3]:
df1 = df.drop(["id", "dropoff_datetime"], axis=1)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729322 entries, 0 to 729321
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   vendor_id          727135 non-null  float64
 1   pickup_datetime    729322 non-null  object 
 2   passenger_count    729322 non-null  int64  
 3   pickup_longitude   729322 non-null  float64
 4   pickup_latitude    727475 non-null  float64
 5   dropoff_longitude  729322 non-null  float64
 6   dropoff_latitude   729322 non-null  float64
 7   trip_duration      729322 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 44.5+ MB


- **Исключение пустых строк**

In [4]:
df2 = df1.dropna()

- **Отбор основных путей перездок для улучшения результата прогноза**

In [5]:
dfl = df.head(500)
m = folium.Map(location=[40.753093719482415, -73.97315979003906], zoom_start=12)
result = [folium.Marker([x1, y1]).add_to(m) and folium.Marker([x2, y2]).add_to(m) for x1, y1, x2, y2 in zip(dfl['pickup_latitude'], dfl['pickup_longitude'], dfl['dropoff_latitude'], dfl['dropoff_longitude'])]
m.save("name1.html")
#40.691510, -74.020491
#40.820779, -73.919535

df3 = df2[(40.691510 < df2.pickup_latitude) & (40.691510 < df2.dropoff_latitude) & (df2.pickup_latitude < 40.820779) & (df2.dropoff_latitude < 40.820779)]
df3 = df3[(-74.020491 < df3.pickup_longitude) & (-74.020491 < df3.dropoff_longitude) & (df3.pickup_longitude < -73.919535) & (df3.dropoff_longitude < -73.919535)]

![plot](./loc.png)

- **Выделение прзнака из даты отправки - часовой период, из кол-ва пассажиров - 2 типа вместительности; удаление столбца vendor_idt**

In [6]:
df4 = df3
df_hours = pd.get_dummies(df4["pickup_datetime"].apply(lambda date: int(date[11:13])), prefix="hour")
df4["hour_4_9"] = df_hours.hour_4 + df_hours.hour_5 + df_hours.hour_6 + df_hours.hour_7 + df_hours.hour_8 + df_hours.hour_9
df4["hour_10_15"] = df_hours.hour_10 + df_hours.hour_11 + df_hours.hour_12 + df_hours.hour_13 + df_hours.hour_14 + df_hours.hour_15
df4["hour_16_15"] = df_hours.hour_16 + df_hours.hour_17 + df_hours.hour_18 + df_hours.hour_19 + df_hours.hour_20 + df_hours.hour_21
df4["hour_night"] = df_hours.hour_22 + df_hours.hour_23 + df_hours.hour_0 + df_hours.hour_1 + df_hours.hour_2 + df_hours.hour_3

print(df4["passenger_count"].unique())
df4["pas0-3"] = df4["passenger_count"].apply(lambda count: 1 if count < 4 else 0)
df4["pas4-6"] = df4["passenger_count"].apply(lambda count: 1 if count > 3 else 0)

df4 = df4.drop(["pickup_datetime", "vendor_id", "passenger_count"], axis=1)
df4.info()

[1 2 6 3 4 5 0]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 629577 entries, 0 to 729321
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_longitude   629577 non-null  float64
 1   pickup_latitude    629577 non-null  float64
 2   dropoff_longitude  629577 non-null  float64
 3   dropoff_latitude   629577 non-null  float64
 4   trip_duration      629577 non-null  int64  
 5   hour_4_9           629577 non-null  uint8  
 6   hour_10_15         629577 non-null  uint8  
 7   hour_16_15         629577 non-null  uint8  
 8   hour_night         629577 non-null  uint8  
 9   pas0-3             629577 non-null  int64  
 10  pas4-6             629577 non-null  int64  
dtypes: float64(4), int64(3), uint8(4)
memory usage: 40.8 MB


- **Избавляемся от слишком завышенных параметров времени поездки**

In [7]:
df5 = df4
print(df5[df5["trip_duration"]>8000].shape[0])
df5 = df5[df5["trip_duration"]<8000]
df5.info()

838
<class 'pandas.core.frame.DataFrame'>
Int64Index: 628739 entries, 0 to 729321
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_longitude   628739 non-null  float64
 1   pickup_latitude    628739 non-null  float64
 2   dropoff_longitude  628739 non-null  float64
 3   dropoff_latitude   628739 non-null  float64
 4   trip_duration      628739 non-null  int64  
 5   hour_4_9           628739 non-null  uint8  
 6   hour_10_15         628739 non-null  uint8  
 7   hour_16_15         628739 non-null  uint8  
 8   hour_night         628739 non-null  uint8  
 9   pas0-3             628739 non-null  int64  
 10  pas4-6             628739 non-null  int64  
dtypes: float64(4), int64(3), uint8(4)
memory usage: 40.8 MB


- **Выделение целевого признака и предикатов.**

In [8]:
y = df5['trip_duration']
X = df5.drop(['trip_duration'], axis = 1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7)

In [10]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((188621, 10), (188621,), (440118, 10), (440118,))

# Задача 2

**Решите задачу регрессии на ваших данных с использованием моделей sklearn (линейная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр.**

- **!Подготовка нескольких гиперпараметров**

In [11]:
parameters = {'alpha': np.arange(0.1, 1, 0.05)}

- **Линейная регрессия.**

In [12]:
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [13]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr = lr.score(X_test, y_test)
print(f'R^2: {score_lr}')

MAE: 340.9889903105531
MSE: 202808.61893468324
RMSE: 450.34277937442636
MAPE: 1.0139020367349532
R^2: 0.02959874216632663


In [14]:
lr.coef_

array([-2.08640988e+03, -1.32181518e+02,  2.78158111e+03, -2.71781275e+03,
       -5.22193147e+14, -5.22193147e+14, -5.22193147e+14, -5.22193147e+14,
        3.72818860e+14,  3.72818860e+14])

- **Линейная регрессия. Регуляризация L1 / Ridge.**

In [15]:
# L1(RandomizedSearchCV)
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge = Ridge(alpha=ridge_optimal.best_params_['alpha']).fit(X_train, y_train)
y_pred = ridge.predict(X_test)


In [16]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr_L1_RandomSearchCV = ridge.score(X_test, y_test)
print(f'R^2: {score_lr_L1_RandomSearchCV}')

MAE: 340.9799806928698
MSE: 202803.33999803013
RMSE: 450.336918315643
MAPE: 1.0139346883744897
R^2: 0.029624000889526547


In [17]:
ridge.coef_

array([-2075.11439083,  -141.77147048,  2764.53126776, -2705.63131971,
         -32.38859288,    82.07020624,    14.51628233,   -64.19789562,
          -3.04774238,     3.04774238])

In [18]:
# L1(GridSearchCV)
ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge = Ridge(alpha=ridge_optimal.best_params_['alpha']).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

In [19]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr_L1_GridSearchCV = ridge.score(X_test, y_test)
print(f'R^2: {score_lr_L1_GridSearchCV}')

MAE: 340.9799806928698
MSE: 202803.33999803013
RMSE: 450.336918315643
MAPE: 1.0139346883744897
R^2: 0.029624000889526547


In [20]:
ridge.coef_

array([-2075.11439083,  -141.77147048,  2764.53126776, -2705.63131971,
         -32.38859288,    82.07020624,    14.51628233,   -64.19789562,
          -3.04774238,     3.04774238])

- **Линейная регрессия. Регуляризация L2 / Lasso.**

In [21]:
# L2(RandomizedSearchCV)
lasso_optimal = RandomizedSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso = Lasso(alpha=lasso_optimal.best_params_['alpha']).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [22]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr_L2_RandomSearchCV = ridge.score(X_test, y_test)
print(f'R^2: {score_lr_L2_RandomSearchCV}')

MAE: 341.21275887334497
MSE: 203035.03886063452
RMSE: 450.594095456914
MAPE: 1.0146452253262646
R^2: 0.029624000889526547


In [23]:
lasso.coef_

array([-1353.76774794,  -338.3196697 ,  1784.0358348 , -2138.17607937,
         -26.14173871,    87.25348486,    21.27248489,   -52.10093994,
          -4.92951148,     0.        ])

In [24]:
# L2(GridSearchCV)
lasso_optimal = GridSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso = Lasso(alpha=lasso_optimal.best_params_['alpha']).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [25]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_lr_L2_GridSearchCV = ridge.score(X_test, y_test)
print(f'R^2: {score_lr_L2_GridSearchCV}')

MAE: 341.21275887334497
MSE: 203035.03886063452
RMSE: 450.594095456914
MAPE: 1.0146452253262646
R^2: 0.029624000889526547


In [26]:
lasso.coef_

array([-1353.76774794,  -338.3196697 ,  1784.0358348 , -2138.17607937,
         -26.14173871,    87.25348486,    21.27248489,   -52.10093994,
          -4.92951148,     0.        ])

# Задача 3

**Решите задачу регрессии на ваших данных с использованием моделей sklearn (полиномиальная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр.**

- **Добавление предиката, степень полинома 2.**

In [45]:
pf = PolynomialFeatures(4)
X_train_p = pf.fit_transform(X_train)
X_test_p = pf.fit_transform(X_test)


- **Полиномиальная регрессия.**

In [46]:
pr = LinearRegression().fit(X_train_p, y_train)
y_pred = pr.predict(X_test_p)


In [47]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_pr = pr.score(X_test_p, y_test)
print(f'R^2: {score_pr}')

MAE: 217.9490247967768
MSE: 97903.28288415968
RMSE: 312.89500297090024
MAPE: 0.7509673973740372
0.12196524674561215
R^2: 0.5315511275808644


In [30]:
pr.coef_

array([ 9.62274759e+02, -8.54688002e+08, -1.03356501e+09, ...,
        0.00000000e+00,  0.00000000e+00, -4.38519146e+06])

- **Полиномиальная регрессия. Регуляризация L1 / Ridge.**

In [31]:
ridge = Ridge().fit(X_train_p, y_train)
y_pred = ridge.predict(X_test_p)

In [32]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
score_pr_L1 = ridge.score(X_test_p, y_test)
print(f'R^2: {score_pr_L1}')

MAE: 241.3421293506089
MSE: 116399.9035349563
RMSE: 341.1743008125851
MAPE: 0.8142651581742988
R^2: 0.4430482619754039


# Задача 4

**Вычислите значения метрик $R^2$, MAE, MSE, RMSE, MAPE для всех обученных моделей; выберите лучшую модель.**

- **Сравнение всех моделей по наибольшему коэффициенту детерминации.**

In [33]:
print(score_lr)
print(score_lr_L1_RandomSearchCV)
print(score_lr_L1_GridSearchCV)
print(score_lr_L2_RandomSearchCV)
print(score_lr_L2_GridSearchCV)
print("----")
print(score_pr)
print(score_pr_L1)

0.02959874216632663
0.029624000889526547
0.029624000889526547
0.029624000889526547
0.029624000889526547
----
0.5315511275808644
0.4430482619754039


**Наилучшей моделью оказалась: Полиномиальная регрессия 4 степени**

# Задача 5
- **Самостоятельно реализуйте (желательно в виде класса) модель линейной регрессии с регуляризацией (можете выбрать L1 или L2).**
- **Самостоятельно реализуйте вычисление всех используемых метрик (в виде функций, принимающих два аргумента).**
- **Обучите вашу модель линейной регрессии на ваших данных; оцените качество с помощью реализованных вами метрик.**

In [38]:
class Metrics:
    @staticmethod
    def  mean_absolute_error(y_test, y_pred):
        real, pred = np.array(y_test), np.array(y_pred)
        return float(np.mean(np.abs(real - pred)))
    @staticmethod
    def mean_squared_error(y_test, y_pred):
        real, pred = np.array(y_test), np.array(y_pred)
        return float(np.mean((real - pred) ** 2))
    @staticmethod
    def root_mean_squared_error(y_test, y_pred):
        return float(np.sqrt(Metrics.mean_squared_error(y_test, y_pred)))
    @staticmethod
    def mean_absolute_percentage_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((real - pred) / np.abs(real)))
    @staticmethod
    def r_2_score(y_test, y_pred):
        real, pred = np.array(y_test), np.array(y_pred)
        return float(1 - np.sum((real - pred)**2) / np.sum((real - np.mean(real))**2))
        #return ((real - pred), list(real-pred)-1,  np.sum((real - np.mean(real))**2))
    #Metrics.r_2_score([1, 1, 3], [1, 2, 3])

In [48]:
class MyLinearRegression:
    def __init__(self, iterations =1000, learning_rate = 1.2*1e-16):
        self.lr = learning_rate  
        self.i = iterations        

    def transform(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)  #добавление к матрице столбца с единицами

    def loss_func(self, x, y, w):
        return sum((np.dot(x, w) - y) ** 2)/x.shape[1] #функция потерь sum((<w, x> - y)^2) * 1/len(x)
    
    def gradient(self, X, y, XT, l, w):
        return w - self.lr * 2 * np.dot(XT, np.dot(X, w) - y) / l #функция для вычисления градиента
                                                                  #w -n∇Q(w, X) = w - 2n/l * X.T(Xw - y)

    def fit(self, x, y):
        loss_abs = np.inf #переменная, отвечающая за прекращение работы алгоритма при минимальном изменении знач. функции потерь 
        eps = 1e-20
        
        X = self.transform(x)
        w = np.zeros(X.shape[1]) #изначально веса = 0
        iter = 0
        XT = X.T
        l = X.shape[0]
        while iter <= self.i:
            if (iter % 100 == 0):
                loss = self.loss_func(X, y, w)
                w = self.gradient(X, y, XT, l, w)
                loss_abs = np.abs(loss - self.loss_func(X, y,w))
                if(loss_abs <= eps):
                    break
            else:
                w = self.gradient(X, y, XT, l, w)
            iter += 1
            
        self.w = w
        
        
    def predict(self, x):
        return np.dot(self.transform(x), self.w) #ожидаемые y

In [49]:
(X1_train, 
 X2_train, 
 y1_train, y2_train) = train_test_split(X_train_p, y_train, 
                                     test_size=0.9, 
                                     random_state=0)

In [50]:
print(pr.coef_)

mlr = MyLinearRegression(iterations =2000, learning_rate = 7*1e-18)
mlr.fit(X1_train, y1_train)

[ 9.62274759e+02 -8.54688002e+08 -1.03356501e+09 ...  0.00000000e+00
  0.00000000e+00 -4.38519146e+06]


In [51]:
pred1 = mlr.predict(X_test_p)
print(pred1)
pred2 = pr.predict(X_test_p)
print(pred2)
print(Metrics.r_2_score(pred1, y_test))
print(Metrics.r_2_score(pred2, y_test))

[701.47618881 701.21664512 699.95871908 ... 700.65433072 700.86395798
 699.72383245]
[ 954.35742188  732.35742188 1388.35742188 ...  870.35742188  547.35742188
  318.35742188]
-809115.7352924455
0.12196524674561215


In [59]:
class MyRidge:
    def __init__(self, iterations =1000, learning_rate = 1.2*1e-16, alpha = 2):
        self.lr = learning_rate  
        self.i = iterations        
        self.alpha = alpha
    def transform(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)  #добавление к матрице столбца с единицами

    def loss_func(self, x, y, w):
        return sum((np.dot(x, w) - y) ** 2)/x.shape[1] #функция потерь sum((<w, x> - y)^2) * 1/len(x)
    
    def gradient(self, X, y, XT, l, w):
        return w - self.lr * ( 2 *np.dot(XT, np.dot(X, w) - y) + self.alpha * sum(np.abs(w))) / l #функция для вычисления градиента
                                                                                               #w -n(∇Q(w, X) + a * sum(|w|)) = w - 2n/l * (X.T(Xw - y) + a * sum(w^2))

    def fit(self, x, y):
        loss_abs = np.inf #переменная, отвечающая за прекращение работы алгоритма при минимальном изменении знач. функции потерь 
        eps = 1e-20
        
        X = self.transform(x)
        w = np.zeros(X.shape[1]) #изначально веса = 0

        iter = 0
        XT = X.T
        l = X.shape[1]
        while iter <= self.i:
            if (iter % 100 == 0):
                loss = self.loss_func(X, y, w)
                w = self.gradient(X, y, XT, l, w)
                loss_abs = np.abs(loss - self.loss_func(X, y,w))
                if(loss_abs <= eps):
                    break
            else:
                w = self.gradient(X, y, XT, l, w)
            iter += 1
            
        self.w = w
        
        
    def coef(self):
        return self.w
    
    
    def predict(self, x):
        return np.dot(self.transform(x), self.w) #ожидаемые y

In [60]:
print(pr.coef_)
mlrr = MyRidge(iterations = 1000, learning_rate = 7.02*1e-18, alpha = 1)
mlrr.fit(X1_train, y1_train)


[ 9.62274759e+02 -8.54688002e+08 -1.03356501e+09 ...  0.00000000e+00
  0.00000000e+00 -4.38519146e+06]


In [57]:
pred1 = mlrr.predict(X_test_p)
print(pred1)
pred2 = pr.predict(X_test_p)
print(pred2)
print(Metrics.r_2_score(pred1, y_test))
print(Metrics.r_2_score(pred2, y_test))

[1.50633668e+12 1.50578610e+12 1.50486380e+12 ... 1.50635190e+12
 1.50503150e+12 1.50437524e+12]
[ 954.35742188  732.35742188 1388.35742188 ...  870.35742188  547.35742188
  318.35742188]
-3598968.166666122
0.12196524674561215


In [61]:
class LinerReg:
    def __init__(self, lr = 1e-4, iters = 20000):
        self.lr = lr
        self.iters = iters

    def transform_(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)
    
    def loss_func(self, x, y, w):
        return np.mean((y - np.dot(x, w)) ** 2)
    
    def fit(self, x, y):
        dist = np.inf
        eps = 1e-4
        X = self.transform_(x)
        
        w = np.zeros(X.shape[1])
        iter = 0
        
        while dist > eps and iter <= self.iters:
            loss = self.loss_func(X, y, w)
            w = w - self.lr * 2 * np.dot(X.T, np.dot(X, w) - y) / X.shape[0]
            dist = np.abs(loss - self.loss_func(X, y, w))
            iter += 1
        print(iter)
        self.w = w

    def predict(self, x):
        return np.dot(self.transform_(x), self.w)

In [68]:
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=230, n_features=1, noise=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
mlr = LinerReg()
mlr.fit(X_train, y_train)

20001


In [70]:
pred1 = mlr.predict(X_test)
print(pred1)

print(Metrics.r_2_score(pred1, y_test))


[  47.13140226  -86.91333037  -43.95908027   25.14757948   27.9340972
   63.46813822    4.60706389   43.13809128  150.05276805  -11.80805981
   31.82621891  -22.37874104    4.6760138   -30.34299429   18.2489082
   11.0025241   -58.91037751   67.18972922   49.28905834  -64.39842012
  -33.09543936   12.83100884   55.5279751     2.31736202  -28.21800902
 -114.43853927  -37.37942745   10.80152953   -7.0037157    66.43672345
   78.51671337  -81.33808939  -21.85189228  -80.56803675   54.31205995
   11.06655467   96.76722113   24.95898435  -13.10490939  101.99615308
   55.20238424  -24.22429407   53.78822249   65.8566468   -43.2048544
   44.67387896  -54.46948121  -49.44806794   17.54021185   36.14426526
  -59.44184314   57.16359955   25.20106748    2.36802801   40.39703285
   77.09164312  -21.99496036 -111.56797861   98.27459226    2.0529655
  -40.99772917  -25.95710242   64.45065352  -99.91229552   29.1867534
   20.89894419  -17.43932724   70.62119333   16.33630462]
0.5516111674658459
