# Линейная регрессия
__Суммарное количество баллов: 10__


In [60]:
from sklearn.datasets import make_blobs, make_moons
from sklearn.model_selection import train_test_split
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib
import copy
import pandas as pd

In [61]:
def generate_synthetic(size, dim=6, noise=0.1):
    X = np.random.randn(size, dim)
    w = np.random.randn(dim + 1)
    noise = noise * np.random.randn(size)
    y = X.dot(w[1:]) + w[0] + noise
    return X, y

#### Метрика
Для начала нужно понять, какую метрику для ошибки будем использовать. В нашем случае нам подойдет стандартная метрика RMSE. Ее и нужно реализовать.

In [62]:
def rmse(y_true, y_predicted): #np.array
    return sum(((y_true - y_predicted)**2))/len(y_true)


Теперь реализуем линейную регрессию при помощи явного решения задачи минимизации. 

#### Методы
`fit(X, y)` - решает задачу минимизации $\arg\min_{w, b}\sum ((w\cdot x + b) - y)^2$. 

`predict(X)` - строит предсказание `y` для объектов из `X`.

In [63]:
class NormalLR:
    def __init__(self):
        pass
    
    def fit(self, X, y):
        X = np.column_stack([X, [1 for i in range(len(X))]])
        self.w = np.linalg.inv(X.T@X)@X.T@y
    
    def predict(self, X):
        X = np.column_stack([X, [1 for i in range(len(X))]])
        y = np.array([])
        for el in X:
            y = np.append(y, el@self.w)
        return y

In [64]:
X, y = generate_synthetic(1024)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [65]:
regr = NormalLR()
regr.fit(X_train, y_train)
print(rmse(y_test, regr.predict(X_test)))

0.00906180839441688


1. Выполните загрузку и предобработку файлов x_boston_train и y_boston_train.
2. Разбейте x_boston_train и y_boston_train на x_train, y_train, x_test и y_test для оценки точности работы алгоритма.
3. Посчитайте метрику RMSE для моделей LR. Если необходимо, попробуйте разные наборы параметров для получения лучшего результата.
4. Постройте график зависимости ошибки от коэффициента регуляризации. 

In [66]:
def read_data(path1, path2):
    x_boston_train = np.genfromtxt(path1, delimiter=",", skip_header=1)
    y_boston_train = np.genfromtxt(path2, delimiter=",", skip_header=1)
    return x_boston_train[:, 0:-1], y_boston_train[:, 1]
    

In [67]:
X_boston_train , y_boston_train  = read_data('/Users/vasilijdronov/Machine learning/x_boston_train.csv', '/Users/vasilijdronov/Machine learning/y_boston_train.csv')

In [68]:
X_tr, X_t, y_tr, y_t = train_test_split(X_boston_train , y_boston_train , train_size=0.8, shuffle=False)

In [69]:
regr = NormalLR()
regr.fit(X_tr, y_tr)
print(rmse(y_t, regr.predict(X_t)))

19112503.598250713


In [70]:
#проверка, так как

from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_tr, y_tr)
print(rmse(y_t, reg.predict(X_t)))

19112503.598250702


In [71]:
test = pd.read_csv('/Users/vasilijdronov/Machine learning/x_boston_test.csv')
test

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,Id
0,0.10793,0.0,8.56,0,0.520,6.195,54.4,2.7778,5,384.0,20.9,393.49,13.00,0
1,9.51363,0.0,18.10,0,0.713,6.728,94.1,2.4961,24,666.0,20.2,6.68,18.71,1
2,7.02259,0.0,18.10,0,0.718,6.006,95.3,1.8746,24,666.0,20.2,319.98,15.70,2
3,0.03615,80.0,4.95,0,0.411,6.630,23.4,5.1167,4,245.0,19.2,396.90,4.70,3
4,12.24720,0.0,18.10,0,0.584,5.837,59.7,1.9976,24,666.0,20.2,24.65,15.69,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.08387,0.0,12.83,0,0.437,5.874,36.6,4.5026,5,398.0,18.7,396.06,9.10,97
98,0.11132,0.0,27.74,0,0.609,5.983,83.5,2.1099,4,711.0,20.1,396.90,13.35,98
99,0.31827,0.0,9.90,0,0.544,5.914,83.2,3.9986,4,304.0,18.4,390.70,18.33,99
100,9.91655,0.0,18.10,0,0.693,5.852,77.8,1.5004,24,666.0,20.2,338.16,29.97,100


In [72]:
for_pred = test.copy()
del for_pred['Id']
#for_pred = np.array(for_pred)
for_pred

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,0.10793,0.0,8.56,0,0.520,6.195,54.4,2.7778,5,384.0,20.9,393.49,13.00
1,9.51363,0.0,18.10,0,0.713,6.728,94.1,2.4961,24,666.0,20.2,6.68,18.71
2,7.02259,0.0,18.10,0,0.718,6.006,95.3,1.8746,24,666.0,20.2,319.98,15.70
3,0.03615,80.0,4.95,0,0.411,6.630,23.4,5.1167,4,245.0,19.2,396.90,4.70
4,12.24720,0.0,18.10,0,0.584,5.837,59.7,1.9976,24,666.0,20.2,24.65,15.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.08387,0.0,12.83,0,0.437,5.874,36.6,4.5026,5,398.0,18.7,396.06,9.10
98,0.11132,0.0,27.74,0,0.609,5.983,83.5,2.1099,4,711.0,20.1,396.90,13.35
99,0.31827,0.0,9.90,0,0.544,5.914,83.2,3.9986,4,304.0,18.4,390.70,18.33
100,9.91655,0.0,18.10,0,0.693,5.852,77.8,1.5004,24,666.0,20.2,338.16,29.97


In [73]:
my_model = NormalLR()
my_model.fit(X_boston_train , y_boston_train)
pred = my_model.predict(for_pred)



Обучите модель на всех данных из x_boston_train и y_boston_train.
Сделайте submit своего решения и получите значение RMSE_score не более 5000

In [74]:
submission = pd.DataFrame(columns = ["Id", "Expected"])
submission["Id"] = test["Id"]
submission["Expected"] = pred
submission.to_csv('submission.csv', index=False)