In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error


In [2]:
#записываем данные из boston.txt в переменную df
data_columns = "CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT".split(" ")
raw_df = pd.read_csv('./boston.txt', sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
df = pd.DataFrame(data, columns=data_columns)
df["MEDV"] = target

In [3]:
df.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1,18.9


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


Вариант с sklearn

In [30]:
#разделяем тренировочный и тестовый датасеты
X = df.drop(columns=['MEDV']).values
y = df['MEDV'].values

features_names = df.drop(columns = ['MEDV']).columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler  = StandardScaler()
scaler.fit_transform(X_train)
X_train = scaler.transform(X_train) 
X_test  = scaler.transform(X_test) 

pd.DataFrame(X_train).tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
349,-0.397237,-0.505125,-0.371354,-0.281546,-0.319475,-0.68181,0.836171,-0.732385,-0.514369,-0.13857,1.163486,0.414371,0.878423
350,-0.381828,0.380752,-0.602272,-0.281546,-0.799639,-0.653963,-0.966462,0.312349,-0.746179,-1.106776,0.142897,0.333497,0.081504
351,-0.416046,3.038382,-1.316677,-0.281546,-1.048295,0.430702,-1.41531,2.156415,-0.630274,-0.763995,-0.567077,0.359879,-0.905493
352,0.926113,-0.505125,1.0055,-0.281546,1.566884,0.422348,0.933904,-0.773035,1.687825,1.557294,0.852872,-2.878413,1.527504
353,-0.390305,-0.505125,-0.371354,-0.281546,-0.319475,0.110454,0.600888,-0.49513,-0.514369,-0.13857,1.163486,-3.328288,-0.252188


In [36]:
linear_regression_model = SGDRegressor(tol=.0001, eta0=.01) 
linear_regression_model.fit(X_train, y_train)

train_predictions = linear_regression_model.predict(X_train) 
test_predictions = linear_regression_model.predict(X_test)

train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
print("Train MSE: {}".format(train_mse))
print("Test MSE: {}".format(test_mse))

Train MSE: 22.645060053303038
Test MSE: 22.06840316494181


Вариант с numpy (и без градиентного спуска)

In [48]:
#разделяем Х и у
X = df.drop(columns=['MEDV']).values
y = df['MEDV'].values

print(X.shape)
print(y.shape)

(506, 13)
(506,)


In [49]:
#добавляем столбец единиц в матирцу Х
one = np.ones((len(X),1))
X = np.append(one, X, axis=1)
#добавляем измерение в у (теперь это вектор??)
y = np.array(y).reshape((len(y),1))
print(X.shape)
print(y.shape)

(506, 14)
(506, 1)


In [52]:
#train_test_split из scikit своими руками
def train_test_split(X, y, split):

    #создаём массив индексов
    indices = np.array(range(len(X)))
    
    #вычисляем сколько элементов должно быть в train_size
    train_size = round(split * len(X))

    #/звуки блендера/
    np.random.shuffle(indices)

    #получается что train - это перемешанные индексы от нуля до скольки надо, а test - всё что осталось
    train_indices = indices[0:train_size]
    test_indices = indices[train_size:len(X)]

    #разделяем выборки
    X_train = X[train_indices, :]
    X_test = X[test_indices, :]
    y_train = y[train_indices, :]
    y_test = y[test_indices, :]
    
    return X_train, y_train, X_test, y_test

In [58]:
X_train, y_train, X_test, y_test = train_test_split(X, y, 0.7)

In [59]:
#проверяем всё ли правильно
print ("TRAINING SET")
print("X_train.shape: ", X_train.shape)
print("y_train.shape: ", y_train.shape)

print("TESTING SET")
print("X_test.shape: ", X_test.shape)
print("y_test.shape: ", y_test.shape)

TRAINING SET
X_train.shape:  (354, 14)
y_train.shape:  (354, 1)
TESTING SET
X_test.shape:  (152, 14)
y_test.shape:  (152, 1)


In [61]:
#находим коэффициент бета (метод наименьших квадратов)
def normal_equation(X, y):
    beta = np.dot((np.linalg.inv(np.dot(X.T,X))), np.dot(X.T,y))

    return beta

#предсказываем y
def predict(X_test, beta):
    return np.dot(X_test, beta)

In [70]:
beta = normal_equation(X_train, y_train)
predictions = predict(X_test, beta)

print(predictions.shape)

(152, 1)


In [69]:
#среднеквадратичная ошибка
MSE = np.square(np.subtract(y_test,predictions)).mean()

print(MSE)

21.893940591491777
