In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split # функция разделения данных
from sklearn.linear_model import LinearRegression # Линейная регрессия. функции обучения
from sklearn.metrics import mean_squared_error # функция расчета среднеквадратической ошибки
import random

In [2]:
# Read the whitespace-delimited text file into a list of rows, where each row
# is the list of string tokens found on one line.
with open('boston_housing.txt', 'rt', encoding='UTF-8') as file:
    # Iterate the file object directly instead of materialising it with
    # readlines(), and skip blank lines so that an empty or trailing line
    # cannot turn into an empty row.
    content = [fields for fields in map(str.split, file) if fields]
    

In [3]:
# Build the DataFrame from the parsed rows and name its columns.
# Every column except 'ID' is converted to float64; 'ID' is left as parsed.
column_names = ['ID', 'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
                'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']
numeric_dtypes = dict.fromkeys(column_names[1:], 'float64')
DF_bostonhousing = pd.DataFrame(content, columns=column_names).astype(numeric_dtypes)

In [4]:
# Use the 'ID' column as the row index, then preview the first two rows.
# The guard keeps the cell safe to re-run: once 'ID' is no longer a column
# (already moved into the index, or dropped), set_index('ID') would raise
# KeyError.
if 'ID' in DF_bostonhousing.columns:
    DF_bostonhousing = DF_bostonhousing.set_index('ID')
DF_bostonhousing.head(2)

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6


In [5]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
DF_bostonhousing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 506 entries, 1 to 506
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    float64
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    float64
 9   tax      506 non-null    float64
 10  ptratio  506 non-null    float64
 11  black    506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(14)
memory usage: 59.3+ KB


In [6]:
# Switch back to the default integer index; drop=True discards the old index
# values instead of re-inserting them as a column.
DF_bostonhousing = DF_bostonhousing.reset_index(drop=True)

In [11]:
# Show three randomly sampled rows; random_state=100 fixes which rows are drawn.
# Other quick checks tried here: DF_bostonhousing.isna().sum() and DF_bostonhousing.head(3).
DF_bostonhousing.sample(3, random_state=100)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
198,0.03768,80.0,1.52,0.0,0.404,7.274,38.3,7.309,2.0,329.0,12.6,392.2,6.62,34.6
229,0.44178,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8.0,307.0,17.4,380.34,3.76,31.5
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6


In [12]:
# Separate the frame into the feature columns (everything except 'medv')
# and the target column 'medv' that the model will predict.
train_data_x = DF_bostonhousing.drop(columns='medv')
train_data_y = DF_bostonhousing['medv']


In [13]:
# Split features and target into a training part (used to fit the model) and
# a test part (used to compare predictions against the actual values).
# The test part is 20% of the rows; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_x, train_data_y, random_state=42, test_size=0.2
)

In [14]:
# Inspect the shapes of the training split and then the test split
# (features first, target second on each line).
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(404, 13) (404,)
(102, 13) (102,)


In [15]:
# Create the linear regression estimator, then fit it on the training split.
LR_model = LinearRegression()
LR_model = LR_model.fit(X_train, y_train)

In [16]:
# Predict the target for the test features with the fitted model.
LR_predict = LR_model.predict(X_test)

In [17]:
# Put the model's predictions next to the actual test targets so the two
# columns can be compared by eye.
test_pred = X_test.assign(medv_predict=LR_predict, medv_real=y_test)
test_pred.loc[:, ['medv_predict', 'medv_real']]

Unnamed: 0,medv_predict,medv_real
173,28.996724,23.6
274,36.025565,32.4
491,14.816944,13.6
72,25.031979,22.8
452,18.769880,16.1
...,...,...
412,-0.164237,17.9
436,13.684867,9.6
411,16.183597,17.2
86,22.276220,22.5


In [27]:
# Estimate how much the test RMSE depends on the particular train/test split:
# repeat split -> fit -> predict for several random_state seeds, print the
# RMSE of each run and then the arithmetic mean over all runs.
def _split_rmse(features, target, rand_state, test_size=0.2):
    """Return the test-set RMSE of a linear regression for one random split.

    Args:
        features: feature table passed to train_test_split.
        target: target values passed to train_test_split.
        rand_state: seed forwarded as random_state to train_test_split.
        test_size: share of rows held out for testing.

    Returns:
        Square root of the mean squared error between the held-out targets
        and the model's predictions for them.
    """
    # Local names on purpose: the notebook-level X_train / X_test / y_train /
    # y_test / LR_model / LR_predict are no longer overwritten by this cell.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        features, target, test_size=test_size, random_state=rand_state)
    predictions = LinearRegression().fit(X_fit, y_fit).predict(X_holdout)
    # Take the root explicitly: mean_squared_error's squared=False argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    return mean_squared_error(y_holdout, predictions) ** 0.5


lst_temp = []
# NOTE(review): seed 1 is listed twice, so that split is counted twice in the
# mean - confirm whether the second 1 was meant to be 4.
# The last seed is drawn at run time, so its line and the mean vary between runs.
for rand_state in [1,2,3,1,5,6,7,100, 500, 1000, 10**5, random.randint(1,10**5)]:
    rmse = _split_rmse(train_data_x, train_data_y, rand_state)
    print(f'RMSE при random_state {rand_state} = {rmse}')
    lst_temp.append(rmse)
print('Средняя ошибка =', pd.Series(lst_temp).mean())

RMSE при random_state 1 = 4.83537345820052
RMSE при random_state 2 = 4.300630200615768
RMSE при random_state 3 = 4.116196425564974
RMSE при random_state 1 = 4.83537345820052
RMSE при random_state 5 = 4.568292042303182
RMSE при random_state 6 = 5.21759846330634
RMSE при random_state 7 = 5.835793120808383
RMSE при random_state 100 = 4.859731895955129
RMSE при random_state 500 = 6.2899785131857255
RMSE при random_state 1000 = 5.107089714032164
RMSE при random_state 100000 = 4.386783214128476
RMSE при random_state 33914 = 4.7815637497002434
Средняя ошибка = 4.927867021333452


In [24]:
# Largest actual medv value among the test rows.
test_pred['medv_real'].max()

50.0

In [25]:
# Smallest actual medv value among the test rows.
test_pred['medv_real'].min()

5.0

In [20]:
# Largest medv value predicted by the model for the test rows.
test_pred['medv_predict'].max()

42.67251161101941