In [50]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

## Импортирование и подготовка данных

In [51]:
# Load the listings, discard the 'Unnamed: 0' column (presumably an index that
# was saved into the CSV), and keep only the first 5000 rows so the example
# runs faster. Note: this is the head of the file, not a random sample.
data = (
    pd.read_csv('kuala_clean.csv')
    .drop(columns='Unnamed: 0')
    .head(5000)
)
data.head(3)

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing,Class
0,KLCC,1250000.0,2+1,3.0,2.0,Serviced Residence,1335.0,Fully Furnished,Built-up
1,Dutamas,1030000.0,3,4.0,2.0,Condominium (Corner),1875.0,Partly Furnished,Built-up
2,Bukit Jalil,900000.0,4+1,3.0,2.0,Condominium (Corner),1513.0,Partly Furnished,Built-up


- Датасет - цены на жильё в Куала-Лумпуре вместе с параметрами квартир.
- Модель предсказывает цену по параметрам.

In [52]:
# The model target is the 'Price' column; every other column is a feature.
y = data['Price']
X = data.drop(columns='Price')

In [53]:
# One-hot encode the text-valued columns; all remaining columns are passed
# through unchanged (remainder='passthrough').
# OneHotEncoder and ColumnTransformer are imported in the notebook's import cell.

# Kept so the global NumPy RNG state after this cell is the same as before;
# nothing in the encoding step below appears to depend on it.
np.random.seed(69)

categorical_features = ['Location', 'Rooms', 'Property Type', 'Furnishing', 'Class']
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [('one_hot', one_hot, categorical_features)],
    remainder='passthrough',
)

# NOTE(review): the encoder is fitted on all rows of X, i.e. before any
# train/validation split is made. This guarantees every category is known,
# but it means the preprocessing is not re-fitted inside each CV fold.
transformed_X = transformer.fit_transform(X)

## Оценка предсказаний

In [54]:
# Baseline: random forest with default hyper-parameters, scored by R^2 over
# 5 cross-validation folds.
# The forest is created without random_state, so it relies on NumPy's global
# RNG; seeding it right here is what makes the score repeatable.
np.random.seed(47)
model = RandomForestRegressor()
baseline_scores = cross_val_score(model, transformed_X, y, cv=5, scoring='r2')
baseline_scores.mean()

0.7843809072521262

## Оценка с другими параметрами

In [55]:
# Same 5-fold R^2 evaluation, now for a forest with stricter leaf/split sizes.
# The global RNG is re-seeded with 47 so this run starts from a known state.
np.random.seed(47)
tuned_params = {'min_samples_leaf': 2, 'min_samples_split': 4}
tuned_model = RandomForestRegressor(**tuned_params)
tuned_scores = cross_val_score(tuned_model, transformed_X, y, cv=5, scoring='r2')
tuned_scores.mean()

0.8095978516077112

**После изменения параметров средний $R^2$ на кросс-валидации вырос с 0,784 до 0,810, то есть примерно на 0,025 (2,5 процентного пункта).**