In [None]:
import pandas as pd
import numpy as np
import pickle

from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2, mean_absolute_error as mae, mean_squared_error as mse
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE

import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Relative path of the CSV that is loaded into `test_df` below.
test_file = './test.csv'

In [None]:
# Relative path of the CSV that is loaded into `train_df` below.
train_file= './train.csv'

In [None]:
# Load the test CSV with pandas' default parsing options.
test_df = pd.read_csv(test_file)

In [None]:
# Load the training CSV with pandas' default parsing options.
train_df = pd.read_csv(train_file)

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Treat Id as a label rather than a number so it is excluded from numeric
# summaries. From here on, comparisons against train_df['Id'] need string values.
train_df['Id'] = train_df['Id'].astype(str)
train_df.dtypes['Id']

## Обзор количественных переменных

In [None]:
# Keep only the float64 / int64 columns and show their summary statistics.
train_num_features = train_df.select_dtypes(include=['float64', 'int64'])
train_num_features.describe()

In [None]:
# One histogram per numeric column; the trailing ';' suppresses the array-of-axes repr.
train_num_features.hist(figsize=(10,10), bins=30);

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(10, 6))

sns.heatmap(
    train_num_features.corr(),
    annot=True,
    linewidths=.5,
    cmap='YlGnBu',
    ax=ax,
)

ax.set_title('Correlation matrix')
plt.show()

### Обработка пропусков

In [None]:
# Number of missing values in each column of the training frame.
train_df.isna().sum(axis=0)

In [None]:
# Fill missing LifeSquare values with the mean of the observed ones.
train_df['LifeSquare'] = train_df['LifeSquare'].fillna(train_df['LifeSquare'].mean())

In [None]:
# Summary statistics of Healthcare_1 (missing values are excluded by describe()).
train_df['Healthcare_1'].describe()

In [None]:
# Inspect the distribution of Healthcare_1 before imputing it.
ax = train_df['Healthcare_1'].hist()
ax.set_ylabel('count')
ax.set_xlabel('Healthcare_1')
ax.set_title('Distribution of Healthcare_1')
plt.show()

In [None]:
# Fill missing Healthcare_1 values with the mean of the observed ones.
train_df['Healthcare_1'] = train_df['Healthcare_1'].fillna(train_df['Healthcare_1'].mean())

### Поиск и обработка выбросов

In [None]:
# Rows whose Rooms value is zero or negative; their Ids are kept in
# `id_for_test` so the same rows can be re-displayed after the fix.
id_for_test = train_df.loc[train_df['Rooms'] <= 0, 'Id']
train_df.loc[train_df['Rooms'] <= 0]

In [None]:
# Replace non-positive room counts with the most frequent Rooms value.
train_df['Rooms'] = train_df['Rooms'].mask(
    train_df['Rooms'] <= 0,
    train_df['Rooms'].mode()[0],
)

In [None]:
# Verify: re-display the rows that were flagged before the fix.
train_df.loc[train_df['Id'].isin(id_for_test)]

In [None]:
# Rows where the living area exceeds the total area; their Ids are kept in
# `id_for_test` so the same rows can be re-displayed after the fix.
id_for_test = train_df.loc[train_df['Square'] < train_df['LifeSquare'], 'Id']
train_df[train_df['Square'] < train_df['LifeSquare']]

In [None]:
# Swap Square and LifeSquare on the rows where the living area exceeds the total.
# Take the two columns for the affected rows ...
sq = train_df.loc[train_df['Square'] < train_df['LifeSquare'], ['LifeSquare', 'Square']]
# ... and exchange their *labels*. Assigning a DataFrame through .loc aligns on
# column names, so the relabelled frame is what makes the values actually swap.
sq.rename(columns={'LifeSquare': 'Square', 'Square': 'LifeSquare'}, inplace=True)
# The mask is recomputed here; train_df is still unmodified at this point.
train_df.loc[train_df['Square'] < train_df['LifeSquare'], ['Square', 'LifeSquare']] = sq

In [None]:
# Verify: re-display the rows that were flagged before the swap.
train_df.loc[train_df['Id'].isin(id_for_test)]

In [None]:
# Rows where the kitchen area exceeds the total area; their Ids are kept in
# `id_for_test` so the same rows can be re-displayed after the fix.
id_for_test = train_df.loc[train_df['Square'] < train_df['KitchenSquare'], 'Id']
train_df.loc[train_df['Square'] < train_df['KitchenSquare']]

In [None]:
# Where the kitchen is larger than the whole apartment, replace KitchenSquare
# with what is left of the total area after subtracting the living area.
train_df['KitchenSquare'] = train_df['KitchenSquare'].mask(
    train_df['Square'] < train_df['KitchenSquare'],
    train_df['Square'] - train_df['LifeSquare'],
)

In [None]:
# Verify: re-display the rows that were flagged before the fix.
train_df.loc[train_df['Id'].isin(id_for_test)]

In [None]:
# Rows whose building height (HouseFloor) is zero or negative.
# NOTE: `id_for_test` is overwritten again before the next verification cell.
id_for_test = train_df.loc[train_df['HouseFloor'] <= 0, 'Id']
train_df.loc[train_df['HouseFloor'] <= 0]

In [None]:
# Assume these apartments are on the top floor: use the apartment's own Floor
# as the building height wherever HouseFloor is non-positive.
train_df['HouseFloor'] = train_df['HouseFloor'].mask(
    train_df['HouseFloor'] <= 0,
    train_df['Floor'],
)

In [None]:
# Rows where the apartment's floor is above the building's top floor; their Ids
# are kept in `id_for_test` so the same rows can be re-displayed after the fix.
id_for_test = train_df.loc[train_df['Floor'] > train_df['HouseFloor'], 'Id']
train_df[train_df['Floor'] > train_df['HouseFloor']]

In [None]:
# Cast HouseFloor to int64, then swap Floor and HouseFloor on the rows where the
# apartment floor exceeds the building height.
# NOTE(review): astype('int64') raises if HouseFloor contains NaN — assumes none remain.
train_df['HouseFloor'] = train_df['HouseFloor'].astype('int64')
# Take the two columns for the affected rows ...
swap = train_df.loc[train_df['HouseFloor'] < train_df['Floor'], ['Floor', 'HouseFloor']]
# ... and exchange their *labels*. Assigning a DataFrame through .loc aligns on
# column names, so the relabelled frame is what makes the values actually swap.
swap.rename(columns={'HouseFloor': 'Floor', 'Floor': 'HouseFloor'}, inplace=True)
train_df.loc[train_df['HouseFloor'] < train_df['Floor'], ['Floor', 'HouseFloor']] = swap

In [None]:
# Verify: re-display the rows that were flagged before the swap.
train_df.loc[train_df['Id'].isin(id_for_test)]

In [None]:
# Check the construction year: frequency of each HouseYear value.
train_df['HouseYear'].value_counts()

In [None]:
# Rows whose HouseYear lies outside the 1900-2020 range.
train_df.loc[(train_df['HouseYear'] < 1900) | (train_df['HouseYear'] > 2020)]

In [None]:
# Manually correct HouseYear for two specific records
# (presumably the out-of-range rows listed by the previous cell — TODO confirm).
# 'Id' was cast to str earlier in the notebook, so comparing it against integers
# matches no rows and the correction is silently skipped. Compare as strings;
# the astype(str) keeps the lookup working whether Id is stored as int or str.
train_df.loc[train_df['Id'].astype(str) == '10814', 'HouseYear'] = 2005
train_df.loc[train_df['Id'].astype(str) == '11607', 'HouseYear'] = 1968

In [None]:
# Display the cleaned training frame (pandas truncates the rendered output).
train_df

### Подготовка признаков и целевой переменной

In [None]:
# Feature matrix: everything except the row identifier and the target.
X_train = train_df.drop(columns=['Id', 'Price'])

In [None]:
# One-hot encode the non-numeric columns; numeric columns pass through unchanged.
X_train = pd.get_dummies(X_train)
X_train.info()

In [None]:
# Regression target.
y_train = train_df['Price']
y_train

### Обработка test_df

In [None]:
# Column dtypes and non-null counts of the test frame, to see what needs imputing.
test_df.info()

In [None]:
# Fill missing LifeSquare in the test set with the mean of the (already cleaned)
# training column, so no statistic is computed from the test data.
test_df['LifeSquare'] = test_df['LifeSquare'].fillna(train_df['LifeSquare'].mean())

In [None]:
# Fill missing Healthcare_1 in the test set with the training-column mean,
# so no statistic is computed from the test data.
test_df['Healthcare_1'] = test_df['Healthcare_1'].fillna(train_df['Healthcare_1'].mean())

In [None]:
# Keep the test Ids; they are needed for the submission file after 'Id' is dropped.
test_ids = test_df['Id']

In [None]:
# Remove the identifier column from the test features.
# NOTE: in-place and not idempotent — running this cell twice raises KeyError.
test_df.drop('Id', axis=1, inplace=True)

In [None]:
# Display the prepared test frame (pandas truncates the rendered output).
test_df

In [None]:
# One-hot encode the test features, then align the result to the training
# columns. Encoding each split independently yields different columns whenever
# a category occurs in only one of them, and the fitted model needs exactly the
# training layout. Columns absent from test are added as all-zero; columns that
# exist only in test are dropped.
X_test = pd.get_dummies(test_df).reindex(columns=X_train.columns, fill_value=0)
X_test.info()

### Стандартизация признаков

In [None]:
# Standardiser with default settings; fitted on the training features below.
scaler = StandardScaler()

In [None]:
# Names of the float64 / int64 columns of X_train — the columns to be scaled.
# Presumably the one-hot columns have another dtype and are skipped; verify in the output.
features_for_scale = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
features_for_scale

In [None]:
# Standardise train: fit the scaler on the training columns and transform them.
# The result is a plain NumPy array (column labels and index are lost).
X_train_scaled = scaler.fit_transform(X_train[features_for_scale])
X_train_scaled

In [None]:
# Wrap the scaled array back into a labelled DataFrame. Reuse X_train's index:
# assigning a DataFrame to X_train[...] aligns on the index, so a fresh
# RangeIndex would produce NaNs if X_train's index were ever not 0..n-1
# (e.g. after dropping rows).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features_for_scale, index=X_train.index)

In [None]:
# Overwrite the numeric columns of X_train with their standardised values
# (rows are matched by index label, not by position).
X_train[features_for_scale] = X_train_scaled
X_train

In [None]:
# Standardise test: reuse the scaler fitted on train (transform only, no refit).
# The result is a plain NumPy array (column labels and index are lost).
X_test_scaled = scaler.transform(X_test[features_for_scale])
X_test_scaled

In [None]:
# Wrap the scaled array back into a labelled DataFrame. Reuse X_test's index:
# assigning a DataFrame to X_test[...] aligns on the index, so a fresh
# RangeIndex would produce NaNs if X_test's index were ever not 0..n-1.
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features_for_scale, index=X_test.index)
X_test_scaled

In [None]:
# Overwrite the numeric columns of X_test with their standardised values
# (rows are matched by index label, not by position).
X_test[features_for_scale] = X_test_scaled
X_test

## Построение модели

In [None]:
def evaluate_preds(true_values, pred_values):
    """Print R2, MAE and MSE of the predictions, each rounded to 3 decimals.

    Parameters
    ----------
    true_values : array-like
        Ground-truth target values.
    pred_values : array-like
        Predicted target values, in the same order as `true_values`.

    Returns
    -------
    None
        The three metrics are printed on separate lines as "<name>:\t<value>".
    """
    report_lines = [
        "R2:\t" + str(round(r2(true_values, pred_values), 3)),
        "MAE:\t" + str(round(mae(true_values, pred_values), 3)),
        "MSE:\t" + str(round(mse(true_values, pred_values), 3)),
    ]
    print("\n".join(report_lines))

In [None]:
# Hyper-parameter grid for the random-forest search:
# 3 forest sizes x 7 depths (5..11 inclusive) = 21 combinations.
parameters = [{'n_estimators': [100, 500, 1000],
               'max_depth': np.arange(5,12)}]

In [None]:
# Grid search over `parameters` with 5-fold cross-validation, scored by R2.
# - `estimator` must be an estimator *instance*; passing the class itself fails
#   as soon as the search tries to clone it.
# - The search has to be fitted before `best_params_` exists, so it is fitted
#   here on the same feature set the final model uses (Healthcare_1 excluded).
# - random_state makes the search reproducible across runs.
# NOTE: 21 combinations x 5 folds with up to 1000 trees each — this is slow.
gs = GridSearchCV(estimator=RandomForestRegressor(random_state=100),
                  param_grid=parameters,
                  scoring='r2',
                  cv=5,
                  n_jobs=-1)
gs.fit(X_train.drop('Healthcare_1', axis=1), y_train);

In [None]:
# Final model. NOTE(review): n_estimators=2000 and max_depth=18 both lie outside
# the grid defined in `parameters` (100/500/1000 trees, depth 5-11), so these
# values were not selected by the grid search — confirm where they come from.
rf = RandomForestRegressor(n_estimators=2000, max_depth=18, random_state=100)

In [None]:
# Best parameter combination found by the grid search.
# This attribute exists only after `gs` has been fitted.
gs.best_params_

In [None]:
# Train the forest on all features except Healthcare_1.
rf.fit(X_train.drop('Healthcare_1', axis=1), y_train)

In [None]:
# Metrics on the training data itself — the model was fitted on these rows, so
# the scores are optimistic and say nothing about generalisation.
y_train_pred = rf.predict(X_train.drop(columns='Healthcare_1'))
evaluate_preds(y_train.to_numpy().ravel(), y_train_pred.ravel())

In [None]:
# Predict prices for the test set (same feature subset as in training) and pair
# them with the Ids saved before the 'Id' column was dropped.
y_test_pred = rf.predict(X_test.drop(columns='Healthcare_1'))
submission = pd.DataFrame({'Id': test_ids, 'Price': y_test_pred})
submission

In [None]:
# Write the submission as CSV without the DataFrame index column.
submission.to_csv('./submission_cluster.csv', index=False)

In [None]:
# Persist the fitted scaler.
# NOTE(review): the file is named 'minmax.pkl' but `scaler` is a StandardScaler —
# the name is misleading; rename once nothing else depends on this path.
with open('./minmax.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [None]:
# Persist the fitted random forest.
# NOTE(review): the file name mentions 'minmax' and '16_210', which do not match
# the StandardScaler and the n_estimators=2000 / max_depth=18 / random_state=100
# used in this notebook — presumably left over from an earlier run; confirm and rename.
with open('./rfr_model_minmax_16_210.pkl', 'wb') as file:
    pickle.dump(rf, file)