In [2]:
import pandas as pd
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import numpy as np

ModuleNotFoundError: No module named 'catboost'

In [3]:
# Read the training table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='train.csv', index_col=0)
df

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Cleaning and feature engineering for the training frame, expressed as one
# method chain so that each step operates on the result of the previous one.
df = (
    df.drop(columns=['Unnamed: 0'], errors='ignore')  # stray index column, if present
    .drop_duplicates()
    .dropna()
    # Derived features (appended in this order)
    .assign(
        avg_room_sq=lambda t: t['full_sq'] / t['num_room'],
        log_avg_sport_km=lambda t: np.log(
            (t['fitness_km'] + t['stadium_km'] + t['basketball_km']) / 3
        ),
        avg_cafe_distance=lambda t: (
            t['cafe_sum_1000_min_price_avg'] + t['cafe_sum_1500_min_price_avg']
        ) / 2,
        avg_culture_km=lambda t: (t['theater_km'] + t['exhibition_km']) / 2,
        build_material_idx=lambda t: t['build_count_brick'] / t['build_count_block'],
        log_full_sq=lambda t: np.log(t['full_sq']),
        log_ttk_km=lambda t: np.log(t['ttk_km']),
    )
    # Remove the raw columns that were folded into the derived features above.
    .drop(columns=[
        'fitness_km', 'stadium_km', 'basketball_km',
        'theater_km', 'exhibition_km',
        'full_sq', 'ttk_km',
        'cafe_sum_1500_min_price_avg', 'cafe_sum_1000_min_price_avg',
    ])
)

In [None]:
# Name of the column the model has to predict.
target = 'price_doc'

# Predictors: everything except the target, restricted to fully populated rows.
X = df.drop(columns=[target]).dropna()

# Target values for exactly the rows that survived, selected by index label.
y = df[target].loc[X.index]

In [None]:
# Hyper-parameter search for a CatBoost regressor, evaluation on a hold-out
# split, and a cross-validation learning curve for the selected parameters.

# Pool over the complete dataset; used by the CV visualisation at the end of the cell.
# NOTE(review): this pool also contains the rows of the hold-out split, so the
# CV curve below is not independent of X_test/y_test.
pool = Pool(data=X, label=y)

# Parameter grid explored by GridSearchCV (every combination is tried).
params = {
    'iterations': [500, 1000],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'loss_function': ['RMSE']
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)

# train_pool is no longer needed for fitting in this cell; it is still defined
# in case cells outside this view rely on it.
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

model = CatBoostRegressor()

grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model. GridSearchCV uses refit=True by default, so best_estimator_ has
# already been retrained on the whole of X_train/y_train with the best
# parameters; an additional fit on the same data would only repeat that work.
best_model = grid_search.best_estimator_

# Predictions on the hold-out data
y_pred = best_model.predict(test_pool)

# Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report the results
print(f"Лучшие параметры: {grid_search.best_params_}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Корень из среднеквадратичной ошибки (RMSE): {rmse:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

# Visualise cross-validation results for the selected parameters:
# mean test RMSE per iteration with a +/- one standard deviation band.
cv_data = cv(pool, grid_search.best_params_, fold_count=5, plot=True)
plt.figure(figsize=(12, 6))
plt.plot(cv_data['iterations'], cv_data['test-RMSE-mean'], label='Test RMSE')
plt.fill_between(cv_data['iterations'],
                 cv_data['test-RMSE-mean'] - cv_data['test-RMSE-std'],
                 cv_data['test-RMSE-mean'] + cv_data['test-RMSE-std'],
                 alpha=0.2, color='b')
plt.xlabel('Iterations')
plt.ylabel('Test RMSE')
plt.title('CatBoost CV - Test RMSE')
plt.legend()
plt.show()

In [None]:
# Average of the predicted values stored in y_pred.
np.mean(y_pred)

In [None]:
# Read the existing submission file (default integer index) and display it.
submission = pd.read_csv(filepath_or_buffer='submission.csv')
submission

In [None]:
# Build the engineered features for the test set, predict with the tuned
# model and write the predictions into the submission file.
test_df = pd.read_csv('test.csv', index_col=0)

# Derived features. Names and formulas have to match the training columns
# exactly (avg_room_sq, avg_culture_km, ...), because the model input is
# selected below with X.columns; a differently named column raises KeyError.
test_df = (
    test_df
    .assign(
        avg_room_sq=lambda t: t['full_sq'] / t['num_room'],
        log_avg_sport_km=lambda t: np.log(
            (t['fitness_km'] + t['stadium_km'] + t['basketball_km']) / 3
        ),
        avg_cafe_distance=lambda t: (
            t['cafe_sum_1000_min_price_avg'] + t['cafe_sum_1500_min_price_avg']
        ) / 2,
        avg_culture_km=lambda t: (t['theater_km'] + t['exhibition_km']) / 2,
        build_material_idx=lambda t: t['build_count_brick'] / t['build_count_block'],
        # Log features are computed from the test rows themselves, not from
        # the training frame.
        log_full_sq=lambda t: np.log(t['full_sq']),
        log_ttk_km=lambda t: np.log(t['ttk_km']),
    )
    # Remove the raw columns that were folded into the derived features.
    .drop(columns=[
        'fitness_km', 'stadium_km', 'basketball_km',
        'theater_km', 'exhibition_km',
        'full_sq', 'ttk_km',
        'cafe_sum_1500_min_price_avg', 'cafe_sum_1000_min_price_avg',
    ])
)

# Select the columns in the same order the model was trained on.
test_preds = best_model.predict(test_df[X.columns])

# Predictions are assigned positionally.
# NOTE(review): assumes submission.csv lists rows in the same order as test.csv - confirm.
submission['price_doc'] = test_preds
submission = submission.rename(columns={'Unnamed: 0': 'id'})
print(len(test_preds))
print(submission.shape)

# index=False keeps the RangeIndex out of the file; otherwise every save adds
# another unnamed column that shows up the next time the file is read.
submission.to_csv('submission.csv', index=False)
submission