## Install deps

In [63]:
# %pip install pandas
# %pip install category_encoders
# %pip install scikit-learn
# %pip install seaborn

## Imports

In [64]:
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
import pandas as pd

## Load dataset

In [65]:
dataframe = read_csv("clear_all_data.csv",sep=';')
dataframe.head()

Unnamed: 0,city,floor,floors_count,rooms_count,total_meters,price_per_m2,price,year_of_construction,living_meters,kitchen_meters,district
0,moscow,1,5,1,12.0,291666,3500000,2011,19.0,9.0,tekstil'schiki
1,moscow,17,41,1,40.4,618811,25000000,2021,15.2,9.1,horoshevskij
2,moscow,5,14,1,61.97,919799,57000000,2022,19.0,9.0,hamovniki
3,moscow,9,41,1,40.5,617283,25000000,2022,17.3,9.3,horoshevskij
4,moscow,12,17,1,47.6,324579,15450000,2019,19.0,10.0,nagatino-sadovniki


In [66]:
dataframe.head()

Unnamed: 0,city,floor,floors_count,rooms_count,total_meters,price_per_m2,price,year_of_construction,living_meters,kitchen_meters,district
0,moscow,1,5,1,12.0,291666,3500000,2011,19.0,9.0,tekstil'schiki
1,moscow,17,41,1,40.4,618811,25000000,2021,15.2,9.1,horoshevskij
2,moscow,5,14,1,61.97,919799,57000000,2022,19.0,9.0,hamovniki
3,moscow,9,41,1,40.5,617283,25000000,2022,17.3,9.3,horoshevskij
4,moscow,12,17,1,47.6,324579,15450000,2019,19.0,10.0,nagatino-sadovniki


In [67]:
# dataframe = pd.get_dummies(dataframe, columns=["district", "street", "underground"])
# dataframe.head()

## Define args and y

In [68]:
X = dataframe.drop("price_per_m2", axis=1)
y = dataframe.price_per_m2


In [69]:
X.shape

(8307, 10)

In [70]:
y.shape

(8307,)

## Split to train and test dataframes

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)

### Encode X train and X test

In [72]:
from category_encoders import CatBoostEncoder

In [73]:
cat_features = ['district', "city"]
encoder = CatBoostEncoder()

In [74]:
X_train_encoded = X_train.copy()
X_train_encoded[cat_features] = encoder.fit_transform(X_train[cat_features], y_train)

In [75]:
X_test_encoded = X_test.copy()
X_test_encoded[cat_features] = encoder.transform(X_test[cat_features])

### Save encoded values + normal names to dicts

In [76]:
city_names_and_encoded_names = pd.DataFrame()

city_names_and_encoded_names["city_name"] = X_test["city"]
city_names_and_encoded_names["city_encoded"] = X_test_encoded["city"]

unique_citys = city_names_and_encoded_names.drop_duplicates()
city_dict = dict(zip(unique_citys["city_name"], unique_citys["city_encoded"]))

In [77]:
district_names_and_encoded_names = pd.DataFrame()

district_names_and_encoded_names["district_name"] = X_test["district"]
district_names_and_encoded_names["district_encoded"] = X_test_encoded["district"]

unique_district = district_names_and_encoded_names.drop_duplicates()
district_dict = dict(zip(unique_district["district_name"], unique_district["district_encoded"]))

### Export dicts to json

In [78]:
import json
import io
import os

In [79]:
with open("city.json", "w") as file:
    json.dump(city_dict, file)

In [80]:
with open("district.json", "w") as file:
    json.dump(district_dict, file)

## Define and train models

In [81]:
model_list = []

### Lasso regression

* ### Create CV for lasso regression model

In [82]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=10, n_repeats=5)

* ### Learn model

In [83]:
from sklearn.linear_model import LassoCV
lasso_model = LassoCV(cv=cv, n_jobs=-1, max_iter=10000)
lasso_model.fit(X_train_encoded, y_train)
model_list.append(lasso_model)

### Linear regression

In [84]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train_encoded, y_train)
model_list.append(linear_model)

### ElasticNET

In [85]:
from sklearn.linear_model import ElasticNetCV
elastic_model = ElasticNetCV(cv=cv, n_jobs=-1, max_iter=10000)
elastic_model.fit(X_train_encoded, y_train)
model_list.append(elastic_model)

### Ridge

In [86]:
from sklearn.linear_model import Ridge
ridge_model = Ridge(alpha=1e-10)
ridge_model.fit(X_train_encoded, y_train)
model_list.append(ridge_model)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


## GradientBoosting

In [87]:
from sklearn.ensemble import GradientBoostingRegressor
ridge_model = GradientBoostingRegressor(learning_rate=0.1, random_state=0, max_depth=3, n_estimators=500)
ridge_model.fit(X_train_encoded, y_train)
model_list.append(ridge_model)

## Make prediction

In [88]:
predict_list = []

for model in model_list:
    predict_list.append(model.predict(X_test_encoded))

## Plot of data

In [89]:
# from matplotlib import pyplot as plt
# from matplotlib import colors
# import pandas as pd
# import seaborn as sns
# import numpy as np

# ALL_COLORS = ["blue","orange","purple","green","gray"]

# plt.figure(figsize=(400, 150))
# sns.lineplot(x=range(len(y_test)), y=y_test.sort_values(), color='black', label='Actual')
# plt.xlabel('Index')
# plt.ylabel('Predicted Value')
# plt.title('Line Plot of Predicted Values')

# for i in range(0, len(model_list)):
#     if i == 1:
#         continue
#     y = np.sort(predict_list[i])
#     sns.lineplot(x=range(len(predict_list[i])), y=y, color=ALL_COLORS[i], label=type(model_list[i]).__name__)

# plt.show()

## Check score of model

In [90]:
print("Scores: ")
for i, model in enumerate(model_list):
    print(f"\n* {type(model_list[i]).__name__}: {model.score(X_test_encoded, y_test)*100:.2f}%\t")

Scores: 

* LassoCV: 74.12%	

* LinearRegression: 56.39%	

* ElasticNetCV: 74.07%	

* Ridge: 56.39%	

* GradientBoostingRegressor: 99.48%	


## Mean errors

In [91]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

### Absolute

In [92]:
print("MAE (Mean Absolute Error): ")
for i, predict in enumerate(predict_list):
    mse = mean_absolute_error(y_true=y_test, y_pred=predict)
    print(f"\n* {type(model_list[i]).__name__}: {mse:.2f}\t")

MAE (Mean Absolute Error): 

* LassoCV: 128538.20	

* LinearRegression: 154459.61	

* ElasticNetCV: 128701.89	

* Ridge: 154459.61	

* GradientBoostingRegressor: 14814.26	


### Squared

In [93]:
print("MSE (Mean Squared Error): ")
for i, predict in enumerate(predict_list):
    print(f"\n* {type(model_list[i]).__name__}: {mean_squared_error(y_test, predict):.2f}\t")


MSE (Mean Squared Error): 

* LassoCV: 26454171653.08	

* LinearRegression: 44579983507.45	

* ElasticNetCV: 26511378631.54	

* Ridge: 44579983516.86	

* GradientBoostingRegressor: 536441874.59	
