In [None]:
import pandas as pd
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the raw listings file: semicolon-separated, ISO-8859-1 encoded.
data = pd.read_csv(
    '../data/raw/apartments_for_rent_classified_100K.csv',
    sep=';',
    encoding="ISO-8859-1",
)
data.head()

# Exploration


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

## Type check

In [None]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

# Price currency types

In [None]:
import seaborn as sns
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
correlation_matrix = data.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True)

## Data transform

In [None]:
import statsmodels.api as sm

In [None]:
# Predictors and response used for the exploratory OLS fit.
ols_feature_columns = ['bathrooms', 'bedrooms', 'square_feet']
features = data[ols_feature_columns]
target = data['price']

In [None]:
# Add the intercept column; prepend=False places it after the feature columns.
data_to_regressor = sm.add_constant(
    features,
    prepend=False,
)
data_to_regressor

In [None]:
# Replace missing values with 0 in both the response and the design matrix,
# modifying each object in place.
# NOTE(review): `target` was taken from data['price']; depending on the pandas
# version / copy-on-write setting, this in-place fill may also write through to
# `data` -- confirm whether later cells rely on that.
for ols_input in (target, data_to_regressor):
    ols_input.fillna(0, inplace=True)

In [None]:
# Ordinary least squares: price (endog) regressed on the features plus constant (exog).
mod = sm.OLS(endog=target, exog=data_to_regressor)

In [None]:
# Fit the OLS model; `res` holds the fitted results object.
res = mod.fit()

In [None]:
# Display the regression summary table for the fitted model.
res.summary()

## Training a model


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

## Train pipeline

In [None]:
# Free-text / categorical columns: label missing entries explicitly so the
# encoder sees a regular category. The result is assigned back to the column
# because calling `fillna(..., inplace=True)` on `data[col]` is chained
# assignment: it is not guaranteed to update `data`, and never does under
# pandas copy-on-write.
for column in ['pets_allowed', 'cityname', 'state', 'amenities']:
    data[column] = data[column].fillna('not_specified')

# Room counts: impute with the most frequent value of each column.
for column in ['bedrooms', 'bathrooms']:
    data[column] = data[column].fillna(data[column].mode().iloc[0])

# Show the remaining missing-value counts so the imputation can be checked.
data.isna().sum()


In [None]:
# Column groups routed to each preprocessing branch.
categorical_columns = [
    "currency",
    "fee",
    "pets_allowed",
    "category",
    "cityname",
    "price_type",
    "state",
]
numerical_columns = ["bathrooms", "bedrooms", "square_feet"]

# One-hot encode the categorical columns (handle_unknown='ignore' lets
# categories unseen during fit pass through transform without an error)
# and standardise the numerical columns.
cat_pipline = OneHotEncoder(handle_unknown='ignore')
numerical_pipeline = StandardScaler()

column_transform = ColumnTransformer(
    transformers=[
        ('categorical_pipeline', cat_pipline, categorical_columns),
        ('numerical_pipeline', numerical_pipeline, numerical_columns),
    ]
)

In [None]:
# Full training pipeline: preprocessing followed by a linear regressor.
train_pipeline = Pipeline(
    steps=[
        ('column_transform', column_transform),
        ('regressor', LinearRegression()),
    ]
)

## Splitting data


In [None]:
from sklearn.model_selection import train_test_split

# Model inputs: the categorical columns followed by the numerical columns.
feature_columns = [
    "currency", "fee", "pets_allowed", "category", "cityname", "price_type", "state",
    "bathrooms", "bedrooms", "square_feet",
]

# A regressor cannot be fitted on a missing target, so keep only the rows
# whose price is known (a no-op when no price is missing).
has_price = data["price"].notna()
X = data.loc[has_price, feature_columns]
target = data.loc[has_price, "price"].copy()

# Split the feature frame `X`, not the full `data` frame: passing `data`
# would carry the target column (`price`) and unused columns into
# X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

In [None]:
train_pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

# Score the fitted pipeline on the held-out test set.
y_pred = train_pipeline.predict(X_test)

# Compute each metric against the true test prices.
mse, mae, r2, mape = (
    metric(y_test, y_pred)
    for metric in (
        mean_squared_error,
        mean_absolute_error,
        r2_score,
        mean_absolute_percentage_error,
    )
)

# Report the metrics, one per line.
for label, value in (("MSE: ", mse), ("MAE: ", mae), ("R2: ", r2), ("MAPE: ", mape)):
    print(label, value)

# Saving the model

In [None]:
import pickle
# Serialise the fitted pipeline to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
with open('../model/model.sav', 'wb') as model_file:
    pickle.dump(train_pipeline, model_file)

In [None]:
# Reload the saved pipeline to check that it round-trips. The context manager
# closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load model files from a
# trusted source.
with open('../model/model.sav', 'rb') as model_file:
    model = pickle.load(model_file)
model