<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

In [2]:
from decouple import config
import pandas as pd
import numpy as np
from backend.utils import *
from sklearn import linear_model
import statsmodels.api as sm

pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', None)

In [3]:
from scrapper.database.mysql import Database
db = Database(config('db_host'), config('db_port'), config('db_database'), config('db_user'), config('db_password'))
db.test_connection()

Test: Database connection.
Success: Connection to database.


True

In [4]:
table = 'properties'
df = get_sqlalchemy_table_to_pandas(table, db)

In [5]:
# Get only interesting columns
df = drop_unnecessary_columns(df)
# Delete properties without price
df = df[df.price>0]

In [6]:
df = df[df.portal=='pisos']

In [7]:
# Assume NaNs are E by energy_certificate
df.energy_certificate.fillna('E', inplace=True)

In [8]:
# Floor and built-up area
df['floor_area'] = df[['usable_floor_area','floor_area']].min(axis=1)
df['built_up_area'] = df[['built_up_area', 'floor_area']].max(axis=1)
df.floor_area.fillna(df.built_up_area, inplace=True)
df = df[df.floor_area.notna()]
df.drop(columns=['usable_floor_area', 'built_up_area'], inplace=True)

In [9]:
# Delete weird floors
df = df[df.floor<=10]

# Study only locations in Barcelona
df = df[df.location.str.contains('Barcelona')]

In [10]:
# if 'Barcelona Capital' in text:
#     # Get neighborhood
#     pattern = r'^[^\(]+'
#     neighborhood = re.search(pattern, text)
#     # Get district
#     pattern = r'(?<=\()(.*?)(?=\.)'
#     district = re.search(pattern, text)
#     # City
#     city = 'Barcelona Capital'
# return

In [11]:
# Drop where no n_rooms
df = df[df.n_rooms.notna()]

In [12]:
# No bathrooms --> 1
df.loc[(df.n_bathrooms.isna()),'n_bathrooms']=1

In [13]:
# Replace garage nans with 0
df.garage.fillna(0, inplace=True)

In [14]:
# Improve flooring
df.drop(columns=['flooring'], inplace=True)

In [15]:
# Condition
df = df[df.condition.notna()]
condition_dict = {
    'good': ['in good condition', 'good'],
    'remodelled': ['remodelled'],
    'new': ['brand new'],
    'very good': ['almost new', 'very good'],
    'to reform': ['to reform']
}
df['condition'] = df['condition'].apply(map_from_value_to_key, mapper=condition_dict)

In [16]:
# Heating
heating_dict = {
    'natural gas': ['natural gas', 'gas natural', 'yes', 'central', 'gasoil'],
    'electric': ['electricity', 'electricidad']
}
df['heating'] = df['heating'].apply(map_from_value_to_key, mapper=heating_dict)
df.heating.fillna('No', inplace=True)

In [17]:
# Air conditioning
air_conditioning_dict = {
    'cold and heat': ['cold and heat', 'frío-calor'],
    'cold': ['cold', 'yes']
}
df['air_conditioning'] = df['air_conditioning'].apply(map_from_value_to_key, mapper=air_conditioning_dict)
df.air_conditioning.fillna('No', inplace=True)

In [18]:
# Antiquity
antiquity_dict = {
    '50+': ['more than 50 years', '50 to 70 years', '70 to 100 years'],
    '30-50': ['between 30 and 50 years', '30 to 50 years'],
    '20-30': ['between 20 and 30 years', '20 to 30 years'],
    '10-20': ['between 10 and 20 years', '10 to 20 years'],
    '5-10': ['between 5 and 10 years'],
    '0-5': ['less than 5 years', '1 to 5 years']
}
df['antiquity'] = df['antiquity'].apply(map_from_value_to_key, mapper=antiquity_dict)
df.antiquity.fillna('10-20', inplace=True)

In [19]:
# Facing
df.loc[(df.facing.isna()),'facing']='West'
df = df[df.facing.isin(['South', 'East', 'Southeast', 'Southwest', 'West', 'North', 'Northeast', 'Northwest'])]

In [20]:
# Swimming pool
swimming_pool_dict = {
    'communal': ['communal'],
    'own': ['own', 'con piscina']
}
df['swimming_pool'] = df['swimming_pool'].apply(map_from_value_to_key, mapper=swimming_pool_dict)
df.swimming_pool.fillna('No', inplace=True)

In [21]:
# Garden
garden_dict = {
    'communal': ['communal'],
    'own': ['own', 'private', 'yes']
}
df['garden'] = df['garden'].apply(map_from_value_to_key, mapper=garden_dict)
df.garden.fillna('No', inplace=True)

In [22]:
# Delete portal
df.drop(columns=['portal'], inplace=True)

In [23]:
numerical_features = df.select_dtypes(include=np.number).columns.tolist()
categorical_features = [feature for feature in df.columns if feature not in numerical_features]

In [24]:
df = get_dummies_from_categorical(categorical_features, df)

In [34]:
X = df.drop(columns=['price'])
Y = df[['price']]
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
display(model.summary())

0,1,2,3
Dep. Variable:,price,R-squared:,0.659
Model:,OLS,Adj. R-squared:,0.597
Method:,Least Squares,F-statistic:,10.5
Date:,"Thu, 15 Apr 2021",Prob (F-statistic):,1.35e-79
Time:,21:37:53,Log-Likelihood:,-9012.9
No. Observations:,656,AIC:,18230.0
Df Residuals:,553,BIC:,18690.0
Df Model:,102,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.966e+04,3.31e+04,1.197,0.232,-2.54e+04,1.05e+05
n_rooms,-2.866e+04,7677.299,-3.734,0.000,-4.37e+04,-1.36e+04
n_bathrooms,7.428e+04,2.21e+04,3.362,0.001,3.09e+04,1.18e+05
floor,3934.3996,5553.442,0.708,0.479,-6974.021,1.48e+04
equipped_kitchen,-3443.9965,2.44e+04,-0.141,0.888,-5.13e+04,4.44e+04
garage,126.1198,561.687,0.225,0.822,-977.182,1229.422
own_balcony,-2.176e-10,4.19e-10,-0.520,0.603,-1.04e-09,6.05e-10
terrace,1.788e+04,2.52e+04,0.709,0.478,-3.16e+04,6.74e+04
floor_area,4611.5780,365.816,12.606,0.000,3893.019,5330.137

0,1,2,3
Omnibus:,1166.997,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1035044.33
Skew:,11.382,Prob(JB):,0.0
Kurtosis:,196.26,Cond. No.,1.1e+17


In [48]:
results = pd.DataFrame(model.params, columns=['coef']).reset_index().rename(columns={'index': 'name'})
results = results[results.name.str.contains('location')].sort_values('coef')
results
# def get_most_expensive_districts(model):
#     model.coef



Unnamed: 0,name,coef
72,location_Vallbona (District Nou Barris. Barcelona Capital),-334651.130791
50,location_La Trinitat Vella (District Sant Andreu. Barcelona Capital),-287002.264523
31,location_El Turó de la Peira-Can Peguera (District Nou Barris. Barcelona Capital),-197339.630475
17,location_El Bon Pastor (District Sant Andreu. Barcelona Capital),-126007.953978
39,location_La Font de la Guatlla (District Sants-Montjuïc. Barcelona Capital),-105321.246926
...,...,...
56,location_Les Tres Torres (District Sarrià-Sant Gervasi. Barcelona Capital),188573.128151
59,location_Pedralbes (District Les Corts. Barcelona Capital),227084.459422
71,location_Sarrià (District Sarrià-Sant Gervasi. Barcelona Capital),231571.869315
35,location_La Barceloneta (District Ciutat Vella. Barcelona Capital),236656.780782


In [36]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor()
regr.fit(X, Y)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor()

<bound method BaseEstimator.get_params of RandomForestRegressor()>