In [2]:
from model import get_final_df
import plotly.express as px
import pandas as pd
from collections import Counter


# Load the listings table from the parquet file; the path is relative, so it
# resolves against the notebook's working directory.
df = pd.read_parquet("../data/flats.parquet")
# Print only the first row as a quick look at the available columns.
print(df.head(1))

  propertyCode  price        date propertyType   size  exterior  rooms  \
0     99170121  75000  2022-10-30       chalet  162.0         1      5   

             district  bathrooms  province  ... Zona_Monte_Real-Calicanto  \
0  Zona Casco Antiguo          3  València  ...                         0   

  Zona_Parc_Central-Hort_de_Trenor Zona_Parc_Central_Hort_de_Trenor  \
0                                0                                0   

   Zona_Poble_Nou  Zona_Pueblo  Zona_el_Molí Zona_pueblo  good  \
0               0            0             0           0     0   

   newdevelopment  renew  
0               0      1  

[1 rows x 181 columns]


In [3]:
# Districts compared in the scatter plot. Later cells reuse this list.
distr_list = [
    "Centro",
    "Ciutat_Vella",
    "LEixample",
    "Algirós",
    "Camins_al_Grau",
    "Quatre_Carreres",
    "Poblats_Marítims",
]

# Keep the plotting subset in its own variable instead of overwriting `df`.
# Later cells (the linear-regression section) read `df` and work with every
# property type, so narrowing `df` to penthouses here would silently change
# their results when the notebook is run top to bottom.
# To cap the listing size as well, add `& (df["size"] < 300.0)` to the mask.
df_penthouse = df.loc[
    (df["penthouse"] == 1) & df["district_renamed"].isin(distr_list)
]

# Price against size for penthouses, one colour and one OLS trend line per
# district (plotly's "ols" trendline needs statsmodels to be installed).
fig = px.scatter(
    df_penthouse,
    x="size",
    y="price",
    color="district_renamed",
    trendline="ols",
)
fig.show()

In [3]:
def check_nans_df(df):
    """Print one line per column of ``df``: the column label and its NaN count.

    Nothing is returned; the report is written to standard output in the
    frame's column order.
    """
    for column_name in df.columns:
        missing_count = df[column_name].isna().sum()
        print(column_name, missing_count)

In [13]:
# Report how wide the loaded table is (second entry of the frame's shape).
print(f"Number of columns in the database: {df.shape[1]}")

Number of columns in the database: 181


In [4]:
# check_nans_df(df)

propertyCode 0
price 0
date 0
propertyType 0
size 0
exterior 0
rooms 0
district 0
bathrooms 0
province 0
municipality 0
description 94
status 1
newDevelopment 0
priceChangeCount 0
priceArea 0
district_renamed 0
chalet 0
countryHouse 0
duplex 0
flat 0
penthouse 0
studio 0
Alameda_Park 0
Alboraya_Centro 0
Algirós 0
Antigua_Moreria 0
Avda_de_Abril_9_de_Octubre 0
Avenida_de_la_Paz 0
Barranquet_El_Salvador 0
Barrio_de_la_Luz 0
Bega_de_Mar 0
Benicalap 0
Benimaclet 0
Beniopa_San_Pere 0
Benipeixcar_El_Raval 0
Bonavista_Lago_San_Lorenzo 0
Calicanto_Monte_Real 0
Camins_al_Grau 0
Campanar 0
Campolivar 0
Camì_Paterna-Lloma_del_Calderer 0
Cardenal_Benlloch 0
Casas_Verdes 0
Casc_Urbà 0
Casco_Antiguo 0
Casco_Urbano 0
Centro 0
Centro_Ciudad 0
Centro_El_Castillo 0
Centro_Puerto 0
Ciutat_Vella 0
Corral_Nou_Montros_Estepar 0
Doctor_Palos_Alto_Palancia 0
El_Alborgí 0
El_Bosque 0
El_Carme-Sant_Agustí-Bonavista 0
El_Carmen 0
El_Castell 0
El_Corralet_Bonanza_Tres_Rutas 0
El_Dossel 0
El_Faro 0
El_Mercado 0
El

# Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Rows: listings whose district is in `distr_list`.
# Columns: the per-district indicator columns, the target ("price"), the
# numeric/flag features, and "district_renamed" (carried through the split and
# removed again before fitting).
# NOTE(review): the district indicators and the property-type indicators each
# look like complete one-hot sets, which would make them collinear with the
# intercept -- confirm before interpreting individual coefficients.
df_linear = df.loc[
    df["district_renamed"].isin(distr_list),
    distr_list
    + [
        "price",
        "rooms",
        "size",
        "exterior",
        "bathrooms",
        "priceChangeCount",
        "good",
        "newdevelopment",
        "renew",
        "chalet",
        "countryHouse",
        "duplex",
        "flat",
        "penthouse",
        "studio",
        "district_renamed",
    ],
]

# Target vector and feature matrix.
y = df_linear["price"]
X = df_linear.drop(columns=["price"])

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The district label is text, so it is removed from both feature matrices
# before they reach the estimator.
X_train, X_test = (
    part.drop(columns=["district_renamed"]) for part in (X_train, X_test)
)

# Sanity check: features and targets must have matching lengths per split.
print(
    *(len(part) for part in (X_train, y_train, X_test, y_test)),
    sep="\n",
)

# Assigned (not left as a bare expression) so the cell shows no estimator repr.
model = LinearRegression().fit(X_train, y_train)

6000
6000
1501
1501


In [10]:
# R^2 on the rows the model was fitted on: describes the fit, not how well the
# model generalises.
r_sq = model.score(X_train, y_train)
print(f"coefficient of determination: {r_sq}")

print(f"intercept: {model.intercept_}")

# One coefficient per feature column, in the column order of X_train.
print(f"coefficients: {model.coef_}")

# R^2 on the held-out rows. Comparing it with the training value above shows
# whether the fit carries over to listings the model has not seen.
r_sq_test = model.score(X_test, y_test)
print(f"coefficient of determination (test): {r_sq_test}")

coefficient of determination: 0.7201943298419784
intercept: 53249.72944036656
coefficients: [-2.00893460e+05  1.03085480e+05  1.24123906e+05 -1.36670921e+04
  1.30991177e+04 -1.89536707e+04 -6.79428067e+03 -1.95615517e+04
  2.77668760e+03 -3.67435860e-10  5.61854265e+04 -5.89504383e+03
 -1.13109861e+05 -9.11914505e+04 -1.52263397e+05 -6.16968701e+04
 -2.02648358e+04  4.10483075e+04 -1.80656796e+04  1.08113030e+05
 -4.91339526e+04]


In [11]:
# Predict a price for every row of the held-out feature matrix.
y_pred = model.predict(X_test)
# Printing the array shows an abbreviated view (first and last values).
# NOTE(review): the output recorded with this cell contains a negative
# predicted price; the linear model does not constrain predictions to be
# positive.
print(y_pred)

[ 66077.45676615 414608.13673576 225523.83487758 ... 250793.93355428
 791946.22364195 -28590.43368443]


In [12]:
# Side-by-side table of the test features with the actual and predicted price.
# `assign` returns a new frame, so X_test itself is left untouched; y_test is
# aligned on the index, while y_pred (an array) is attached positionally.
result_linear = X_test.assign(
    price_true=y_test,
    price_predicted=y_pred,
)

print(result_linear.head(2))

       Centro  Ciutat_Vella  LEixample  Algirós  Camins_al_Grau  \
16731       1             0          0        0               0   
23930       0             1          0        0               0   

       Quatre_Carreres  Poblats_Marítims  rooms   size  exterior  ...  \
16731                0                 0      3  107.0         1  ...   
23930                0                 0      2  116.0         1  ...   

       newdevelopment  renew  chalet  countryHouse  duplex  flat  penthouse  \
16731               0      0       0             0       0     1          0   
23930               0      0       0             0       0     1          0   

       studio  price_true  price_predicted  
16731       0      135000     66077.456766  
23930       0      350000    414608.136736  

[2 rows x 23 columns]
