In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
init_notebook_mode(connected=True) #do not miss this line
import warnings

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('data.csv',sep = ',', index_col= 'Unnamed: 0')
data.head(3)

Unnamed: 0,place_name,price_usd_per_m2,surface_total_in_m2,lat,lon,floor,rooms,description,title,avenida,...,lavadero,Belgrano,Nuñez,Palermo,Palermo Hollywood,Recoleta,surface_total_in_m2_2,distancia,ambientes,habitaciones
7,Belgrano,3066.666667,45.0,-34.559873,-58.443362,,,EXCELENTE MONOAMBIENTE A ESTRENAR AMPLIO SUPER...,JOSE HERNANDEZ 1400 MONOAMBIENTE ESTRENAR CAT...,1,...,1,1,0,0,0,0,2025.0,7.344734,,
8,Belgrano,3000.0,65.0,-34.559873,-58.443362,,,EXCELENTE DOS AMBIENTES ESTRENAR AMPLIO SUPER...,"JOSE HERNANDEZ 1400 DOS AMBIENTES ESTRENAR ,...",1,...,1,1,0,0,0,0,4225.0,7.344734,,
19,Palermo,3365.384615,104.0,-34.580504,-58.405874,,3.0,Excelente semipiso al contra frente en Bulnes ...,"Bulnes y Libertador: espectacular pulmón, con ...",0,...,0,0,0,1,0,0,10816.0,3.029007,,


In [3]:
data.shape

(4688, 29)

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [5]:
# Definimos una función que acepta una lista de features, hace el split entre train y test,
# reservando un 25% de las observaciones para testeo, y devuelve la prueba RMSE.

def train_test_rmse(feature_cols):
    X = data[feature_cols]
    y = data.price_usd_per_m2
    # Como estamos trabajando con observaciones ordenadas en el tiempo, ponemos
    # shuffle=False para evitar data leakage
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    return  print('MAE:', metrics.mean_absolute_error(y_test, y_pred),
    print('MSE:', metrics.mean_squared_error(y_test, y_pred),
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
    print('R2:', metrics.r2_score(y_test, y_pred)))))

In [6]:
train_test_rmse(['surface_total_in_m2'])

R2: 0.0028472173576031823
RMSE: 771.2421597934635 None
MSE: 594814.4690428864 None
MAE: 599.391806513355 None


In [7]:
data.rooms.isnull().sum()

1917

In [8]:
data.dropna(axis=0,subset=['distancia'],inplace=True)

In [9]:
import statsmodels.api as sm

In [10]:
feature_cols = ['surface_total_in_m2']

X = data[feature_cols]

y = data.price_usd_per_m2

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())

##

                            OLS Regression Results                            
Dep. Variable:       price_usd_per_m2   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     6.958
Date:                Tue, 17 Jan 2023   Prob (F-statistic):            0.00837
Time:                        23:55:49   Log-Likelihood:                -37761.
No. Observations:                4679   AIC:                         7.553e+04
Df Residuals:                    4677   BIC:                         7.554e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                3030.3677    

In [11]:
feature_cols = ['distancia']

X = data[feature_cols]

y = data.price_usd_per_m2

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())

##

                            OLS Regression Results                            
Dep. Variable:       price_usd_per_m2   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     21.59
Date:                Tue, 17 Jan 2023   Prob (F-statistic):           3.47e-06
Time:                        23:55:49   Log-Likelihood:                -37754.
No. Observations:                4679   AIC:                         7.551e+04
Df Residuals:                    4677   BIC:                         7.552e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3160.0096     22.746    138.926      0.0

In [12]:
feature_cols = ['surface_total_in_m2','distancia','lat','lon','avenida','balcon','gimnasio','SUM','cochera','parrilla','pileta','patio','terraza','lavadero','Belgrano','Palermo','Nuñez','Palermo Hollywood','Recoleta']

X = data[feature_cols]

y = data.price_usd_per_m2

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())

##

                            OLS Regression Results                            
Dep. Variable:       price_usd_per_m2   R-squared:                       0.322
Model:                            OLS   Adj. R-squared:                  0.319
Method:                 Least Squares   F-statistic:                     116.4
Date:                Tue, 17 Jan 2023   Prob (F-statistic):               0.00
Time:                        23:55:49   Log-Likelihood:                -36855.
No. Observations:                4679   AIC:                         7.375e+04
Df Residuals:                    4659   BIC:                         7.388e+04
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                2.191e+06   2

In [13]:
# Definimos una función que acepta una lista de features, hace el split entre train y test,
# reservando un 25% de las observaciones para testeo, y devuelve la prueba RMSE.

feature_cols = ['surface_total_in_m2','distancia','lat','lon','avenida','balcon','gimnasio','SUM','cochera','parrilla','pileta','patio','terraza','lavadero','Belgrano','Palermo','Nuñez','Palermo Hollywood','Recoleta']
X = data[feature_cols]
y = data.price_usd_per_m2
    # Como estamos trabajando con observaciones ordenadas en el tiempo, ponemos
    # shuffle=False para evitar data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', metrics.r2_score(y_test, y_pred))

MAE: 489.8852264102154
MSE: 419435.94627862435
RMSE: 647.6387467397425
R2: 0.29556742688908755


In [14]:
# Ajustamos el modelo usando solamente las variables cuantitativas aplicando regularización
#¿Hace falta normalizar los features antes aplicar regularización en este caso? ¿Qué unidades tienen los features?
from sklearn.linear_model import RidgeCV

# Definimos el rango de de búsqueda del hiperparametro explicitamente
lm_ridge = RidgeCV(alphas=[0.1, 1, 10], normalize=True)

model_2 = lm_ridge.fit(X_train, y_train)

print('Score model_2:', model_2.score(X_test, y_test))

# ¿Mejoraron los resultados?

Score model_2: 0.282347085837903
