# Template de análisis y entrenamiento de modelos

In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import math
import re
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeClassifierCV, LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
# Download link for the CSV:
# https://drive.google.com/file/d/15Ofxvl8CbGrq2BjwmE7GjRCMR_YixKb4/view
data_location = "./properati.csv"

# `data` keeps the raw table untouched; `df` is the working copy that gets cleaned.
data = pd.read_csv(data_location, sep=",")
# Copy the frame that is already in memory instead of parsing the same file twice.
df = data.copy()

# Discard the columns that are not used anywhere in the analysis below.
df.drop(['operation', 'Unnamed: 0','geonames_id','lat-lon','lat','lon','properati_url','image_thumbnail', 'price_aprox_local_currency', "expenses", "floor"], axis=1, inplace=True)
df.dtypes

## Cantidad de nulos iniciales en el dataframe

In [None]:
# Count the missing values in each remaining column.
df.isnull().sum()

## Inicio de Etapa de Búsqueda de Datos
### Búsqueda de M2 en description

In [None]:
# Pull a surface figure (e.g. " 45 m2", " 120mts") out of the free-text description.
# Raw string: "\s" and "\d" are invalid escape sequences in a plain literal.
m2_pattern = r"\s(?P<metros>\d{0,3}?[.]?\d*)\s?(?P<sufijo>m2|M2|metros|mts|m²)"
m2_regex = re.compile(m2_pattern)
# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the NaN singleton, which is
# fragile and relied on the `np.NaN` alias removed in NumPy 2.0.
m2_match = df.description.apply(lambda x: m2_regex.search(x) if isinstance(x, str) else np.nan)
m2_match_mask = m2_match.notnull()
# Every part of the "metros" group is optional, so it can be an empty string;
# those values are cleaned when M2 is converted to a number later on.
df.loc[m2_match_mask, "M2"] = m2_match[m2_match_mask].apply(lambda x: x.group("metros"))

### Búsqueda de ambientes en description

In [None]:
# Pull a room count (e.g. " 3 amb", " 2AMB") out of the free-text description.
# Raw string: "\s" and "\d" are invalid escape sequences in a plain literal.
rooms_pattern = r"\s(?P<ambientes>\d\d?)(\s?)(?P<sufijo>AMB|amb|Amb)"
rooms_regex = re.compile(rooms_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias, which
# was removed in NumPy 2.0.
rooms_match = df.description.apply(lambda x: rooms_regex.search(x) if isinstance(x, str) else np.nan)
rooms_match_mask = rooms_match.notnull()
# The captured digits are stored as text; they are cast to float further down.
df.loc[rooms_match_mask, "Ambientes"] = rooms_match[rooms_match_mask].apply(lambda x: x.group("ambientes"))

### Ubicación

In [None]:
# Fill missing place names with "Tigre".
# NOTE(review): presumably the rows without a place_name were checked to be in
# Tigre — confirm against the raw data.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection, which may not write through to `df` under pandas
# copy-on-write.
df["place_name"] = df["place_name"].fillna(value="Tigre")

In [None]:
# Work on a copy of the hierarchical place path ("|level0|level1|...|").
df['place_with_parent_names_clean'] = df['place_with_parent_names']

# Number of levels per row: the pipe count minus one.
total_place = df['place_with_parent_names_clean'].apply(lambda path: path.count('|') - 1)
# One pre-sized list per level, filled row by row in the next cell.
n_rows = total_place.shape[0]
dic_ret = {'place_' + str(level): [None] * n_rows for level in range(total_place.max())}

In [None]:
# Spread each path's non-empty segments over the place_0, place_1, ... lists.
# The index label doubles as the list position.
for row, raw_path in df['place_with_parent_names_clean'].items():
    segments = [part for part in raw_path.split('|') if len(part) > 0]
    for level, part in enumerate(segments):
        dic_ret['place_' + str(level)][row] = part

In [None]:
# Turn the per-level lists into columns and attach them to df by index.
new_columns = pd.DataFrame.from_dict(dic_ret)
df = df.join(new_columns)

In [None]:
# Look at ten random rows to see how the split levels line up with the
# existing location columns.
df[['country_name','state_name','place_name','place_0','place_1','place_2','place_3','place_4']].sample(10)

In [None]:
# Copy the path levels into columns with descriptive names.
for target, source in zip(['Provincia', 'Ciudad', 'Departamento', 'Barrio'],
                          ['place_1', 'place_2', 'place_3', 'place_4']):
    df[target] = df[source]

In [None]:
# The positional level columns are no longer needed once they have been renamed.
df = df.drop(columns=["place_0", "place_1", "place_2", "place_3", "place_4"])

In [None]:
# Missing lower levels get a "_" placeholder so the concatenation below does
# not turn into NaN.
for level in ('Departamento', 'Barrio'):
    df[level] = df[level].fillna("_")
# Localidad is the three levels glued together with no separator.
df['Localidad'] = df['Ciudad'] + df['Departamento'] + df['Barrio']
df[['Localidad', 'Ciudad', 'Departamento', 'Barrio']].head()

### Ambientes

In [None]:
# Room counts scraped from the description are text; make them numeric.
df["Ambientes"] = df["Ambientes"].astype(float)
# Prefer the dataset's own room count and fall back to the scraped one.
df["rooms_final"] = df["rooms"].fillna(df["Ambientes"])
print('Cantidad rooms null en df inicial:', df["rooms"].isnull().sum())
print('Cantidad rooms null en df trabajado:', df["rooms_final"].isnull().sum())

#### Limpieza de Outliers

In [None]:
# Box plot of room counts per property type, using only rows with a known count.
maskNotNull_Rooms = df['rooms_final'].notnull()
rooms_known = df.loc[maskNotNull_Rooms]
f, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x=rooms_known['property_type'], y=rooms_known['rooms_final'])
plt.title("Distribución de cuartos según propiedad", fontsize=15)
plt.xlabel("Tipo de propiedad", fontsize=11)
plt.ylabel("Cuartos", fontsize=11)

In [None]:
# Label every known room count as upper outlier, lower outlier or regular value
# with the 1.5 * IQR rule, computed separately for each property type.
df['outliers_rooms'] = None
for i in df['property_type'].unique():
    mask_type = df['property_type'] == i
    rooms_of_type = df.loc[mask_type, 'rooms_final']
    # Each quartile is computed once per type and reused for both thresholds.
    q1_rooms = rooms_of_type.quantile(0.25)
    q3_rooms = rooms_of_type.quantile(0.75)
    rangoInterquartil_rooms = q3_rooms - q1_rooms
    umbral_lower_rooms = q1_rooms - rangoInterquartil_rooms*1.5
    umbral_upper_rooms = q3_rooms + rangoInterquartil_rooms*1.5

    mask_rows = maskNotNull_Rooms & mask_type
    df.loc[mask_rows, 'outliers_rooms'] = df.loc[mask_rows, 'rooms_final'].apply(
        lambda x: "Outlier superior" if x > umbral_upper_rooms else (
            "Outlier inferior" if x < umbral_lower_rooms else
            "No es outlier"))

outliers_rooms = df['outliers_rooms']
df = df.drop(columns = ['outliers_rooms'])
# Share (%) of each label among rows with a known room count. This is not the
# last expression of the cell, so a notebook will not render it.
pd.DataFrame(round((outliers_rooms.value_counts() / outliers_rooms[maskNotNull_Rooms].shape[0])*100,1))

# Rows that were never labelled (no room count) also compare unequal here;
# they are already NaN, so blanking them again changes nothing.
maskOutliers = outliers_rooms != "No es outlier"
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
df.loc[maskOutliers, 'rooms_final'] = np.nan

In [None]:
# Same box plot again, drawn after the outliers were blanked out.
rooms_rows = df.loc[maskNotNull_Rooms]
f, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x=rooms_rows['property_type'], y=rooms_rows['rooms_final'])
plt.title("Distribución de cuartos según propiedad", fontsize=15)
plt.xlabel("Tipo de propiedad", fontsize=11)
plt.ylabel("Cuartos", fontsize=11)

#### Imputación de ambientes por el promedio por propiedad

In [None]:
# Mean room count per property type, rounded down to a whole number of rooms.
rooms_property_mean = (
    df.groupby(by="property_type")["rooms_final"]
    .mean()
    .apply(math.floor)
    .to_frame(name="rooms_clean_mean")
)

In [None]:
# Attach each property type's floored mean room count as `rooms_clean_mean`.
# No `how` is given, so pandas' default inner join applies: rows whose
# property_type has no entry in rooms_property_mean would be dropped.
# NOTE(review): the merged frame presumably gets a fresh positional index —
# confirm that masks built before this cell (e.g. maskNotNull_Rooms) still align.
df = df.merge(rooms_property_mean, on="property_type")

In [None]:
# Rows that still have no room count after the outlier cleanup.
rooms_null = df["rooms_final"].isnull()

In [None]:
# Impute the missing room counts with the per-type mean, then drop the helper column.
df['rooms_final'] = df['rooms_final'].mask(rooms_null, df['rooms_clean_mean'])
df = df.drop(columns=['rooms_clean_mean'])

In [None]:
# Report how many room counts are still missing after the imputation.
print('Cantidad rooms null en df despues de la imputacion:', df["rooms_final"].isnull().sum())

## m2

In [None]:
# Convert the scraped surface text to numbers. Captures that are not a valid
# number (the regex can yield "" or a lone ".") become NaN instead of making
# astype(float) raise. The result is assigned back rather than mutating a
# column selection in place.
df["M2"] = pd.to_numeric(df["M2"], errors="coerce")

In [None]:
# Working copy of the total surface; the gaps are patched in the cells below.
df['surface_total_in_m2_clean'] = df['surface_total_in_m2']

In [None]:
# Covered / total surface ratio for rows where both figures are known,
# capped at 1 (covered surface cannot exceed the total).
maskNotNull_TotalAndCovered = df['surface_covered_in_m2'].notnull() & df['surface_total_in_m2'].notnull()
both_known = df.loc[maskNotNull_TotalAndCovered]
covered_ratio = both_known['surface_covered_in_m2'] / both_known['surface_total_in_m2']
df["covered_pct"] = covered_ratio.clip(upper=1)
# Average covered share per property type.
meanByProperty = df.groupby('property_type')['covered_pct'].mean().to_frame(name='covered_pct_mean')
print("Las casas son las que menos metros cuadrados tienen cubierto")
meanByProperty.sort_values(by='covered_pct_mean', ascending=False)

In [None]:
# Attach each property type's mean covered share as `covered_pct_mean`.
# No `how` is given, so pandas' default inner join applies: rows whose
# property_type has no entry in meanByProperty would be dropped.
# NOTE(review): the merged frame presumably gets a fresh positional index —
# confirm that masks built before this cell still align.
df = df.merge(meanByProperty, on='property_type')

In [None]:
# Where only the covered surface is known, estimate the total surface by
# dividing it by the property type's mean covered share.
maskFill = df['surface_covered_in_m2'].notnull() & df['surface_total_in_m2'].isnull()
rows_to_fill = df.loc[maskFill]
surface_total_parche = rows_to_fill['surface_covered_in_m2'] / rows_to_fill['covered_pct_mean']
df.loc[maskFill, 'surface_total_in_m2_clean'] = surface_total_parche

In [None]:
# Final surface: the cleaned total surface when it is known, otherwise the
# figure scraped from the description. This vectorized coalesce replaces three
# scratch columns summed row by row with apply(axis=1); the values are the same.
m2_coalesced = df['surface_total_in_m2_clean'].fillna(df['M2'])

# A surface of 0 is treated as missing. `np.nan` replaces the `np.NaN` alias
# removed in NumPy 2.0, and the result is assigned back instead of mutating a
# column selection in place.
df['m2_final'] = m2_coalesced.replace(to_replace=0, value=np.nan)

In [None]:
# Surfaces below 20 m2 are treated as missing. Series.mask writes NaN where the
# condition holds (NaN < 20 is False, so existing gaps are untouched) and does
# not depend on the `np.NaN` alias removed in NumPy 2.0.
df['m2_final'] = df['m2_final'].mask(df['m2_final'] < 20)

#### Limpieza de Outliers

In [None]:
# Box plot of surface per property type, using only rows with a known surface.
maskNotNull_m2 = df['m2_final'].notnull()
m2_known = df.loc[maskNotNull_m2]
f, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x=m2_known['property_type'], y=m2_known['m2_final'])
plt.title("Distribución de m2 según propiedad", fontsize=15)
plt.xlabel("Tipo de propiedad", fontsize=11)
plt.ylabel("m2", fontsize=11)

In [None]:
# Label every known surface as upper outlier, lower outlier or regular value
# with the 1.5 * IQR rule, computed separately for each property type.
df['outliers_m2'] = None
for i in df['property_type'].unique():
    mask_type = df['property_type'] == i
    m2_of_type = df.loc[mask_type, 'm2_final']
    # Each quartile is computed once per type and reused for both thresholds.
    q1_m2 = m2_of_type.quantile(0.25)
    q3_m2 = m2_of_type.quantile(0.75)
    rangoInterquartil_m2 = q3_m2 - q1_m2
    umbral_lower_m2 = q1_m2 - rangoInterquartil_m2*1.5
    umbral_upper_m2 = q3_m2 + rangoInterquartil_m2*1.5

    # Target and source must use the SAME surface mask. The original used the
    # rooms mask on the left-hand side, so rows with a valid surface but no
    # room count were never labelled and then had their surface blanked below.
    mask_rows = maskNotNull_m2 & mask_type
    df.loc[mask_rows, 'outliers_m2'] = df.loc[mask_rows, 'm2_final'].apply(
        lambda x: "Outlier superior" if x > umbral_upper_m2 else (
            "Outlier inferior" if x < umbral_lower_m2 else
            "No es outlier"))

outliers_m2 = df['outliers_m2']
df = df.drop(columns = ['outliers_m2'])
# Share (%) of each label among rows with a known surface. This is not the
# last expression of the cell, so a notebook will not render it.
pd.DataFrame(round((outliers_m2.value_counts() / outliers_m2[maskNotNull_m2].shape[0])*100,1))

# Unlabelled rows (no surface) also compare unequal here; they are already NaN.
maskOutliers = outliers_m2 != "No es outlier"
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
df.loc[maskOutliers, 'm2_final'] = np.nan

In [None]:
# Same box plot again, drawn after the surface outliers were blanked out.
m2_rows = df.loc[maskNotNull_m2]
f, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x=m2_rows['property_type'], y=m2_rows['m2_final'])
plt.title("Distribución de m2 según propiedad", fontsize=15)
plt.xlabel("Tipo de propiedad", fontsize=11)
plt.ylabel("m2", fontsize=11)

## Nuevas variables

In [None]:
# Binary flag: does the description mention a parking space?
# NOTE(review): the search is case-sensitive, so "Cochera" or "COCHERA" are
# not detected — confirm whether that is intended.
garage_pattern = r"(?P<garage>cochera|garage|estacionamiento)"
garage_regex = re.compile(garage_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias removed
# in NumPy 2.0.
garage_match = df.description.apply(lambda x: garage_regex.search(x) if isinstance(x, str) else np.nan)
garage_match_mask = garage_match.notnull()
# Build the 0/1 column in one assignment, as an integer like the other flags,
# instead of a partial assignment followed by an in-place fillna on a column
# selection.
df["Garage"] = garage_match_mask.astype(int)

In [None]:
# Binary flag: does the description mention a swimming pool (including common
# misspellings)?
pileta_pattern = r"(?P<pileta>pileta|piscina|picina|pisina)"
pileta_regex = re.compile(pileta_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias removed
# in NumPy 2.0.
pileta_match = df.description.apply(lambda x: pileta_regex.search(x) if isinstance(x, str) else np.nan)
pileta_match_mask = pileta_match.notnull()
# Build the integer 0/1 column in one assignment. The original in-place
# fillna plus `df.loc[:, col] = ...astype(int)` may leave the column as float.
df["Pileta"] = pileta_match_mask.astype(int)

In [None]:
# Binary flag: does the description mention a jacuzzi or sauna (including
# common misspellings)?
jacuzzi_pattern = r"(?P<jacuzzi>jacuzzi|sauna|jacuzi|jacusi)"
jacuzzi_regex = re.compile(jacuzzi_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias removed
# in NumPy 2.0.
jacuzzi_match = df.description.apply(lambda x: jacuzzi_regex.search(x) if isinstance(x, str) else np.nan)
jacuzzi_match_mask = jacuzzi_match.notnull()
# Build the integer 0/1 column in one assignment. The original in-place
# fillna plus `df.loc[:, col] = ...astype(int)` may leave the column as float.
df["Jacuzzi"] = jacuzzi_match_mask.astype(int)

In [None]:
# Binary flag: does the description mention a balcony?
balcon_pattern = r"(?P<balcon>balcon|balcones|balcón)"
balcon_regex = re.compile(balcon_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias removed
# in NumPy 2.0.
balcon_match = df.description.apply(lambda x: balcon_regex.search(x) if isinstance(x, str) else np.nan)
balcon_match_mask = balcon_match.notnull()
# Build the integer 0/1 column in one assignment. The original in-place
# fillna plus `df.loc[:, col] = ...astype(int)` may leave the column as float.
df["Balcon"] = balcon_match_mask.astype(int)

In [None]:
# Binary flag: does the description mention a gym (including a common
# misspelling)?
gym_pattern = r"(?P<gym>gym|gimnasio|gimnacio)"
gym_regex = re.compile(gym_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias removed
# in NumPy 2.0.
gym_match = df.description.apply(lambda x: gym_regex.search(x) if isinstance(x, str) else np.nan)
gym_match_mask = gym_match.notnull()
# Build the integer 0/1 column in one assignment. The original in-place
# fillna plus `df.loc[:, col] = ...astype(int)` may leave the column as float.
df["Gimnasio"] = gym_match_mask.astype(int)

In [None]:
# Binary flag: does the description mention security or surveillance features?
# Character classes are written as [aá] / [oó]; the original [a|á] / [o|ó]
# also accepted a literal "|" in that position.
seguridad_pattern = r"(?P<seguridad>seguridad|vigilancia|c[aá]maras|ojo de halc[oó]n|vigilador|garita|circuito cerrado|cctv|monitoreo)"
seguridad_regex = re.compile(seguridad_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias removed
# in NumPy 2.0.
seguridad_match = df.description.apply(lambda x: seguridad_regex.search(x) if isinstance(x, str) else np.nan)
seguridad_match_mask = seguridad_match.notnull()
# Build the integer 0/1 column in one assignment. The original in-place
# fillna plus `df.loc[:, col] = ...astype(int)` may leave the column as float.
df["Seguridad"] = seguridad_match_mask.astype(int)

In [None]:
# Binary flag: does the description mention a "quincho" (with common
# misspellings) or the stand-alone word "sum"?
# Raw string: "\s" is an invalid escape sequence in a plain literal.
quincho_pattern = r"(?P<quincho>quincho|kincho|qincho|(\ssum\s))"
quincho_regex = re.compile(quincho_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias removed
# in NumPy 2.0.
quincho_match = df.description.apply(lambda x: quincho_regex.search(x) if isinstance(x, str) else np.nan)
quincho_match_mask = quincho_match.notnull()
# Build the integer 0/1 column in one assignment. The original in-place
# fillna plus `df.loc[:, col] = ...astype(int)` may leave the column as float.
df["Quincho"] = quincho_match_mask.astype(int)

In [None]:
# Binary flag: is the property advertised as brand new?
estrenar_pattern = r"(?P<estrenar>(a estrenar)|(departamento nuevo))"
estrenar_regex = re.compile(estrenar_pattern)

# Only real strings are searched; missing descriptions become NaN ("no match").
# A type check replaces the identity test against the `np.NaN` alias removed
# in NumPy 2.0.
estrenar_match = df.description.apply(lambda x: estrenar_regex.search(x) if isinstance(x, str) else np.nan)
estrenar_match_mask = estrenar_match.notnull()
# Build the integer 0/1 column in one assignment. The original in-place
# fillna plus `df.loc[:, col] = ...astype(int)` may leave the column as float.
df["Estrenar"] = estrenar_match_mask.astype(int)

## Precio
#### Limpieza de Outliers

In [None]:
# Keep the listed price only for rows quoted in USD; every other row gets NaN.
maskUSD = df['currency'] == "USD"
df['price_aprox_usd_clean'] = df['price'].where(maskUSD)

In [None]:
# Box plot of USD price per property type, using only rows with a known price.
maskNotNull_price = df['price_aprox_usd_clean'].notnull()
price_known = df.loc[maskNotNull_price]
f, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x=price_known['property_type'], y=price_known['price_aprox_usd_clean'])
plt.title("Distribución de precios según propiedad", fontsize=15)
plt.xlabel("Tipo de propiedad", fontsize=11)
plt.ylabel("Precios", fontsize=11)

In [None]:
# Label every known USD price as upper outlier, lower outlier or regular value
# with the 1.5 * IQR rule, computed separately for each property type.
df['outliers_precio'] = None
for i in df['property_type'].unique():
    mask_type = df['property_type'] == i
    price_of_type = df.loc[mask_type, 'price_aprox_usd_clean']
    # Each quartile is computed once per type and reused for both thresholds.
    q1_precio = price_of_type.quantile(0.25)
    q3_precio = price_of_type.quantile(0.75)
    rangoInterquartil_precio = q3_precio - q1_precio
    umbral_lower_precio = q1_precio - rangoInterquartil_precio*1.5
    umbral_upper_precio = q3_precio + rangoInterquartil_precio*1.5

    mask_rows = maskNotNull_price & mask_type
    df.loc[mask_rows, 'outliers_precio'] = df.loc[mask_rows, 'price_aprox_usd_clean'].apply(
        lambda x: "Outlier superior" if x > umbral_upper_precio else (
            "Outlier inferior" if x < umbral_lower_precio else
            "No es outlier"))

outliers_precio = df['outliers_precio']
df = df.drop(columns = ['outliers_precio'])
# Share (%) of each label among rows with a known price. This is not the last
# expression of the cell, so a notebook will not render it.
pd.DataFrame(round((outliers_precio.value_counts() / outliers_precio[maskNotNull_price].shape[0])*100,1))

# Unlabelled rows (no USD price) also compare unequal here; they are already NaN.
maskOutliers = outliers_precio != "No es outlier"
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
df.loc[maskOutliers, 'price_aprox_usd_clean'] = np.nan

In [None]:
# Same box plot again, drawn after the price outliers were blanked out.
price_rows = df.loc[maskNotNull_price]
f, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x=price_rows['property_type'], y=price_rows['price_aprox_usd_clean'])
plt.title("Distribución de precios según propiedad", fontsize=15)
plt.xlabel("Tipo de propiedad", fontsize=11)
plt.ylabel("Precios", fontsize=11)

## Precio por m2

In [None]:
# USD price per square metre, rounded to whole dollars; rows missing either
# the price or the surface stay NaN.
maskNotNull = df['price_aprox_usd_clean'].notnull() & df['m2_final'].notnull()
price_per_m2 = df['price_aprox_usd_clean'].div(df['m2_final'])
df['price_usd_per_m2_final'] = price_per_m2.where(maskNotNull).round(0)

#### Limpieza de Outliers

In [None]:
# Box plot of USD price per m2 by property type.
maskNotNull_price_m2 = df['price_usd_per_m2_final'].notnull()
f, ax = plt.subplots()
f.set_figwidth(12)
f.set_figheight(7)
# Filter with the price-per-m2 mask defined above; the original reused the
# plain price mask, which also lets through rows that have no price per m2.
sns.boxplot(x=df.loc[maskNotNull_price_m2,'property_type'],y=df.loc[maskNotNull_price_m2,'price_usd_per_m2_final'])
plt.title("Distribución de precios según propiedad",fontsize=15)
plt.xlabel("Tipo de propiedad",fontsize=11)
plt.ylabel("Precios",fontsize=11)

In [None]:
# Label every known price per m2 as upper outlier, lower outlier or regular
# value with the 1.5 * IQR rule, computed separately for each property type.
df['outliers_precio_m2'] = None
for i in df['property_type'].unique():
    mask_type = df['property_type'] == i
    price_m2_of_type = df.loc[mask_type, 'price_usd_per_m2_final']
    # Each quartile is computed once per type and reused for both thresholds.
    q1_precio_m2 = price_m2_of_type.quantile(0.25)
    q3_precio_m2 = price_m2_of_type.quantile(0.75)
    rangoInterquartil_precio_m2 = q3_precio_m2 - q1_precio_m2
    umbral_lower_precio_m2 = q1_precio_m2 - rangoInterquartil_precio_m2*1.5
    umbral_upper_precio_m2 = q3_precio_m2 + rangoInterquartil_precio_m2*1.5

    mask_rows = maskNotNull_price_m2 & mask_type
    df.loc[mask_rows, 'outliers_precio_m2'] = df.loc[mask_rows, 'price_usd_per_m2_final'].apply(
        lambda x: "Outlier superior" if x > umbral_upper_precio_m2 else (
            "Outlier inferior" if x < umbral_lower_precio_m2 else
            "No es outlier"))

outliers_precio_m2 = df['outliers_precio_m2']
df = df.drop(columns = ['outliers_precio_m2'])
# Share (%) of each label among rows with a known price per m2. This is not
# the last expression of the cell, so a notebook will not render it.
pd.DataFrame(round((outliers_precio_m2.value_counts() / outliers_precio_m2[maskNotNull_price_m2].shape[0])*100,1))

# Unlabelled rows (no price per m2) also compare unequal here; they are already NaN.
maskOutliers = outliers_precio_m2 != "No es outlier"
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
df.loc[maskOutliers, 'price_usd_per_m2_final'] = np.nan

In [None]:
# Box plot of USD price per m2 by property type, drawn after the outliers
# were blanked out.
f, ax = plt.subplots()
f.set_figwidth(12)
f.set_figheight(7)
# Filter with the price-per-m2 mask rather than the plain price mask, so only
# rows that had a price per m2 are passed to the plot.
sns.boxplot(x=df.loc[maskNotNull_price_m2,'property_type'],y=df.loc[maskNotNull_price_m2,'price_usd_per_m2_final'])
plt.title("Distribución de precios según propiedad",fontsize=15)
plt.xlabel("Tipo de propiedad",fontsize=11)
plt.ylabel("Precios",fontsize=11)

## Desafío 2

In [None]:
# Raw inputs and intermediate helper columns that must not reach the model.
columns_to_drop = [
    'place_with_parent_names', 'country_name', 'state_name',
    'price', 'currency', 'price_aprox_usd',
    'surface_total_in_m2', 'surface_covered_in_m2', 'price_per_m2',
    'rooms', 'description', 'title', 'M2', 'Ambientes',
    'surface_total_in_m2_clean', 'covered_pct', 'covered_pct_mean',
    "place_with_parent_names_clean", "place_name",
    "Ciudad", "Departamento", "Barrio",
    "price_aprox_usd_clean", "price_usd_per_m2",
]
df = df.drop(columns=columns_to_drop)

## Cargamos los datos limpios

In [None]:
# Keep only rows that have a surface, a locality and the target (price per m2).
df = df.dropna(subset=["m2_final", "Localidad", "price_usd_per_m2_final"])

In [None]:
# (rows, columns) left for modelling.
df.shape

## Definimos nuestras variables endógenas (y) y exógenas (X).

In [None]:
# The target is the USD price per m2; every other column is a feature.
target_column = "price_usd_per_m2_final"
y = df[target_column]
X = df.drop(columns=[target_column])

## Separamos sets de entrenamiento y testeo
Recordemos que no tienen que tocar `X_test` ni `y_test` hasta que hayan terminado de buscar el mejor modelo posible.

In [None]:
# Shuffled train/test split with a fixed seed for reproducibility.
# No test_size is given, so the library default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True, random_state=10)

## Generamos las dummies

In [None]:
# One-hot encode the categorical features. The encoder is fitted on the
# training set only; handle_unknown='ignore' makes categories unseen at fit
# time encode as all zeros instead of raising.
# NOTE(review): recent scikit-learn releases renamed `sparse` to
# `sparse_output` — confirm against the installed version.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
dummies = ohe.fit_transform(X_train[["property_type", "Provincia", "Localidad"]])

In [None]:
# Flatten the per-feature category arrays into one list of dummy column names,
# in the same order as the encoder's output columns.
columns_dummies = [category for cat in ohe.categories_ for category in cat]

In [None]:
# Wrap the encoded array in a DataFrame that shares X_train's index so it can
# be joined back.
# NOTE(review): column names are the bare category values; a value that
# appears under two features (e.g. Provincia and Localidad) would presumably
# produce duplicate column names — confirm.
dummies = pd.DataFrame(dummies, columns=columns_dummies, index=X_train.index)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
X_train = X_train.join(dummies).drop(columns=["property_type", "Provincia", "Localidad"])

In [None]:
# Encode the test set with the encoder fitted on the training data
# (transform only, no re-fit).
dummies_test = ohe.transform(X_test[["property_type", "Provincia", "Localidad"]])

In [None]:
# Same column names as the training dummies, indexed like X_test so the join
# below lines up.
dummies_test = pd.DataFrame(dummies_test, columns=columns_dummies, index=X_test.index)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
X_test = X_test.join(dummies_test).drop(columns=["property_type", "Provincia", "Localidad"])

In [None]:
# Sanity check: train and test must have the same number of feature columns.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

## Entrenar un modelo súper sencillo como para ver dónde están parados
Recomiendo utilizar statsmodels ya que les va a dar algo extra de información respecto de los betas obtenidos

In [None]:
# Show the final training design matrix.
X_train

In [None]:
# Baseline: ordinary least squares on all training features plus an intercept
# column added by add_constant.
baseline = sm.OLS(y_train, sm.add_constant(X_train)).fit()
# Last expression of the cell: the regression summary.
baseline.summary()

In [None]:
# Fitted coefficients as a one-column frame named "beta".
coef_df = pd.DataFrame(baseline.params).rename(columns={0: "beta"})

In [None]:
# `shape` is a tuple attribute, not a method; calling it raised
# "TypeError: 'tuple' object is not callable".
coef_df.shape

In [None]:
#sns.scatterplot(x=df['cuadrado'],y=df["price_usd_per_m2_final"])

In [None]:
#sns.scatterplot(x=df['m2_final'],y=df["price_usd_per_m2_final"])

## LASSO

In [None]:
# Shuffled K-fold splitter with a fixed seed, shared by the cross-validated
# models below. n_splits is not given, so the library default applies.
kf = KFold(shuffle=True, random_state=10)

In [None]:
# Cross-validated Lasso over a fine grid of small penalties.
lasso_alphas = np.linspace(0.0001, 0.01, 100)
model_skl_LassoCV = linear_model.LassoCV(alphas=lasso_alphas, cv=kf, normalize=True).fit(X_train, y_train)
lasso_train_r2 = model_skl_LassoCV.score(X_train, y_train)
print("Alpha óptimo:", model_skl_LassoCV.alpha_)
print("R2 Lasso de entrenamiento:", lasso_train_r2.round(3))

In [None]:
# Lasso coefficients indexed by feature name.
betas = pd.DataFrame({'Betas': model_skl_LassoCV.coef_}, index=X_train.columns)

# Features whose coefficient was shrunk to exactly zero.
betas_0_mask = betas['Betas'].eq(0)
print("Variables igualadas a 0 por Lasso:", betas_0_mask.sum())

In [None]:
# Second Lasso run with a penalty grid ten times larger than the first one.
lasso_alphas = np.linspace(0.001, 0.1, 100)
model_skl_LassoCV = linear_model.LassoCV(alphas=lasso_alphas, cv=kf, normalize=True).fit(X_train, y_train)
lasso_train_r2 = model_skl_LassoCV.score(X_train, y_train)
print("Alpha óptimo:", model_skl_LassoCV.alpha_)
print("R2 Lasso de entrenamiento:", lasso_train_r2.round(3))

In [None]:
# Coefficients of the re-fitted Lasso, indexed by feature name.
betas = pd.DataFrame({'Betas': model_skl_LassoCV.coef_}, index=X_train.columns)

# Features whose coefficient was shrunk to exactly zero.
betas_0_mask = betas['Betas'].eq(0)
print("Variables igualadas a 0 por Lasso:", betas_0_mask.sum())

## RIDGE

In [None]:
# Cross-validated Ridge over the same penalty grid as the second Lasso run.
ridge_alphas = np.linspace(0.001, 0.1, 100)
model_skl_RidgeCV = linear_model.RidgeCV(alphas=ridge_alphas, cv=kf, normalize=True).fit(X_train, y_train)
ridge_train_r2 = model_skl_RidgeCV.score(X_train, y_train)
print("Alpha óptimo:", model_skl_RidgeCV.alpha_)
print("R2 Ridge de entrenamiento:", ridge_train_r2.round(3))

## PREDICCIONES

In [None]:
# Score the OLS baseline on the held-out test set.
prediction_sm = baseline.predict(sm.add_constant(X_test))
# The built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
print ('R2 Statmodels OLS:', round(r2_score(y_test, prediction_sm), 3))

In [None]:
# Predicted vs. actual price per m2, with the identity line as reference.
plt.figure(figsize=(8,8))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and keywords also work on older releases.
sns.scatterplot(x=y_test, y=prediction_sm)
sns.lineplot(x=[0,3500], y=[0,3500], color='black')
plt.title("Evaluación de predicciónes")
plt.ylabel("Y_pred")
plt.xlabel("Y_test")