In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
#from scipy import stats
#import matplotlib.pyplot as plt
#import seaborn as sns
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
#from sklearn.preprocessing import PolynomialFeatures, StandardScaler
#from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Importar pickle con el dataset preprocesado
df = pd.read_pickle('DatasetFinalParaEntrenar.pkl')

### Fit & Transform

In [3]:
# Separar Variables por tipo
categoricals = ['property_type','place_name','state_name','zona']
descriptions = ['texto']
numericals = ['surface_total_in_m2', 'surface_covered_in_m2', 'floor', 'rooms', 'expenses','lat','lon']
dummies = ['subte','colectivo','balcon','tren','porton','financia', 'parrilla','pileta', 'solarium', 'lavadero', 'estacionamiento', 'no_apto_credito','apto_credito', 'amplioliving', 'cocheras', 'frente', 'contrafrente','seguridad', 'amenities', 'SUM', 'espaciosverdes','jacuzzi','estrenar','aptoprofesional','pozo','categoria','reciclado','luminoso','acondicionado','quincho','escalera']
target = ['price_usd_per_m2']

In [4]:
# Separar Train-Test

Xdf = df[np.concatenate([
        categoricals,
        descriptions,
        numericals,
        dummies
])]
Ydf = df[target]

X_trainDf, X_testDf, y_trainDf, y_testDf = train_test_split(Xdf, Ydf, test_size=0.30)

In [5]:
# FIT (función) - Vectorizar categóricas, Vectorizar texto en Description y Normalizar continuas
vectorizer = TfidfVectorizer(min_df=5)
enc = DictVectorizer()
clf = Lasso(0.1, random_state=241,fit_intercept=True)
#lf = LassoCV(alphas=np.linspace(0.01,3, 10), cv=10, fit_intercept=True,n_jobs=-1)
normalizer = preprocessing.Normalizer()
poly = PolynomialFeatures(degree = 1, include_bias = False)

def fit_model(X_train, Y_train):
    texts = X_train[descriptions]
    texts = texts.applymap(lambda x: x.lower())
    texts = texts.replace('[^a-zA-Z0-9]', ' ', regex = True)

    X_tfidf = vectorizer.fit_transform(texts[descriptions[0]])
    
    X_train_categ = enc.fit_transform(X_train[categoricals].to_dict('records'))
    
    X_numericas = poly.fit_transform(normalizer.fit_transform(X_train[numericals]))
    
    X_dummies = X_train[dummies]

    X = hstack([X_tfidf, X_train_categ, X_numericas, X_dummies]) # regualirzar las numericas luego de splitear
    
    # train model on data
    clf.fit(X, Y_train)
    return X, Y_train

X__train, y__train = fit_model(X_trainDf, y_trainDf)

  y = column_or_1d(y, warn=True)


In [6]:
clf.alpha_

0.01

In [7]:
# TRANSFORM (función)
def transform_model(X_train):
    texts = X_train[descriptions]
    texts = texts.applymap(lambda x: x.lower())
    texts = texts.replace('[^a-zA-Z0-9]', ' ', regex = True)

    X_tfidf = vectorizer.transform(texts[descriptions[0]])
    
    X_train_categ = enc.transform(X_train[categoricals].to_dict('records'))
    
    X_numericas = poly.transform(normalizer.transform(X_train[numericals]))
    
    X_dummies = X_train[dummies]

    X = hstack([X_tfidf, X_train_categ, X_numericas, X_dummies])
    
    return X
X__test = transform_model(X_testDf)
X__test

<26910x21903 sparse matrix of type '<class 'numpy.float64'>'
	with 2151709 stored elements in COOrdinate format>

In [8]:
# Métricas
from sklearn import metrics
print ('Score de Entrenamiento: ',clf.score(X__train, y__train))
print ('Score de Test: ',clf.score(X__test, y_testDf))
print ('MAE: ', metrics.mean_absolute_error(y_testDf, clf.predict(X__test)))
print ('MSE: ', metrics.mean_squared_error(y_testDf, clf.predict(X__test)))
print ('RMSE: ', np.sqrt(metrics.mean_squared_error(y_testDf, clf.predict(X__test))))
print ('R2: ', metrics.r2_score(y_testDf, clf.predict(X__test)))

Score de Entrenamiento:  0.7966510082676138
Score de Test:  0.7347774279703913
MAE:  317.56599023775436
MSE:  223679.65825126864
RMSE:  472.9478388271466
R2:  0.7347774279703913


In [9]:
# Cross Validation CV=10
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X__test, y_testDf, cv=10)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.70773861, 0.70327   , 0.71180002, 0.68116877, 0.63150358,
       0.70530484, 0.67466508, 0.67986619, 0.711272  , 0.71044896])

In [19]:
# Ejemplo
print(clf.predict(X__test)[100])
print(y_testDf.iloc[100,0])

2165.8836119334965
2714.285714285714


### Crear dataset incluyendo predicciones y Ejericio Portafolio

In [None]:
# Reunir sets de entrenamiento y test
X_all, Y_all = fit_model(Xdf, Ydf)
# Calcular predicciones
valores = clf.predict(X_all)

  y = column_or_1d(y, warn=True)


In [None]:
# Volver a fittear el modelo ahora con todos los datos.
valores = clf.predict(X_all)

In [None]:
diferencia = Ydf - valores
diferencia.columns = ['diferencia_m2']
diferencia

diferencia.loc[:,'prediccion_m2'] = valores
diferencia.loc[:,'porcentaje_m2'] = Ydf / valores

dfR = df.join(diferencia)

dfR.loc[:,'prediccion_valor'] = dfR.prediccion_m2 * dfR.surface_total_in_m2

dfR = dfR[dfR.prediccion_m2 > 0]


portafolio = dfR.sample(n=100,random_state=100)

capital = portafolio.price_aprox_usd.sum()
print(capital)

dfR.sort_values(by='porcentaje_m2', ascending=True, inplace=True)
casas_compradas = pd.DataFrame([], columns=dfR.columns)
i = 0
ahorro = 0
while capital > 0:
    casa = dfR.iloc[i,:]
    valorCasa = casa.price_aprox_usd
    ahorro += casa.prediccion_valor - casa.price_aprox_usd
    casas_compradas = casas_compradas.append(casa)
    capital -= valorCasa
    i += 1
    
display(casas_compradas)

# Exportar casos a comprar a CSV
casas_compradas.to_csv('casas_compradas_lasso.csv')

print('Ahorro total: ', np.round(ahorro, 2))

In [None]:
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})

plt.subplots(figsize=(16,10))

sns.scatterplot(
    x="prediccion_m2", y="price_usd_per_m2", data=dfR, linewidth=0, alpha=0.3
)
sns.scatterplot(
    x="prediccion_m2", y="prediccion_m2", data=dfR, linewidth=0
)
sns.scatterplot(
    x="prediccion_m2", y="price_usd_per_m2", data=casas_compradas, linewidth=0, alpha=0.5
)

plt.show()

plt.subplots(figsize=(16,10))
sns.scatterplot(
        x="surface_total_in_m2", y="price_usd_per_m2",
        data=dfR,
        s=10, linewidth=0, alpha=1
)
sns.scatterplot(
        x="surface_total_in_m2", y="prediccion_m2",
        data=dfR,
        s=10, linewidth=0, alpha=0.05,
)
sns.scatterplot(
    x="surface_total_in_m2", y="price_usd_per_m2", data=casas_compradas, linewidth=0, alpha=0.5, s=30
)
plt.show()