In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
#from scipy import stats
#import matplotlib.pyplot as plt
#import seaborn as sns
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
#from sklearn.preprocessing import PolynomialFeatures, StandardScaler
#from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing

In [2]:
# Importar pickle con el dataset preprocesado
df = pd.read_pickle('DatasetFinalParaEntrenar.pkl')

### Fit & Transform

In [3]:
# Separar Variables por tipo
categoricals = ['property_type','place_name','state_name','zona']
descriptions = ['texto']
numericals = ['surface_total_in_m2', 'surface_covered_in_m2', 'floor', 'rooms', 'expenses','lat','lon']
dummies = ['subte','colectivo','balcon','tren','porton','financia', 'parrilla','pileta', 'solarium', 'lavadero', 'estacionamiento', 'no_apto_credito','apto_credito', 'amplioliving', 'cocheras', 'frente', 'contrafrente','seguridad', 'amenities', 'SUM', 'espaciosverdes','jacuzzi','estrenar','aptoprofesional','pozo','categoria','reciclado','luminoso','acondicionado','quincho','escalera']
target = ['price_usd_per_m2']

In [4]:
# Separar Train-Test

Xdf = df[np.concatenate([
        categoricals,
        descriptions,
        numericals,
        dummies
])]
Ydf = df[target]

X_trainDf, X_testDf, y_trainDf, y_testDf = train_test_split(Xdf, Ydf, test_size=0.30)

In [5]:
# FIT (función) - Vectorizar categóricas, Vectorizar texto en Description y Normalizar continuas
from sklearn.linear_model import ElasticNetCV
vectorizer = TfidfVectorizer(min_df=5)
enc = DictVectorizer()
clf = Ridge(0.8255102040816327, random_state=241,fit_intercept=True)
#clf = RidgeCV(alphas=np.linspace(0.01,10, 50), cv=10, fit_intercept=True)
normalizer = preprocessing.Normalizer()

def fit_model(X_train, Y_train):
    texts = X_train[descriptions]
    texts = texts.applymap(lambda x: x.lower())
    texts = texts.replace('[^a-zA-Z0-9]', ' ', regex = True)

    X_tfidf = vectorizer.fit_transform(texts[descriptions[0]])
    
    X_train_categ = enc.fit_transform(X_train[categoricals].to_dict('records'))
    
    X_numericas = normalizer.fit_transform(X_train[numericals])
    
    X_dummies = X_train[dummies]

    X = hstack([X_tfidf, X_train_categ, X_numericas, X_dummies]) # regualirzar las numericas luego de splitear
    
    # train model on data
    clf.fit(X, Y_train)
    return X, Y_train

X__train, y__train = fit_model(X_trainDf, y_trainDf)

In [6]:
# TRANSFORM (función)
def transform_model(X_train):
    texts = X_train[descriptions]
    texts = texts.applymap(lambda x: x.lower())
    texts = texts.replace('[^a-zA-Z0-9]', ' ', regex = True)

    X_tfidf = vectorizer.transform(texts[descriptions[0]])
    
    X_train_categ = enc.transform(X_train[categoricals].to_dict('records'))
    
    X_numericas = normalizer.transform(X_train[numericals])
    
    X_dummies = X_train[dummies]

    X = hstack([X_tfidf, X_train_categ, X_numericas, X_dummies])
    
    return X
X__test = transform_model(X_testDf)
X__test

<26910x21984 sparse matrix of type '<class 'numpy.float64'>'
	with 2184244 stored elements in COOrdinate format>

In [7]:
# Métricas
from sklearn import metrics
print ('Score de Entrenamiento: ',clf.score(X__train, y__train))
print ('Score de Test: ',clf.score(X__test, y_testDf))
print ('MAE: ', metrics.mean_absolute_error(y_testDf, clf.predict(X__test)))
print ('MSE: ', metrics.mean_squared_error(y_testDf, clf.predict(X__test)))
print ('RMSE: ', np.sqrt(metrics.mean_squared_error(y_testDf, clf.predict(X__test))))
print ('R2: ', metrics.r2_score(y_testDf, clf.predict(X__test)))

Score de Entrenamiento:  0.8128122239611268
Score de Test:  0.7454322977694718
MAE:  317.6676198344248
MSE:  223624.2421260136
RMSE:  472.88924932378575
R2:  0.7454322977694718


In [8]:
# Cross Validation CV=10
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X__test, y_testDf, cv=10)

array([0.72146959, 0.6975723 , 0.7546028 , 0.71588976, 0.70141731,
       0.76253329, 0.65348178, 0.71472856, 0.73529096, 0.68553995])

In [9]:
# Ejemplo
print(clf.predict(X__test)[100][0])
print(y_testDf.iloc[100,0])

1176.1948282136955
1219.5121951219512


In [10]:
from joblib import dump
dump(clf, 'model_1.joblib')

['model_1.joblib']