In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing

df = pd.read_pickle('dataset.pkl')
df.head(5)
df['surface_covered_in_m2'].fillna(df['surface_total_in_m2'], inplace=True)


# Fill NaNs
df["surface_covered_in_m2"] = np.where(df["surface_covered_in_m2"].isnull(),df["surface_total_in_m2"],df["surface_covered_in_m2"])
df["floor"] = np.where(df["floor"].isnull(),0,df["floor"])
df["rooms"] = np.where(df["rooms"].isnull(),1,df["rooms"])
df["expenses"] = np.where(df["expenses"].isnull(),0,df["expenses"])


pti = df
pti.drop(['geonames_id', 'lat', 'lon', 'properati_url', 'zona'], axis=1, inplace=True)
pti.dropna(how='any', inplace=True)
display(pti.isna().sum())
pti.head()

property_type            0
place_name               0
state_name               0
price_aprox_usd          0
surface_total_in_m2      0
surface_covered_in_m2    0
price_usd_per_m2         0
floor                    0
rooms                    0
expenses                 0
description              0
title                    0
dtype: int64

Unnamed: 0,property_type,place_name,state_name,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,floor,rooms,expenses,description,title
0,PH,Mataderos,Capital Federal,62000.0,55.0,40.0,1127.272727,0.0,1.0,0.0,"2 AMBIENTES TIPO CASA PLANTA BAJA POR PASILLO,...",2 AMB TIPO CASA SIN EXPENSAS EN PB
2,apartment,Mataderos,Capital Federal,72000.0,55.0,55.0,1309.090909,0.0,1.0,0.0,2 AMBIENTES 3ER PISO LATERAL LIVING COMEDOR AM...,2 AMB 3ER PISO CON ASCENSOR APTO CREDITO
4,apartment,Centro,Buenos Aires Costa Atlántica,64000.0,35.0,35.0,1828.571429,0.0,1.0,0.0,DEPARTAMENTO CON FANTÁSTICA ILUMINACIÓN NATURA...,DEPTO 2 AMB AL CONTRAFRENTE ZONA CENTRO/PLAZA ...
5,house,Gualeguaychú,Entre Ríos,29724.34828,53.0,53.0,560.83676,0.0,1.0,0.0,"Casa en el perímetro del barrio 338, ubicada e...","Casa Barrio 338. Sobre calle 3 de caballería, ..."
6,PH,Munro,Bs.As. G.B.A. Zona Norte,130000.0,106.0,78.0,1226.415094,0.0,1.0,0.0,MUY BUEN PH AL FRENTE CON ENTRADA INDEPENDIENT...,"MUY BUEN PH AL FRENTE DOS DORMITORIOS , PATIO,..."


In [2]:
for x in range(2):
    print(df.sample(frac=0.1, replace=True).iloc[x].description + '\n')

Departamento de 2 ambientes en las Torres Les Bruyeres.  Living comedor con balcón, dormitorio con placard , baño completo con bañera, cocina con barra y muebles bajo y sobre mesada,  NO ES APTO CRÉDITO. El complejo posee: SUM, piscina, gimnasio, sauna, ducha escocesa, sala de masajes y seguridad las 24hs. Calefacción por radiadores. IMPECABLEOpción  de adquirir  1 cochera (u$s 15.000)Para consultas telefónicas tenga este código a mano: PAP441320

Casa desarrollada en una planta con amplio garage. Living-comedor. 3 dormitorios, baño completo. Cocina-comedor diario con muebles bajos y artefacto comun. Dependencia de servicio. Lavadero independiente. Patio con parrilla y cuarto de herramientas. Ideal inversor o constructor. "Los datos , medidas y antigüedad son estimativos y suministrados por el propietario".Código de propiedad: DHO322610Horacio Igarzabal CMCPSI Mat Nº 3154 DIC PROPIEDADES S.A. - 



In [87]:
categoricals = ['property_type','place_name','state_name']
descriptions = ['description']
numericals = ['surface_total_in_m2', 'surface_covered_in_m2', 'floor', 'rooms', 'expenses',]
target = ['price_usd_per_m2']

Xdf = pti[np.concatenate([
        categoricals,
        descriptions,
        numericals
])]
Ydf = pti[target]

X_trainDf, X_testDf, y_trainDf, y_testDf = train_test_split(Xdf, Ydf, test_size=0.30)
X_trainDf.head()

Unnamed: 0,property_type,place_name,state_name,description,surface_total_in_m2,surface_covered_in_m2,floor,rooms,expenses
9865,apartment,Nordelta,Bs.As. G.B.A. Zona Norte,CON RENTA!!! Excelente departamento monoambien...,40.0,40.0,3.0,1.0,0.0
46019,house,Concepción del Uruguay,Entre Ríos,"Excelente propiedad, consta de 1 casa y 2 terr...",765.0,175.0,0.0,4.0,0.0
31852,house,San Miguel,Bs.As. G.B.A. Zona Norte,Venta de casa americana 4 ambientes en San Mig...,991.0,991.0,0.0,3.0,0.0
39616,house,Parque Leloir,Bs.As. G.B.A. Zona Oeste,"Venta de Casa 4 AMBIENTES en Parque Leloir, It...",1050.0,1050.0,0.0,1.0,0.0
95348,house,Martínez,Bs.As. G.B.A. Zona Norte,Chalet a reciclar en excelente ubicacion y ent...,117.0,117.0,0.0,1.0,0.0


In [88]:
vectorizer = TfidfVectorizer(min_df=5)
enc = DictVectorizer()
clf = Ridge(alpha=2.5, random_state=241)
# clf = RidgeCV(alphas=np.linspace(0.01,10, 5), cv=5)
normalizer = preprocessing.Normalizer()

def fit_model(X_train, Y_train):
    texts = X_train[descriptions]
    texts = texts.applymap(lambda x: x.lower())
    texts = texts.replace('[^a-zA-Z0-9]', ' ', regex = True)

    X_tfidf = vectorizer.fit_transform(texts[descriptions[0]])
    
    X_train_categ = enc.fit_transform(X_train[categoricals].to_dict('records'))
    
    X_numericas = normalizer.fit_transform(X_train[numericals])

    X = hstack([X_tfidf, X_train_categ, X_numericas]) # regualirzar las numericas luego de splitear
#     X = hstack([X_train_categ, X_numericas]) # regualirzar las numericas luego de splitear

    
    # train model on data
    clf.fit(X, Y_train)
    return X, Y_train

X__train, y__train = fit_model(X_trainDf, y_trainDf)

In [89]:
def transform_model(X_train):
    texts = X_train[descriptions]
    texts = texts.applymap(lambda x: x.lower())
    texts = texts.replace('[^a-zA-Z0-9]', ' ', regex = True)

    X_tfidf = vectorizer.transform(texts[descriptions[0]])
    
    X_train_categ = enc.transform(X_train[categoricals].to_dict('records'))
    
    X_numericas = normalizer.transform(X_train[numericals])

    X = hstack([X_tfidf, X_train_categ, X_numericas])
#     X = hstack([X_train_categ, X_numericas])
    
    return X
X__test = transform_model(X_testDf)
X__test

<26916x21444 sparse matrix of type '<class 'numpy.float64'>'
	with 1922750 stored elements in COOrdinate format>

In [90]:
clf.score(X__train, y__train)
from sklearn.metrics import mean_squared_error as rmse
from sklearn.metrics import r2_score

np.sqrt(rmse(y_testDf, clf.predict(X__test)))

481.45907039989413

In [91]:
clf.score(X__test, y_testDf)

0.7324124997776109

In [92]:
print(clf.predict(X__test)[0][0])
print(y_testDf.iloc[0,0])

1709.0876843552016
1833.333333333333


In [63]:
clf.alpha_

AttributeError: 'Ridge' object has no attribute 'alpha_'

In [66]:
pti['surface_total_in_m2'].values.reshape(-1,1).shape

(89720, 1)

In [72]:
import re
def searchReg(exp):
    return (df['title'].str.contains(exp, case=False, regex=True) | df['description'].str.contains(exp, case=False, regex=True))
def extractCat(exp):
    found = searchReg(exp).astype(int)
    return found
def test_addDmmy(termns):
    for i in termns:
        catName = 'has_'+i[0]
        if(catName == False):
            catName = 'has_'+exp
        extracted = extractCat(i[0])
        print('extracting '+catName+':', extracted.sum() )
def addDmmy(termns):
    for i in termns:
        catName = 'has_'+i[0]
        if(catName == False):
            catName = 'has_'+exp
        extracted = extractCat(i[0])
        print('extracting '+catName+':', extracted.sum() )
        df[catName] = extracted
# addDmmy([
#     ['subte'],
#     ['balcon'],
#     ['tren'],
#     ['port[oó]n'],
#     ['(?:financiaci[oó]n)|(?:financiamiento)'],
#     ['parr?ill?a', 'has_parrilla'],
#     ['(?:pileta)|(?:piscina)', 'has_pileta'],
#     ['solarium'],
#     ['lavadero'],
#     ['estacionamiento'],
#     ['NO (?:ES )?APTO CR[EÉ]DITO', 'no_apto_credito'],
#     ['estacionamiento'],
#     ['amplio living'],
#     ['cocheras?', 'has_cocheras'],
#     [' frente[ .,]', 'is_frente'],
#     ['contrafrente', 'is_contrafrente'],
#     ['(?:seguridad)|(?:control)', 'has_seguridad'],
#     ['(?:sin)|(?:no hay) expensas', 'no_expensas']
# ])
df.head(5)
test_addDmmy([['filos']])

extracting has_filos: 143


In [75]:
df["surface_covered_in_m2"] = np.where(df["surface_covered_in_m2"].isnull(),df["surface_total_in_m2"],df["surface_covered_in_m2"])
df["floor"] = np.where(df["floor"].isnull(),0,df["floor"])
df["rooms"] = np.where(df["rooms"].isnull(),1,df["rooms"])
# df["expenses"] = np.where(df["expenses"].isnull(),0,df["expenses"])




X = df.drop(['geonames_id','lat','lon','price_aprox_usd','title','description','properati_url',\
         'property_type','place_name','state_name','zona', 'expenses'],axis=1)

print(X.isna().any())

X = PolynomialFeatures(2,include_bias=False,interaction_only=False).fit_transform(X)

X = pd.concat([ X,
           pd.get_dummies(df['property_type'],prefix='prop_type',drop_first=True),
           pd.get_dummies(df['state_name'],prefix='state_name',drop_first=True),
           pd.get_dummies(df['zona'],prefix='zona',drop_first=True)
    ], axis=1)

# X = PolynomialFeatures(2,include_bias=False,interaction_only=False).fit_transform(X)
# X = pd.concat([X,
#            pd.get_dummies(df['property_type'],prefix='prop_type',drop_first=True),
# #            pd.get_dummies(df['place_name'],prefix='place_name',drop_first=True),
#            pd.get_dummies(df['state_name'],prefix='state_name',drop_first=True),
#            pd.get_dummies(df['zona'],prefix='zona',drop_first=True)\
#           ])


# df_y = df['price_usd_per_m2']
# X = X.drop(['price_usd_per_m2'],axis=1)

surface_total_in_m2      False
surface_covered_in_m2    False
price_usd_per_m2         False
floor                    False
rooms                    False
dtype: bool


TypeError: cannot concatenate object of type "<class 'numpy.ndarray'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid

In [45]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(df_x, df_y,random_state=1)

from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model.fit(Xtrain, ytrain)
print(model.score(Xtrain, ytrain))

ymodel = model.predict(Xtest)

from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
print(r2_score(ytest,ymodel))
print(explained_variance_score(ytest,ymodel))

0.48922960819531885
0.4826518286711198
0.4827072368361335


In [25]:
print(len(df))

# df.expenses = df.expenses.fillna(value=0)

cols = ['surface_total_in_m2', 'place_name', 'price_usd_per_m2', 'property_type', 'zona', 'rooms', 'state_name']

print(len(df[cols].dropna()))

data = df[cols].dropna()
data.head(2)

89721
56308


Unnamed: 0,surface_total_in_m2,place_name,price_usd_per_m2,property_type,zona,rooms,state_name
10,1514.0,Córdoba,46.235139,house,Interior,4.0,Córdoba
13,50.0,Palermo Soho,2234.0,apartment,Capital & GBA,1.0,Capital Federal


In [68]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=4, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))


from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df.place_name)
  
y = df.loc[:,['surface_total_in_m2', 'rooms']]

y = pd.concat( [y, pd.get_dummies(df[['property_type', 'state_name', 'place_name']], drop_first=True)] , axis=1)

x = df['price_usd_per_m2'].values.reshape(-1,1)

y.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,surface_total_in_m2,rooms,property_type_apartment,property_type_house,property_type_store,state_name_Bs.As. G.B.A. Zona Oeste,state_name_Bs.As. G.B.A. Zona Sur,state_name_Buenos Aires Costa Atlántica,state_name_Buenos Aires Interior,state_name_Capital Federal,...,place_name_Virrey del Pino,place_name_Virreyes,place_name_Vistalba,place_name_Wenceslao Escalante,place_name_Wilde,place_name_William Morris,place_name_Yerba Buena,place_name_Zelaya,place_name_Zárate,place_name_coordenadas 34.255511
0,55.0,,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,55.0,,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,35.0,,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,53.0,,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,106.0,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=42)


reg = LinearRegression()
reg.fit(x_train, y_train)

print(reg.score(x_train, y_train))


list(zip(y.columns, reg.coef_))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [70]:
from sklearn.model_selection import validation_curve
grados_a_testear = np.arange(0, 6)
train_score, val_score = validation_curve(PolynomialRegression(), x, y, 'polynomialfeatures__degree'
                                          , grados_a_testear, cv=7) #cv= validacion cruzada, cuanta cantidad quiero

print(train_score[:3])
print(val_score[:3])

plt.plot(grados_a_testear, np.mean(train_score, axis=1), color='blue', label='training score')
plt.plot(grados_a_testear, np.mean(val_score, axis=1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').