In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

# Importo librerias de Scikit Learn
Puntualmente nos interesa importar el modelo de Regresion Lineal, la medida de error cuadratico medio, y el divisor de datos entre train y test.

In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.model_selection import GridSearchCV

# Importo dataset de Airbnb 
Desde Airbnb obtenemos el siguiente dataset para entrenar nuestros modelos de regresion http://insideairbnb.com/get-the-data.html (dataset "listings.csv" de London). 

Objetivo: Vamos a querer predecir el precio dadas ciertas features.

In [None]:
pwd

In [3]:
# importo el dataset de Airbnb London, lo guardamos en el dataframe "london".
london = pd.read_csv(r'clusterai_clase04_regresion_dataset_airbnb_london.csv', delimiter=',', parse_dates = True)

In [4]:
# observo la cantidad de renglones y columnas del dataset importado
np.shape(london)

(75213, 16)

In [5]:
# exploro las columnas que tiene el dataframe importado
london.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [6]:
#visualizo los primeros 3 renglones de mi dataset
# Predecir el precio de cada lugar en función de las features del dataset
london.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9554,"Cozy, 3 minutes to Piccadilly Line",31655,Guy,,Haringey,51.587767,-0.105666,Private room,35,1,131,2018-08-03,1.71,4,262
1,11076,The Sanctuary,40471,Rosa,,Ealing,51.515645,-0.314508,Private room,70,2,2,2016-11-23,0.07,6,62
2,13913,Holiday London DB Room Let-on going,54730,Alina,,Islington,51.568017,-0.111208,Private room,45,1,14,2018-06-17,0.14,2,364


In [7]:
# cuento cuantos NaNs tengo por columna
london.isnull().sum()

id                                    0
name                                 31
host_id                               0
host_name                            13
neighbourhood_group               75213
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       20353
reviews_per_month                 20357
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [8]:
# elimino las rows que tienen NaNs en la columna "reviews_per_month" , que otras opciones existen?
london = london.dropna(subset = ["reviews_per_month"])

In [9]:
# reviso como queda mi dataframe luego de eliminar los renglones con NaNs en review per month
london.shape

(54856, 16)

## Visualización rápida de las features de interes con Pairplot (Seaborn)

In [None]:
sns.pairplot(london[["price", "minimum_nights", "number_of_reviews", "reviews_per_month", "availability_365"]])
plt.show()

## Elimino con .drop las columnas que no son de interes

In [10]:
london = london.drop(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'last_review', 'latitude', 'longitude'], axis=1)
#Que features que eliminamos podrian llegar a ser utiles?

Link util -->
https://medium.com/@khadijamahanga/using-latitude-and-longitude-data-in-my-machine-learning-problem-541e2651e08c

In [11]:
london.head(3)

Unnamed: 0,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Haringey,Private room,35,1,131,1.71,4,262
1,Ealing,Private room,70,2,2,0.07,6,62
2,Islington,Private room,45,1,14,0.14,2,364


## Reviso si los valores máximos de cada feature se condicen con el contexto

Con que objetivo queremos ver esta informacion?

In [12]:
np.max(london)

# y, veo medio raro el precio, digo "los saco"

neighbourhood                     Westminster
room_type                         Shared room
price                                   10000
minimum_nights                           1000
number_of_reviews                         536
reviews_per_month                       15.56
calculated_host_listings_count           1034
availability_365                          365
dtype: object

## Obtengo los percentiles 97 de price y Minimum nights para filtrar outliers de estas features

In [13]:
# Hacer las assumptions (hipótesis) con fundamentos.
# Acá es que los valores alejados del percentil 97 son outliers, me van a ensuciar el dataset

price_q97 = london.price.quantile(0.975)
print("el cuantil 0.97 de la feature 'price' es = " + str(price_q97))

el cuantil 0.97 de la feature 'price' es = 300.0


In [14]:
min_nights_q97 = london.minimum_nights.quantile(0.975)
print("el cuantil 0.97 de la feature 'minimum_nights' es = " + str(min_nights_q97))

el cuantil 0.97 de la feature 'minimum_nights' es = 14.0


## Outlier Filtering: Filtro mi dataset por los percentiles calculados en el paso anterior bajo las features correspondientes

In [15]:
# aca filtro y conservo los valores que cumplen las dos condiciones al mismo tiempo
london_filt = london.loc[(london.price < price_q97) & (london.minimum_nights < min_nights_q97)]

In [16]:
london_filt.shape

(51827, 8)

In [17]:
london_filt.head(3)

Unnamed: 0,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Haringey,Private room,35,1,131,1.71,4,262
1,Ealing,Private room,70,2,2,0.07,6,62
2,Islington,Private room,45,1,14,0.14,2,364


## Feature Engineering: Genero variables dummies para las features categoricas (Neighbourhood y Room Type)

In [18]:
# dummies para la feature "neighbourhood"
neighs_dummie = pd.get_dummies(london_filt.neighbourhood)
neighs_dummie.head(3)

# Me va a generar una columna por cada valor de neighbor

Unnamed: 0,Barking and Dagenham,Barnet,Bexley,Brent,Bromley,Camden,City of London,Croydon,Ealing,Enfield,...,Merton,Newham,Redbridge,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,Westminster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
neighs_dummie.shape

(51827, 33)

In [20]:
# Ejemplo de dummies para la feature "room_type"
room_dummie = pd.get_dummies(london_filt.room_type)
room_dummie.head(3)

Unnamed: 0,Entire home/apt,Private room,Shared room
0,0,1,0
1,0,1,0
2,0,1,0


In [21]:
room_dummie.shape

(51827, 3)

## Agrego las nuevas variables dummies creadas al dataframe de trabajo 'london_filt'

In [22]:
london_filt = london_filt.join([neighs_dummie, room_dummie])

In [23]:
london_filt.head(3)

Unnamed: 0,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Barking and Dagenham,Barnet,...,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,Westminster,Entire home/apt,Private room,Shared room
0,Haringey,Private room,35,1,131,1.71,4,262,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Ealing,Private room,70,2,2,0.07,6,62,0,0,...,0,0,0,0,0,0,0,0,1,0
2,Islington,Private room,45,1,14,0.14,2,364,0,0,...,0,0,0,0,0,0,0,0,1,0


In [24]:
print("La cantidad de features nuevas agregadas a dataframe son = " + str(np.shape(neighs_dummie)[1]+np.shape(room_dummie)[1]))

La cantidad de features nuevas agregadas a dataframe son = 36


## Creamos nuestra variable dependiente (label - etiqueta) y

In [25]:
y = np.array(london_filt[["price"]])

## Creamos nuestra variable Independiente X quitando las features que no deseamos

In [26]:
x = london_filt.drop(['price', 'neighbourhood','room_type'], axis=1)

# Agarro las x por exclusión, vecindario y tipo de room las tengo en dummies

In [27]:
np.shape(x)

(51827, 41)

## Divido el dataset entre Train y Test

In [28]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.30, random_state=42)

In [29]:
xtrain.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Barking and Dagenham,Barnet,Bexley,Brent,Bromley,...,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,Westminster,Entire home/apt,Private room,Shared room
58162,2,21,4.2,3,2,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
41596,2,5,0.44,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
48432,1,73,7.79,6,306,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
59321,1,23,6.0,6,7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
24224,2,29,1.12,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [30]:
xtrain.shape

(36278, 41)

In [31]:
# Step 4: auto scaling train- set (mean = 0, std = 1)
scaler = preprocessing.StandardScaler().fit(xtrain)
scaler

StandardScaler()

In [32]:
# auto scalo mis muestras de train utilizando el scaler fiteado con el xtrain
xtrain_scal = scaler.transform(xtrain)  

In [33]:
# auto scalo mis muestras de test utilizando el scaler fiteado con el xtrain
xtest_scal = scaler.transform(xtest)  

# No puedo escalar todo junto porque sino estaría asumiendo que la X train
# y la X test tienen la misma distribución implícita

# Creo un modelo de Regresión Lineal

In [34]:
# 1) creo un modelo generico de regresion lineal
# Agarro el LR porque es rápido, como para tener una base, y después construir sobre eso
lr = LinearRegression()

In [35]:
# 2) Ajusto el modelo de regresion lineal utilizando el set de train, tanto las features X como las samples Y
lr.fit(xtrain_scal, ytrain)

LinearRegression()

In [36]:
# 3) Obtengo las predicciones que realiza mi modelo con las muestras de test, sin mostrarle las labels (Ytest)
# las predicciones las guardo en el vector "ypred"
ypred = lr.predict(xtest_scal)

In [37]:
np.shape(ypred)

(15549, 1)

In [38]:
# calculo el error de mi modelo con las muestras de train = error de train
np.sqrt(mean_squared_error(ytest, ypred))

38.9873071133408

In [39]:
mean_squared_error(ytest, ypred)

1520.010115949954

In [40]:
from sklearn.metrics import mean_absolute_error

In [41]:
mean_absolute_error(ytest, ypred)

27.61527440186465

![Inner join](https://i.imgflip.com/1vgxmu.jpg)

## Armar tres modelos de regresion usando:

    -KNN Regression
    Utilizar un GridSearchCV: como base podes utilizar los siguientes parametros
    parameters_k = np.arange(20,31,5)
    parameters_knn = [{'n_neighbors': parameters_k}]
    
    https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html?highlight=knn
    
    -SVR
    Utilizar un GridSearchCV: como base podes utilizar los siguientes parametros:
    ​parameters_svr_rbf = [{'kernel':['rbf'] , 'C': [1,100],'gamma': [0.1,0.5] }]
    
    -Random Forest regressor
    Utilizar un GridSearchCV: como base podes utilizar los siguientes parametros:
    'n_estimators': [100, 200, 300, 1000]
    
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor.set_params

### Comparar el R2, RMSE, MAE

Competencia

Mayor R2 = { }

Menor RMSE = { }

Menor MAE = { }

## KNN regression

In [42]:
knn=KNeighborsRegressor()

In [43]:
knn.fit(xtrain,ytrain)

KNeighborsRegressor()

In [44]:
parameters_k = np.arange(1,30,2)
parameters_knn = [{'n_neighbors': parameters_k}]

In [45]:
clasif=GridSearchCV(knn,param_grid=parameters_knn,refit=True,
                  cv=5, verbose=3, n_jobs=3)

In [46]:
clasif.fit(xtrain_scal, ytrain)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  5.6min
[Parallel(n_jobs=3)]: Done  75 out of  75 | elapsed: 16.1min finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=3,
             param_grid=[{'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29])}],
             verbose=3)

In [47]:
print(clasif.best_estimator_,)
print(clasif.best_params_, "\n")
print(clasif.best_score_, "\n")

KNeighborsRegressor(n_neighbors=29)
{'n_neighbors': 29} 

0.5175310741732008 



In [48]:
prediction = clasif.best_estimator_.predict(xtest_scal)
pred_r2 = r2_score(y_true=ytest, y_pred=prediction)
pred_mse = mean_squared_error(y_true=ytest, y_pred=prediction)
pred_mae = mean_absolute_error(y_true=ytest, y_pred=prediction)

In [49]:
results_df = pd.DataFrame(columns=['Model','R2','MSE','MAE'])

In [50]:
results_df = results_df.append({'Model':'KNN reg',
                                'R2':pred_r2,
                                'MSE':pred_mse,
                                'MAE':pred_mae},ignore_index=True)

In [51]:
results_df

Unnamed: 0,Model,R2,MSE,MAE
0,KNN reg,0.522321,1487.631309,26.792669


## SVR

In [52]:
estimador = SVR()

parameters_svr_rbf = [{'kernel':['linear'] , 
                       'C': [0,100],
                       'gamma': [0.1,0.5] }]

clasif2=GridSearchCV(estimador, param_grid=parameters_svr_rbf, refit=True, cv=5)
clasif2.fit(xtrain_scal,ytrain)

  return f(**kwargs)
Traceback (most recent call last):
  File "D:\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda\lib\site-packages\sklearn\svm\_base.py", line 217, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "D:\Anaconda\lib\site-packages\sklearn\svm\_base.py", line 268, in _dense_fit
    self._probB, self.fit_status_ = libsvm.fit(
  File "sklearn\svm\_libsvm.pyx", line 191, in sklearn.svm._libsvm.fit
ValueError: C <= 0

  return f(**kwargs)
Traceback (most recent call last):
  File "D:\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda\lib\site-packages\sklearn\svm\_base.py", line 217, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "D:\Anaconda\lib\site-packages\sklearn\svm\_base.py

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


KeyboardInterrupt: 

In [None]:
print(clasif2.best_estimator_, "\n")
print(clasif2.best_params_, "\n")
print(clasif2.best_score_, "\n")

In [None]:
prediction2 = clasif2.best_estimator_.predict(xtest_scal)
svr_r2 = r2_score(y_true=ytest, y_pred=prediction2)
svr_mse = mean_squared_error(y_true=ytest, y_pred=prediction2)
svr_mae = mean_absolute_error(y_true=ytest, y_pred=prediction2)

In [None]:
results_df = results_df.append({'Model':'SVR reg',
                                'R2':svr_r2,
                                'MSE':svr_mse,
                                'MAE':svr_mae},ignore_index=True)

In [None]:
results_df