In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns

# Importación de los datos

In [2]:
# Load the project dataset from a CSV file (path is relative to the working directory).
ruta_datos = "ProjectClean2.csv"
df = pd.read_csv(ruta_datos)

## Para evitar redundancia en el modelo, hemos decidido eliminar las columnas `latitude` y `longitude`.

In [5]:
# Drop the 'latitude' and 'longitude' columns.
# A single non-inplace drop with errors="ignore" keeps this cell re-runnable: executing
# it a second time no longer raises KeyError because the columns are already gone.
df = df.drop(columns=['latitude', 'longitude'], errors='ignore')

In [7]:
from sklearn.model_selection import train_test_split

# Split the frame into the feature matrix (every column except the target) and the target.
columna_objetivo = "price"
es_predictor = df.columns != columna_objetivo

X = df.loc[:, es_predictor]
y = df[columna_objetivo]

In [8]:
# Hold out 25% of the rows as a test set.
# random_state is fixed so the shuffle, and therefore every downstream fit and score,
# is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [9]:
# Sanity check: (rows, columns) of the training and test feature matrices, in that order.
tuple(parte.shape for parte in (X_train, X_test))

((11391, 33), (3798, 33))

In [10]:
# Sanity check: length of the test and training targets (note the order: test first).
tuple(parte.shape for parte in (y_test, y_train))

((3798,), (11391,))

In [23]:
# Names of the columns used as predictors on the right-hand side of the price formula.
# The order is preserved exactly; the line grouping is only for readability.
independent_vars = [
    # host_* columns
    'host_since',
    'host_location',
    'host_is_superhost',
    'host_identity_verified',
    'host_has_profile_pic',
    # listing location, type and size
    'neighbourhood',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    # stay length and availability
    'minimum_nights',
    'maximum_nights',
    'has_availability',
    'availability_30',
    'availability_60',
    # review columns
    'number_of_reviews',
    'last_review',
    'review_scores_rating',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    # remaining flags
    'instant_bookable',
    'bathroom_type',
    'has_license',
    'has_terrace',
    'has_wifi',
    'pet_allowed',
    'has_AC',
    'has_kitchen',
]

In [24]:
# Build and fit an OLS regression of price on every name in `independent_vars`
# (statsmodels formula API).
# NOTE(review): the model is fitted on the full `df`, so it also sees the rows that
# train_test_split held out as the test set — confirm this is intended for the
# significance screening.
formula_ols = 'price ~ ' + ' + '.join(independent_vars)
modelo = smf.ols(formula=formula_ols, data=df).fit()

# p-value of every fitted term.
p_valores = modelo.pvalues

# Terms significant at the 0.05 level. The filtered Series is stored (not the boolean
# mask) so this name always refers to the same kind of object, whichever cell ran last.
variables_significativas = p_valores[p_valores < 0.05]

In [25]:
# Terms whose p-value is strictly below the 0.05 threshold.
es_significativa = p_valores.lt(0.05)
variables_significativas = p_valores.loc[es_significativa]
variables_significativas

Intercept                                1.336092e-03
host_location[T.other]                   1.969965e-03
host_location[T.spain]                   1.008554e-03
neighbourhood[T.horta_guinardo]          1.806823e-02
neighbourhood[T.les_corts]               9.775952e-04
neighbourhood[T.sant_andreu]             2.780279e-03
neighbourhood[T.sarria_sant_gervasi]     7.787784e-05
room_type[T.private_room]                1.365762e-07
room_type[T.shared_room]                 9.514617e-04
instant_bookable[T.True]                 4.725528e-05
has_license[T.True]                      5.692489e-11
pet_allowed[T.True]                      2.161577e-02
has_AC[T.True]                           3.696982e-03
has_kitchen[T.True]                      5.839830e-06
accommodates                             1.650411e-70
bathrooms                               1.553889e-130
minimum_nights                           2.718831e-29
availability_30                          3.178370e-06
availability_60             

In [26]:
# Terms whose p-value is at or above the 0.05 threshold.
no_es_significativa = p_valores.ge(0.05)
variables_no_significativas = p_valores.loc[no_es_significativa]
variables_no_significativas

host_is_superhost[T.True]          0.087680
host_identity_verified[T.True]     0.953735
host_has_profile_pic[T.True]       0.319583
neighbourhood[T.eixample]          0.409411
neighbourhood[T.gracia]            0.182987
neighbourhood[T.nou_barris]        0.299850
neighbourhood[T.sant_marti]        0.099752
neighbourhood[T.sants_montjuic]    0.707648
room_type[T.hotel_room]            0.438115
has_availability[T.True]           0.775000
has_terrace[T.True]                0.195809
has_wifi[T.True]                   0.142826
host_since                         0.424264
bedrooms                           0.162327
beds                               0.628190
maximum_nights                     0.576935
number_of_reviews                  0.168614
review_scores_rating               0.678148
review_scores_checkin              0.696127
review_scores_communication        0.964076
review_scores_location             0.116928
review_scores_value                0.755656
reviews_per_month               

In [None]:
# Show the index labels of `col_relevantes`.
# NOTE(review): `col_relevantes` is not assigned in any cell visible in this notebook;
# unless it is defined elsewhere, this raises NameError on a fresh kernel — confirm its origin.
col_relevantes.index

In [None]:
# Show the dtype of `col_relevantes`.
# NOTE(review): `col_relevantes` is not assigned in any cell visible in this notebook —
# confirm where it is meant to come from.
col_relevantes.dtype

In [None]:
# Labels iterated by the VIF loop further down.
# NOTE(review): `col_relevantes` is not assigned in any cell visible in this notebook.
# The VIF loop uses each label to select a column of X_train, so the labels presumably
# need to be X_train column names — verify.
columnas = col_relevantes.index

In [None]:
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor (VIF) for each label in `columnas`: regress that column of
# X_train on all the other columns and compute 1 / (1 - R^2).
# NOTE(review): this needs every label in `columnas` to be a column of X_train and
# the remaining columns to be numeric — confirm before running.
vif_acum = {}  # label -> VIF; initialised here so the cell runs on a fresh kernel

for columna in columnas:

    # Dedicated local names so the global target `y` is not overwritten by a feature column.
    predictoras = X_train.loc[:, X_train.columns != columna]
    objetivo = X_train.loc[:, columna]

    lm = LinearRegression()
    lm.fit(predictoras, objetivo)
    r2 = lm.score(predictoras, objetivo)

    # R^2 == 1 means the column is a perfect linear combination of the others:
    # report an infinite VIF instead of dividing by zero.
    vif = float("inf") if r2 >= 1 else 1 / (1 - r2)
    vif_acum[columna] = vif

vif_acum

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Recursive feature elimination (backward selection) driven by a linear-kernel SVR,
# configured to keep 4 features.
# NOTE(review): X_train is passed unchanged; it appears to still contain string-valued
# columns (e.g. room_type), which the estimator cannot fit — confirm the encoding step.
estimator = SVR(kernel="linear")
selector = RFE(estimator=estimator, n_features_to_select=4).fit(X_train, y_train)

In [None]:
# Keep only the columns flagged by the fitted RFE selector, then fit a linear
# regression on them and score it on the test split.
mascara_seleccion = selector.support_
X_train_final = X_train.loc[:, mascara_seleccion]
print(X_train_final.columns)

lm = LinearRegression().fit(X_train_final, y_train)
X_test_final = X_test.loc[:, X_train_final.columns]
lm.score(X_test_final, y_test)

In [None]:
# Fit a linear regression on a hand-picked feature set and score it on the test split.
columnas_numericas = ['accommodates', 'bedrooms', 'beds', 'minimum_nights']


def _features_manuales(X_part):
    """Return the hand-picked columns plus a private-room indicator.

    If `X_part` already has a 'room_type_private_room' column it is used as is;
    otherwise a 0/1 indicator is derived from the raw 'room_type' column, so the
    selection no longer raises KeyError when the frame has not been dummy-encoded.
    """
    if 'room_type_private_room' in X_part.columns:
        return X_part.loc[:, columnas_numericas + ['room_type_private_room']]
    return X_part.loc[:, columnas_numericas].assign(
        room_type_private_room=(X_part['room_type'] == 'private_room').astype(int)
    )


X_train_final = _features_manuales(X_train)
X_test_final = _features_manuales(X_test)
print(X_train_final.columns)

lm = LinearRegression()
lm.fit(X_train_final, y_train)
lm.score(X_test_final, y_test)