### Importar Librerías

In [270]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### Importar el dataset

In [271]:
data = pd.read_csv('Train.csv', header = 0, skiprows=[2376])
data.shape

(36194, 18)

In [272]:
data.head()

Unnamed: 0,id,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,lng,lat,city,day_type
0,1,210.365854,Entire home/apt,False,False,2,False,0,1,9,91,0,0.541022,0.427919,-9.13395,38.71038,lisbon,weekday
1,2,176.181721,Private room,False,True,2,False,1,0,9,89,1,3.032839,0.343492,2.14165,41.37011,barcelona,weekday
2,3,142.05414,Private room,False,True,2,True,0,0,10,98,1,11.909712,6.710909,0.00475,51.44025,london,weekend
3,4,428.744524,Entire home/apt,False,False,4,False,0,0,9,94,0,4.15839,0.14557,2.30179,48.87297,paris,weekend
4,5,220.279802,Entire home/apt,False,False,6,True,0,1,10,93,2,1.30219,0.369475,23.72468,37.96746,athens,weekend


In [273]:
data.isna().sum()

id                            0
realSum                       0
room_type                     0
room_shared                   0
room_private                  0
person_capacity               0
host_is_superhost             0
multi                         0
biz                           0
cleanliness_rating            0
guest_satisfaction_overall    0
bedrooms                      0
dist                          0
metro_dist                    0
lng                           0
lat                           0
city                          0
day_type                      0
dtype: int64

In [274]:
data.dtypes

id                              int64
realSum                       float64
room_type                      object
room_shared                      bool
room_private                     bool
person_capacity                 int64
host_is_superhost                bool
multi                           int64
biz                             int64
cleanliness_rating              int64
guest_satisfaction_overall      int64
bedrooms                        int64
dist                          float64
metro_dist                    float64
lng                           float64
lat                           float64
city                           object
day_type                       object
dtype: object

In [275]:
data.describe()

Unnamed: 0,id,realSum,person_capacity,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,lng,lat
count,36194.0,36194.0,36194.0,36194.0,36194.0,36194.0,36194.0,36194.0,36194.0,36194.0,36194.0,36194.0
mean,18098.434381,279.386846,3.159391,0.293612,0.34898,9.389761,92.614218,1.157595,3.183128,0.676998,7.434832,45.65939
std,10448.558359,328.042369,1.294476,0.455423,0.476654,0.954591,8.917921,0.631141,2.383785,0.848039,9.810969,5.250253
min,1.0,39.009259,2.0,0.0,0.0,2.0,20.0,0.0,0.015045,0.00322,-9.22599,37.953
25%,9050.25,148.6452,2.0,0.0,0.0,9.0,90.0,1.0,1.452918,0.247775,-0.07154,41.39883
50%,18098.5,211.230343,3.0,0.0,0.0,10.0,95.0,1.0,2.60793,0.412522,4.87419,47.506315
75%,27146.75,319.05196,4.0,1.0,1.0,10.0,99.0,1.0,4.251491,0.732341,13.53301,51.469975
max,36195.0,16445.61469,6.0,1.0,1.0,10.0,100.0,10.0,22.617458,13.314115,23.78602,52.64141


### Separar el dataset

In [276]:
y = data.loc[:,'realSum']
data.drop(['realSum'],axis=1,inplace=True)
data.drop(['id'],axis=1,inplace=True)
# data.drop(['room_private'],axis=1,inplace=True)
# data.drop(['city'],axis = 1, inplace=True)
# data.drop(['lng'],axis=1,inplace=True)
# data.drop(['lat'],axis = 1, inplace=True)
# data.drop(['dist'],axis=1,inplace=True)
# data.drop(['metro_dist'],axis = 1, inplace=True)

In [277]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(data, y, test_size=0.3, random_state=20)

### Pipeline 

#### Variables Numericas

In [278]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


num_pipeline = Pipeline([
                            #('imputer', SimpleImputer(strategy="median")),
                            ('std_scaler', StandardScaler()),
                        ])

In [279]:
df_num_tr = num_pipeline.fit_transform(X_train.select_dtypes(include='number'))
df_num_tr[0,:]

array([-0.89433219, -0.64546835,  1.36647269, -0.40940374,  0.04198122,
       -0.24700926, -0.79763954, -0.32299744, -1.69119915, -1.31987981])

#### Variables categoricas

In [280]:
cat_pipeline = Pipeline([
                        #('imputer', SimpleImputer(strategy="most_frequent")),
                        ('oh_enc', OneHotEncoder())
                        ])

In [281]:
# test categorical pipeline
# --------------------------
df_cat_tr = cat_pipeline.fit_transform(data.select_dtypes(exclude='number'))

# review output from pipeline
df_cat_tr.toarray()[0,:]

array([1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 1., 0.])

#### Pipeline Completo

In [282]:
from sklearn.compose import ColumnTransformer

num_attribs = data.select_dtypes(include='number').columns
cat_attribs = data.select_dtypes(exclude='number').columns

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [283]:
full_pipeline.fit(X_train)
X_train_processed = full_pipeline.transform(X_train)
#X_train_processed = full_pipeline.fit_transform(X_train)

print(X_train_processed.shape)

X_train_processed[0,:]

(25335, 31)


array([-0.89433219, -0.64546835,  1.36647269, -0.40940374,  0.04198122,
       -0.24700926, -0.79763954, -0.32299744, -1.69119915, -1.31987981,
        0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ])

In [284]:
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

X = sm.add_constant(X_train_processed)

simple_model = sm.OLS(y_train, X)

simple_result = simple_model.fit();

# create instance
#forest_clf = RandomForestClassifier(random_state=42)

#forest_clf.fit(X_train_processed[:6000], y_train[:6000])

In [285]:
print(simple_result.summary())

                            OLS Regression Results                            
Dep. Variable:                realSum   R-squared:                       0.254
Model:                            OLS   Adj. R-squared:                  0.254
Method:                 Least Squares   F-statistic:                     375.7
Date:                Sat, 01 Apr 2023   Prob (F-statistic):               0.00
Time:                        00:22:02   Log-Likelihood:            -1.7757e+05
No. Observations:               25335   AIC:                         3.552e+05
Df Residuals:                   25311   BIC:                         3.554e+05
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.708e+12   1.42e+14     -0.019      0.9

In [286]:
data_test = pd.read_csv('Test.csv')

In [288]:
ids_solution = data_test.id
data_test.drop('id',axis=1,inplace=True)

In [290]:
full_pipeline.fit(data_test)
X_test_processed = full_pipeline.fit_transform(data_test)
#X_train_processed = full_pipeline.fit_transform(X_train)

print(X_test_processed.shape)

X_test_processed[0,:]

(15513, 31)


array([-0.89218643, -0.63302634, -0.73873903,  0.63563945,  0.59260232,
       -0.26103253,  0.82991213, -0.61290559, -0.52392141,  0.6046628 ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        0.        ])

In [291]:
#preds_solution = simple_model.predict(X_test_processed)
preds_solution = simple_result.predict(sm.add_constant(X_test_processed))

In [295]:
solution_array = []

for i in range(0, len(ids_solution)):
    solution_array1 = [ids_solution[i],preds_solution[i]]
    solution_array.append(solution_array1)

In [296]:
solution = pd.DataFrame(solution_array, columns = ['id','realSum'])
solution.head(3)

Unnamed: 0,id,realSum
0,36196,377.504028
1,36197,456.759888
2,36198,713.254181


In [297]:
solution.to_csv('solutions.csv', index=False)