In [None]:
import pandas as pd
from os import walk

In [None]:
# List the files that sit directly inside the dataset folder (sub-folders are
# not descended into).
path_df = '../datasets/original_files/'
# next(..., default) covers a missing folder: walk() then yields nothing and
# the list simply stays empty.
_, _, _top_level_files = next(walk(path_df), (path_df, [], []))
# walk() returns names in arbitrary, file-system dependent order. Sorting makes
# the row order of the combined frame, and therefore the seeded train/test
# split further down, reproducible across machines.
datasets_names = sorted(_top_level_files)
datasets_names

In [None]:
# Read every gzip-compressed CSV and stack them into one frame indexed by 'id'.
# The frames are collected in a list and concatenated once at the end:
# concatenating inside the loop re-copies all accumulated rows on every pass.
_frames = []
for name in datasets_names:
    _frame = pd.read_csv(path_df + name, compression='gzip')
    # drop=False keeps 'id' available as a regular column as well as the index.
    _frame = _frame.set_index('id', drop=False)
    print('adding', len(_frame), 'rows')
    _frames.append(_frame)

if not _frames:
    # With no input files the old sentinel (dataFrame = False) only failed
    # later with an unrelated TypeError; stop here with a clear message.
    raise FileNotFoundError('no dataset files found in ' + path_df)

dataFrame = pd.concat(_frames)
del _frames, _frame
print('total rows:', len(dataFrame))
dataFrame.head()

In [None]:
# Column overview: print each column name followed by its first non-null value
# (or the marker 'NOT_VALUES' when the column is entirely empty).
for column_name in dataFrame.columns:
    print(column_name)
    non_null = dataFrame[column_name].dropna().values
    sample = non_null[0] if len(non_null) > 0 else 'NOT_VALUES'
    print(sample)
    print('')

In [None]:
# Column names noted by the author (purpose not recorded — TODO confirm):
#   neighbourhood, neighbourhood_cleansed, neighbourhood_group_cleansed,
#   guests_included, license, is_business_travel_ready

# Candidate columns to remove, grouped by theme. Order is preserved.
_location_cols = ['market', 'street']
_url_and_host_id_cols = [
    'listing_url', 'scrape_id', 'thumbnail_url', 'medium_url', 'picture_url',
    'xl_picture_url', 'host_id', 'host_url',
]
_host_cols = ['host_name', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood']
_area_cols = [
    'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'zipcode',
    'smart_location', 'country_code',
]
_fee_cols = ['weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'extra_people']
_night_limit_cols = [
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
]
_calendar_cols = [
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
]
_misc_cols = ['requires_license', 'jurisdiction_names', 'host_location']

cols_to_drop = (
    _location_cols
    + _url_and_host_id_cols
    + _host_cols
    + _area_cols
    + _fee_cols
    + _night_limit_cols
    + _calendar_cols
    + _misc_cols
)

# The drop itself is currently disabled. Note that 'listing_url' is still read
# by the last cell of the notebook, so enabling the drop as-is would break it.
# [colName for colName in dataFrame.columns if colName not in cols_to_drop]
# dataFrame.drop(cols_to_drop, axis=1, inplace=True)

In [None]:
import numpy as np

In [None]:
# Parse the price text into integers, then drop zero-priced listings, listings
# with four reviews or fewer, and price outliers (|z-score| >= 3).
from scipy import stats

if not pd.api.types.is_numeric_dtype(dataFrame['price']):
    # Prices are presumably formatted like "$1,234.00" (the previous code
    # sliced off one leading and three trailing characters). Stripping the
    # symbols explicitly does not depend on character positions; the
    # float -> int cast discards the cents exactly as the slice did.
    # The dtype guard makes this cell safe to re-run after conversion.
    # Plain column assignment is used because `.loc[:, 'price'] = ...` writes
    # into the existing object-dtype column on recent pandas versions.
    # A missing price still raises here, as it did before.
    dataFrame['price'] = (
        dataFrame['price']
        .str.replace(r'[$,]', '', regex=True)
        .astype(float)
        .astype(int)
    )

dataFrame = dataFrame[dataFrame['price'] != 0]
dataFrame = dataFrame[dataFrame['number_of_reviews'] > 4]

# z-scores are computed on the rows that survived the two filters above, as a
# 1-D array so the result is an ordinary row mask.
_price_z = np.abs(stats.zscore(dataFrame['price'].to_numpy(dtype=float)))
dataFrame = dataFrame[_price_z < 3]

In [None]:
# Number of rows left after the price / review-count / outlier filters.
len(dataFrame)

In [None]:
#text pipeline steps
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the columns named at construction time.

    Parameters
    ----------
    feature_names : list
        Column labels to select from the frame passed to ``transform``.
    """

    def __init__(self, feature_names):
        # scikit-learn's get_params() / clone() / estimator repr look every
        # constructor argument up as an attribute with the *same* name.
        # Storing it under another name makes those calls raise
        # AttributeError on current scikit-learn versions.
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the configured columns."""
        return X[self.feature_names]
    
class TextTransformer(BaseEstimator, TransformerMixin):
    """Collapse the text columns of each row into one cleaned, lower-case string."""

    # Single-pass character mapping with the same effect as three consecutive
    # rounds of str.replace: "/", "-" and "'" become a space, "&" is padded
    # with spaces, and every other listed punctuation mark is deleted.
    _CHAR_TABLE = str.maketrans({
        **dict.fromkeys('?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~•' + '“”’', ''),
        **dict.fromkeys("/-'", ' '),
        '&': ' & ',
    })

    def __clean_text(self, x):
        """Apply the punctuation mapping to ``x`` and lower-case the result."""
        return x.translate(self._CHAR_TABLE).lower()

    def __parseTextCols(self, x):
        """Join the non-missing, non-repeated values of one row and clean them."""
        # dict keys keep first-seen order, so this drops repeats while
        # preserving the column order of the row.
        distinct_texts = dict.fromkeys(value for value in x if pd.notna(value))
        return self.__clean_text(' '.join(distinct_texts))

    def fit(self, X, y = None):
        """Stateless step; returns ``self``."""
        return self

    def transform (self, X, y = None):
        """Return one cleaned string per row of ``X``."""
        return X.apply(self.__parseTextCols, axis=1)
    
    
from sklearn.feature_extraction.text import TfidfVectorizer
class custom_Tfidf(TfidfVectorizer, TransformerMixin):
    """Wrapper that fits a ``TfidfVectorizer`` on a sample of the documents.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``TfidfVectorizer``.
    options : dict, optional
        Overrides for the class-level defaults in ``options``:
        ``'fitSample'`` is the fraction passed to ``X.sample(frac=...)`` when
        fitting; ``'randomState'`` (optional, default ``None``) is forwarded
        to the same call so that sub-sampling can be made repeatable.
    """

    # Defaults only. Each instance gets its own merged copy in __init__.
    options= {
        'fitSample': 1
    }

    def __init__(self, params, options = None):
        # Stored under its own name so get_params() / repr can find it.
        self.params = params
        self.vectorizer = TfidfVectorizer(**params)
        # Build a per-instance dict. Writing overrides into the class-level
        # dict (as before) leaked one instance's settings into all others.
        merged_options = dict(type(self).options)
        if options is not None:
            merged_options.update(options)
        self.options = merged_options

    def fit(self, X, y = None):
        """Fit the wrapped vectorizer on ``X.sample(frac=fitSample)``."""
        sample = X.sample(
            frac=self.options['fitSample'],
            random_state=self.options.get('randomState'),
        )
        self.vectorizer.fit(sample)
        return self

    def fit_transform(self, X, y = None):
        """Fit on ``X`` and return its TF-IDF matrix via the wrapped vectorizer."""
        # __init__ does not call TfidfVectorizer.__init__, so the inherited
        # fit_transform would not use the wrapped vectorizer configured above.
        return self.fit(X, y).transform(X)

    def transform(self, X, y = None):
        """Return the TF-IDF matrix of ``X`` from the wrapped vectorizer."""
        return self.vectorizer.transform(X)

In [None]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Free-text listing fields that are merged into a single document per row.
text_cols = [
    "name",
    "summary",
    "space",
    "description",
    "neighborhood_overview",
    "notes",
    "transit",
    "access",
    "interaction",
    "house_rules",
    "host_about",
]

from nltk.corpus import stopwords
stopwords_en = stopwords.words('english')

# Keyword arguments handed to the TF-IDF vectorizer.
textVectSettings = dict(
    stop_words=stopwords_en,
    max_df=0.95,
    min_df=.05,
    ngram_range=(1,2),
    max_features=300,
)

# Text pipeline steps: select the columns -> merge and clean -> vectorize.
_text_steps = [
    ('text_selector', FeatureSelector(text_cols)),
    ('text_transformer', TextTransformer()),
    ('text_vectorize', custom_Tfidf(textVectSettings, {'fitSample':1})),
]
text_pipeline = Pipeline(steps=_text_steps)

In [None]:
# Categorical columns that are one-hot encoded.
dummy_cols = [
    "instant_bookable",
    "is_business_travel_ready",
    "cancellation_policy",
    "require_guest_phone_verification",
    "require_guest_profile_picture",
    "host_response_time",
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "city",
    "state",
    "property_type",
    "room_type",
    "bed_type",
]

from sklearn.impute import SimpleImputer

# select the columns -> replace gaps with the literal 'missing' -> one-hot
# encode (handle_unknown='ignore' is passed for categories unseen at fit time).
_dummy_steps = [
    ('dummy_selector', FeatureSelector(dummy_cols)),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
dummy_pipeline = Pipeline(steps=_dummy_steps)

In [None]:
import numpy as np

class NumericalImputer(BaseEstimator, TransformerMixin):
    """Convert the selected columns to float and fill their missing values.

    Parameters
    ----------
    default_strategy : {'median', 'mode', 'mean'}, default 'median'
        Statistic learned per column in ``fit`` and used as the fill value.
        Any other value falls back to the median.

    Notes
    -----
    * ``host_response_rate`` / ``host_acceptance_rate`` have their ``%`` sign
      removed; entries that still cannot be parsed as numbers become NaN and
      are then imputed like any other missing value.
    * ``number_of_reviews_ltm`` and ``number_of_reviews`` are always filled
      with 0 and ``host_listings_count`` with 1, whatever the strategy.
    * +/- infinity is treated as missing.
    * Neither ``fit`` nor ``transform`` modifies the frame passed in.
    """

    # Columns stored as percentage text.
    _PERCENT_COLS = ('host_response_rate', 'host_acceptance_rate')
    # Columns with a fixed fill value that overrides the strategy.
    _FIXED_DEFAULTS = {
        'number_of_reviews_ltm': 0,
        'number_of_reviews': 0,
        'host_listings_count': 1,
    }

    def __init__( self, default_strategy = "median"):
        # Stored under the constructor argument's own name so that
        # scikit-learn's get_params() / clone() / repr can find it.
        self.default_strategy = default_strategy
        self._default_values = {}

    def _as_float_frame(self, X):
        """Return a float copy of ``X`` with percent text parsed and inf -> NaN."""
        frame = X.copy()
        for col in self._PERCENT_COLS:
            if col in frame.columns:
                # astype(str) lets this work whether the column is still text
                # or already numeric (e.g. an all-NaN column read as float).
                stripped = frame[col].astype(str).str.replace('%', '', regex=False)
                frame[col] = pd.to_numeric(stripped, errors='coerce')
        frame = frame.astype(float)
        return frame.replace([np.inf, -np.inf], np.nan)

    def _statistic(self, values):
        """Compute the configured statistic over the non-missing ``values``."""
        if self.default_strategy == 'mode':
            # NumPy has no mode function; pandas returns all modes sorted,
            # so the first one is used.
            modes = values.mode()
            return modes.iloc[0] if len(modes) else np.nan
        if self.default_strategy == 'mean':
            return values.mean()
        return values.median()

    def fit( self, X, y = None ):
        """Learn one fill value per column of ``X``; returns ``self``."""
        frame = self._as_float_frame(X)
        defaults = {}
        for col in frame.columns:
            if col in self._FIXED_DEFAULTS:
                defaults[col] = self._FIXED_DEFAULTS[col]
            else:
                defaults[col] = self._statistic(frame[col].dropna())
        self._default_values = defaults
        return self

    def transform(self, X, y = None):
        """Return a float copy of ``X`` with missing values filled.

        Raises
        ------
        KeyError
            If ``X`` has a column for which no fill value was learned.
        """
        frame = self._as_float_frame(X)
        unknown = [col for col in frame.columns if col not in self._default_values]
        if unknown:
            raise KeyError(unknown[0])
        # Filling through the frame (not a chained per-column inplace call)
        # guarantees the values are actually written under copy-on-write.
        return frame.fillna(value=self._default_values)

class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Append a ``<column>_log`` = log(1 + x) copy of every numeric column.

    Parameters
    ----------
    log_transform : bool, default True
        When False the input is returned unchanged.
    """

    def __init__( self, log_transform = True):
        # Stored under the constructor argument's own name so that
        # scikit-learn's get_params() / clone() / repr can find it.
        self.log_transform = log_transform

    def fit( self, X, y = None ):
        """Stateless step; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` followed by its log columns as a new DataFrame.

        The input frame is no longer modified in place. Values <= -1 still
        produce -inf / NaN, exactly as log(x + 1) did.
        """
        if not self.log_transform:
            return X
        # log1p(x) is log(1 + x) without the precision loss for small x.
        logged = np.log1p(X).add_suffix('_log')
        # Original columns first, then the *_log columns in the same order.
        # The result is a DataFrame, not a NumPy array.
        return pd.concat([X, logged], axis=1)
    
class NumericalAddFeatures(BaseEstimator, TransformerMixin):
    """Add bathroom-ratio features computed from existing columns.

    Parameters
    ----------
    bath_per_bed : bool, default True
        Add ``bath_per_bed`` = ``bathrooms`` / ``beds``.
    bath_per_bedroom : bool, default True
        Add ``bath_per_bedroom`` = ``bathrooms`` / ``bedrooms``.
    """

    def __init__( self, bath_per_bed = True, bath_per_bedroom = True ):
        # Stored under the constructor arguments' own names so that
        # scikit-learn's get_params() / clone() / repr can find them.
        self.bath_per_bed = bath_per_bed
        self.bath_per_bedroom = bath_per_bedroom

    def fit( self, X, y = None ):
        """Stateless step; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` with the enabled ratio columns added.

        The input frame is left untouched. As before, zero denominators are
        replaced by 1 in the returned ``bedrooms`` / ``beds`` columns
        themselves, not only inside the ratio.
        NOTE(review): confirm that overwriting 0 bedrooms/beds with 1 in the
        output features is intended.
        """
        out = X.copy()
        if self.bath_per_bedroom:
            out['bedrooms'] = out['bedrooms'].mask(out['bedrooms'] == 0, 1)
            out['bath_per_bedroom'] = out['bathrooms'] / out['bedrooms']
        if self.bath_per_bed:
            out['beds'] = out['beds'].mask(out['beds'] == 0, 1)
            out['bath_per_bed'] = out['bathrooms'] / out['beds']

        # Example of bucketing values into labelled ranges:
        #pd.cut(df.Age,bins=[0,2,17,65,99],labels=['Toddler/Baby','Child','Adult','Elderly'])

        # A DataFrame (not a NumPy array) is returned.
        return out

In [None]:
# Numeric columns fed to the numerical branch. The order below is the column
# order seen by the downstream steps.
continuos_cols = [
    "reviews_per_month",
    "host_response_rate",
    "host_acceptance_rate",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "number_of_reviews_ltm",
    "review_scores_rating",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "review_scores_accuracy",
    "minimum_nights",
    "maximum_nights",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "host_listings_count",
    "number_of_reviews",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "guests_included",
]

# select the columns -> impute (median) -> add log columns -> standardise.
_numerical_steps = [
    ('num_selector', FeatureSelector(continuos_cols)),
    ('num_imputer', NumericalImputer(default_strategy = 'median')),
    ('num_transformer', NumericalTransformer()),
    ('std_scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=_numerical_steps)

In [None]:


# Union of the three feature branches, in this order:
# numeric, one-hot categorical, TF-IDF text.
_feature_branches = [
    ('numerical_pipeline', numerical_pipeline),
    ('dummy_pipeline', dummy_pipeline),
    ('text_pipeline', text_pipeline),
]
full_pipeline = FeatureUnion(transformer_list=_feature_branches)

In [None]:
from sklearn.model_selection import train_test_split

# The target is the price column (already parsed to a number in the outlier
# cell); every other column is a candidate feature.
X = dataFrame.drop(columns='price')
y = dataFrame['price']

# 67 % / 33 % split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train.head()

In [None]:
# Fit every branch of the feature union on the training split only, so that
# nothing is learned from X_test.
full_pipeline.fit(X_train)

In [None]:
# Feature matrix for the training split, produced by the fitted pipeline.
# NOTE(review): presumably a SciPy sparse matrix, since the one-hot and TF-IDF
# branches emit sparse output — confirm the model accepts it.
train_data = full_pipeline.transform(X_train)
# Alias of y_train, used under this name by the training and plotting cells.
train_targets = y_train

In [None]:
# (rows, features); the second value is used as the model's input width.
train_data.shape

In [None]:
# Transform the X_test split with the pipeline fitted on X_train.
# These arrays are used for prediction and plotting below; model.fit itself
# validates on a validation_split of train_data instead.
validation_data = full_pipeline.transform(X_test)
validation_targets = y_test

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.layers import Dropout
from keras import optimizers
from keras import backend as K
from keras.layers import BatchNormalization, Activation

def r2_keras(y_true, y_pred):
    """R² metric built from Keras backend ops.

    Returns 1 - SS_res / (SS_tot + K.epsilon()); the epsilon keeps the
    division defined when SS_tot is zero.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    total_ss = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - residual_ss / (total_ss + K.epsilon())


# (units, dropout rate) of each Dense -> BatchNorm -> ReLU -> Dropout stage.
_HIDDEN_STAGES = [(1000, 0.33), (1000, 0.4), (800, 0.4), (700, 0.3)]

model = Sequential()
for _stage_index, (_units, _dropout_rate) in enumerate(_HIDDEN_STAGES):
    if _stage_index == 0:
        # Only the first layer declares the input width and carries the
        # L2 kernel regulariser.
        model.add(Dense(
            _units,
            input_shape=(train_data.shape[1],),
            kernel_regularizer=regularizers.l2(0.003),
        ))
    else:
        model.add(Dense(_units))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(_dropout_rate))

# Last hidden layer: no batch normalisation or dropout.
model.add(Dense(600))
model.add(Activation('relu'))

# Single linear output unit for the regression target.
model.add(Dense(1, activation='linear'))

# Loss alternatives noted by the author:
# mean_squared_logarithmic_error / mean_squared_error
model.compile(
    optimizer='adam',
    loss='mean_squared_logarithmic_error',
    metrics=['mae', r2_keras],
)
model.summary()

In [None]:
from keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Stop when val_loss has not improved by at least 0.001 for 20 epochs and
# restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=20,
    restore_best_weights=True,
    verbose=1,
)
# Multiply the learning rate by 0.7 after 4 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.7,
    patience=4,
    verbose=1,
)
callbacks_list = [early_stopping, reduce_lr]

# Reset the learning rate before training.
model.optimizer.learning_rate.assign(0.001)

_fit_options = dict(
    epochs=100,
    batch_size=512,
    verbose=1,
    callbacks=callbacks_list,
    validation_split=.25,
)
history = model.fit(train_data, train_targets, **_fit_options)

In [None]:
import matplotlib.pyplot as plt


def _plot_history_curves(epoch_axis, title, train_values, val_values, ylabel, legend, ylim=None):
    """Plot a metric's training and validation curves against the epoch number.

    ``ylim`` is forwarded to ``plt.ylim`` when given; otherwise the y-axis is
    left to autoscale.
    """
    plt.figure(figsize=(12,8))
    plt.title(title)
    plt.plot(epoch_axis, train_values)
    plt.plot(epoch_axis, val_values)
    if ylim is not None:
        plt.ylim(ylim)
    plt.xticks(ticks=epoch_axis)
    plt.ylabel(ylabel)
    plt.legend(legend)
    plt.show()


loss = history.history['loss']
val_loss = history.history['val_loss']
# One tick per completed epoch, starting at 1.
epochs = range(1, len(loss) + 1)
_plot_history_curves(
    epochs, 'LOSS', loss, val_loss, 'Loss',
    ['Training loss', 'Validation loss'], ylim=(0, .5),
)

mae = history.history['mae']
val_mae = history.history['val_mae']
_plot_history_curves(
    epochs, 'MAE', mae, val_mae, 'Mae',
    ['Training mae', 'Validation mae'],
)

# R² curves; only the lower y-limit is fixed (at .4).
acc = history.history['r2_keras']
val_acc = history.history['val_r2_keras']
_plot_history_curves(
    epochs, 'R2', acc, val_acc, 'R2',
    ['Training r2', 'Validation r2'], ylim=(.4, None),
)

In [None]:
# Distribution of the targets: first train_targets, then validation_targets,
# with identical binning (20 bins over 0-700) so the figures are comparable.
for _targets in (train_targets, validation_targets):
    plt.figure(figsize=(12,5))
    plt.hist(_targets, bins=20, range=(0, 700))
    plt.show()

In [None]:
# Validation R² logged for the last epoch that ran.
# NOTE(review): EarlyStopping uses restore_best_weights=True, so the model may
# hold weights from an earlier epoch than this last entry — confirm this is
# the figure that should be reported.
history.history['val_r2_keras'][-1]
# Numbers noted by the author, presumably from earlier runs; the metric of
# the first group is not stated — TODO confirm.
# 66.35355
# 66.218636
# 59.322807
# 65.16372
# 68.29044
# 50.142715

# 0.6787062883377075
# 0.6495150923728943
# 0.6714338064193726
# 0.6882

In [None]:
# Model output for the training features; later cells index it as [:, 0].
y_predicted = model.predict(train_data)
# Disabled back-transform; y_predicted_log is not defined in this notebook.
# y_predicted = np.exp(y_predicted_log)

# Model output for the features transformed from X_test.
y_test_predicted = model.predict(validation_data)

# Disabled duplicate of the cell that builds validation_data / validation_targets.
# validation_data = full_pipeline.transform(X_test)
# validation_targets = y_test

In [None]:
lims = (0, 500)
alpha = .01

# One figure per split (training first, then the X_test split). Each overlays
# predicted-vs-actual points with the actual-vs-actual diagonal as reference.
for _actual, _predicted in (
    (train_targets, y_predicted),
    (validation_targets, y_test_predicted[:,0]),
):
    plt.figure(figsize=(12,8))
    plt.scatter(_actual, _predicted, alpha=alpha)
    plt.scatter(_actual, _actual, alpha=alpha)
    plt.xlabel('Valor real por noche')
    plt.ylabel('Valor predicho por noche')
    plt.xlim(lims)
    plt.ylim(lims)
    plt.show()

In [None]:
# Training rows alongside their actual price, predicted price and the
# predicted / actual ratio.
_train_predictions = y_predicted[:,0]
df_predicted = X_train.assign(
    price=train_targets,
    predicted=_train_predictions,
    pred_ratio=_train_predictions / train_targets,
)

# Listings priced above 1000, ordered from most under-predicted to most
# over-predicted.
_inspect_cols = ['price', 'predicted', 'pred_ratio', 'listing_url', 'name']
df_predicted.loc[df_predicted.price > 1000, _inspect_cols].sort_values('pred_ratio')