In [None]:
# Importing modules for data processing (pandas) and plotting (matplotlib, seaborn)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' style to the figures created after this call
sns.set_style('darkgrid')

%matplotlib inline

In [None]:
# Folder that contains the three CSV files
data_path = r'./Munich/'

# Read every CSV file into its own dataframe (same order as the file names)
calendar_df, listings_df, reviews_df = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('calendar.csv', 'listings.csv', 'reviews.csv')
)

# Report the number of rows and columns of each dataframe as one aligned line
row_template = "{:<20} {:>10} {:<5} {:>3} {:>5}"
for label, frame in (('calendar_df has', calendar_df),
                     ('listings_df has', listings_df),
                     ('reviews_df has', reviews_df)):
    n_rows, n_cols = frame.shape
    print(row_template.format(label, format(n_rows, ',.0f'), ' rows and ',
                              format(n_cols, ',.0f'), ' columns.'))

In [None]:
# Converting strings in the dataframe to floats and booleans
# Price: strip the '$' and ',' characters and cast to float. The detour through
# astype(str) keeps the cell re-runnable once the column already holds floats
# (a missing value stays a missing value after the cast).
calendar_df['price'] = (
    calendar_df['price']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# Availability: map the 't'/'f' flags to booleans and assign the result back.
# The previous `calendar_df['available'].replace(..., inplace=True)` was a chained
# in-place call on a column selection, which is not guaranteed to write back to
# the dataframe (it does not under pandas Copy-on-Write).
# True/False map to themselves so re-running the cell is harmless; any other
# value (including missing ones) becomes NaN.
calendar_df['available'] = calendar_df['available'].map(
    {'t': True, 'f': False, True: True, False: False}
)

In [None]:
# Preview the first five rows of the calendar dataframe
calendar_df.head(5)

In [None]:
# Count the distinct listing ids that occur in the calendar dataframe
n_calendar_listings = calendar_df['listing_id'].nunique()
print('The number of objects in calendar dataframe is', format(n_calendar_listings, ',.0f'))

In [None]:
# Summary statistics of the 'price' column of the calendar dataframe
calendar_prices = calendar_df['price']
print('The maximum price of objects is', "{:,.2f}".format(calendar_prices.max()), '$')
print('The minimum price of objects is', "{:,.2f}".format(calendar_prices.min()), '$')
print('The average price of objects is', "{:,.2f}".format(calendar_prices.mean()), '$')
# Mean over prices within [10, 400] only (Series.between includes both bounds).
# This line was previously labelled "maximum price" although it prints a mean.
print('The average price of objects without outliers is',
      "{:,.2f}".format(calendar_prices[calendar_prices.between(10, 400)].mean()), '$')

In [None]:
# Count the distinct listings that have at least one calendar row priced within
# [10, 400] (Series.between includes both bounds). The mask is applied with .loc
# in a single step instead of slicing the 'listing_id' column afterwards.
in_price_band = calendar_df['price'].between(10, 400)
print('The number of objects without outliers in the calendar dataframe is',
      "{:,.0f}".format(calendar_df.loc[in_price_band, 'listing_id'].nunique()))

In [None]:
# Mean calendar price per listing. 'price' is selected BEFORE aggregating so only
# that column is averaged; calling mean() on the whole grouped frame also tries to
# average every other column and raises TypeError in pandas >= 2.0 as soon as one
# of them is non-numeric (e.g. a text column).
price = calendar_df.groupby('listing_id')['price'].mean().reset_index()

fig, ax = plt.subplots(nrows=1, sharex=True, figsize=(8, 3))
# Only listings whose mean price lies within [10, 400] are plotted; the histogram
# is drawn explicitly on the axes created above.
price.loc[price['price'].between(10, 400), 'price'].hist(bins=50, ax=ax)
ax.set_ylabel('Number of objects')
ax.set_xlabel('Price of available objects')
ax.set_title('Number of object vs. price of objects');

In [None]:
# converting the price value in the listings dataframe to float number:
# strip the '$' and ',' characters, then cast. The detour through astype(str)
# keeps the cell re-runnable once the column already holds floats.
listings_df['price'] = (
    listings_df['price']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# Map the 't'/'f' flags to booleans and assign the result back. The previous
# chained `listings_df['instant_bookable'].replace(..., inplace=True)` calls (one of
# them duplicated) are not guaranteed to write back to the dataframe.
# True/False map to themselves so re-running the cell is harmless; any other
# value (including missing ones) becomes NaN.
listings_df['instant_bookable'] = listings_df['instant_bookable'].map(
    {'t': True, 'f': False, True: True, False: False}
)

# Preview the first five rows of the converted listings dataframe
listings_df.head(5)

In [None]:
# Preview the first five rows of the reviews dataframe
reviews_df.head(5)

In [None]:
# Overall share of calendar rows flagged as available, printed as a percentage
print('The average number available objects',"{:,.2f}".format(100*calendar_df['available'].mean()),'%')

fig, ax = plt.subplots(nrows=1, sharex=True, figsize=(8, 3))
# Mean of the 'available' flag per listing. The column is selected BEFORE
# aggregating: mean() on the whole grouped frame raises TypeError in pandas >= 2.0
# when any other column is non-numeric.
calendar_df.groupby('listing_id')['available'].mean().hist(bins=20, ax=ax)
ax.set_ylabel('Number of objects')
ax.set_xlabel('Average number of available objects')
ax.set_title('Number of object vs. ratio of available objects');

In [None]:
# Mean 'bedrooms' value per listing id. The column is selected BEFORE aggregating:
# mean() on the whole grouped frame raises TypeError in pandas >= 2.0 when the
# frame contains non-numeric columns.
rooms_count = listings_df.groupby('id')['bedrooms'].mean().reset_index()

fig, ax = plt.subplots(nrows=1, sharex=True, figsize=(8, 3))
# Only listings with 0 to 5 bedrooms (both bounds included) are plotted
rooms_count.loc[rooms_count['bedrooms'].between(0, 5), 'bedrooms'].hist(bins=10, ax=ax)
ax.set_ylabel('Count of objects')
ax.set_xlabel('Number of rooms in object')
ax.set_title('Number of objects vs. number of rooms in object');

In [None]:
# Mean 'beds' value per listing id. The column is selected BEFORE aggregating:
# mean() on the whole grouped frame raises TypeError in pandas >= 2.0 when the
# frame contains non-numeric columns.
beds_count = listings_df.groupby('id')['beds'].mean().reset_index()

fig, ax = plt.subplots(nrows=1, sharex=True, figsize=(8, 3))
# Only listings with fewer than 10 beds are plotted
beds_count.loc[beds_count['beds'] < 10, 'beds'].hist(bins=10, ax=ax)
ax.set_xlabel('Number of beds in object')
ax.set_ylabel('Number of objects')
ax.set_title('Number of objects vs. number of beds in object');

In [None]:
# comments_count = listings_df.groupby('id').mean()['reviews_per_month'].reset_index()

# fig,ax = plt.subplots(nrows=1, sharex=True,figsize=(8, 3))
# comments_count['reviews_per_month'][comments_count['reviews_per_month'] < 10].hist(bins=10)
# plt.xlabel('Number of reviews per month in object')
# plt.ylabel('Number of objects')
# plt.title('Number of reviews per objects vs. number of reviews per month in object');

In [None]:
# Mean 'reviews_per_month' value per listing id. The column is selected BEFORE
# aggregating: mean() on the whole grouped frame raises TypeError in pandas >= 2.0
# when the frame contains non-numeric columns.
reviews_per_month = listings_df.groupby('id')['reviews_per_month'].mean().reset_index()
fig, ax = plt.subplots(nrows=1, sharex=True, figsize=(8, 3))
# Only listings with 1 to 5 reviews per month (both bounds included) are plotted
reviews_per_month.loc[
    reviews_per_month['reviews_per_month'].between(1, 5), 'reviews_per_month'
].hist(bins=5, ax=ax)

ax.set_xlabel('Number of reviews')
ax.set_ylabel('Number of objects')
ax.set_title('Number of reviews per objects');

In [None]:
# Number of distinct listing ids in the per-listing reviews_per_month table
reviews_per_month['id'].nunique()

In [None]:
# Mean 'review_scores_rating' value per listing id. The column is selected BEFORE
# aggregating: mean() on the whole grouped frame raises TypeError in pandas >= 2.0
# when the frame contains non-numeric columns.
review_score_rating = listings_df.groupby('id')['review_scores_rating'].mean().reset_index()
fig, ax = plt.subplots(nrows=1, sharex=True, figsize=(8, 3))
# Only listings with a rating strictly below 5 are plotted
review_score_rating.loc[
    review_score_rating['review_scores_rating'] < 5, 'review_scores_rating'
].hist(bins=25, ax=ax)
ax.set_xlabel('Review score rating')
ax.set_ylabel('Number of objects')
ax.set_title('Review score rating');

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from time import time
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.neural_network import MLPRegressor

# Fixed seed so that the train/test split, the quantile transformer and the network
# initialisation (and therefore the reported scores) are reproducible across runs.
RANDOM_STATE = 42

# Keep only rows with positive bedrooms, beds and review score and a known price.
# Comparisons against NaN evaluate to False, so rows with missing feature values
# are removed as well. One combined mask is built from the current frame, instead
# of chaining filters whose masks came from the not-yet-filtered frame (pandas had
# to re-align those and warned about it). Rows without a price are dropped because
# the regressor cannot be fitted on a NaN target.
usable_rows = (
    (listings_df['bedrooms'] > 0)
    & (listings_df['beds'] > 0)
    & (listings_df['review_scores_rating'] > 0)
    & listings_df['price'].notna()
)
listings_df = listings_df[usable_rows]

# Features and regression target
X = listings_df[['beds','bedrooms','review_scores_rating']]
Y = listings_df['price']

print("Training MLPRegressor...")
tic = time()
# Split data into training (90 %) and test (10 %) data, and fit an MLP regressor
# behind a quantile transformation of the features
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.10, random_state=RANDOM_STATE
)
model = make_pipeline(
    QuantileTransformer(random_state=RANDOM_STATE),
    MLPRegressor(hidden_layer_sizes=(100, 100, 100),
                 learning_rate_init=0.01,
                 early_stopping=True,
                 max_iter=100000,
                 tol=0.0001,
                 epsilon=1e-08,
                 random_state=RANDOM_STATE),
)

# Fit the whole pipeline on the training split; errors in the data surface here
model.fit(X_train, Y_train)
print(f"done in {time() - tic:.3f}s")

Y_test_preds = model.predict(X_test)  # Predictions on the held-out test split

# calculate rsquared and mse
rsquared_train = model.score(X_train, Y_train)
rsquared_test = r2_score(Y_test, Y_test_preds)
mse_test = mean_squared_error(Y_test, Y_test_preds)
print('R2 score of training dataset',"{:,.2f}".format(100*rsquared_train),'%')
print('R2 score of testing dataset',"{:,.2f}".format(100*rsquared_test),'%')
print('MSE of testing dataset',"{:,.2f}".format(mse_test))


In [None]:
# Print the column names, non-null counts and dtypes of the listings dataframe
listings_df.info()