In [1]:
#Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import json
from pathlib import Path  

In [2]:
# Load the three raw CSV exports (listings, calendar, reviews) from the
# local data/ directory, in that order.
df_listings, df_calendar, df_reviews = (
    pd.read_csv(csv_path)
    for csv_path in ('data/listings.csv', 'data/calendar.csv', 'data/reviews.csv')
)

In [3]:
# Report how many rows (one per listing) were loaded from listings.csv.
listing_count = len(df_listings)
print("There are %s listings in the listings.csv" % listing_count)

There are 3818 listings in the listings.csv


In [4]:
def remove_dollar_sign(x):
    """Parse a currency string into a float.

    Args:
        x: A cell value. Only values whose type is exactly ``str`` are
            converted; everything else is returned unchanged.

    Returns:
        The string with every '$' and ',' removed, converted with ``float``;
        or ``x`` itself when it is not a string.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    if type(x) is not str:
        return x
    cleaned = x.replace('$', '').replace(',', '')
    return float(cleaned)
def transform_percent(x):
    """Parse a percentage string into a fraction.

    Args:
        x: A cell value. Only values whose type is exactly ``str`` are
            converted; everything else is returned unchanged.

    Returns:
        The string with every '%' removed, converted with ``float`` and
        divided by 100; or ``x`` itself when it is not a string.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    if type(x) is not str:
        return x
    digits = x.replace('%', '')
    return float(digits) / 100
def transform_boolean(x):
    """Convert a "t"/"f" flag string to 1.0/0.0.

    Args:
        x: A cell value. Strings are converted; anything that is not a
            string (for example NaN, or a float produced by an earlier
            conversion) is returned unchanged.

    Returns:
        ``1.0`` for "t", ``0.0`` for "f", NaN for any other string, and
        ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        # Pass non-strings through, as remove_dollar_sign and
        # transform_percent do. Without this, applying the conversion to an
        # already-converted column (e.g. re-running the cell) would turn
        # every 0.0/1.0 into None and wipe the column.
        return x
    if x == "t":
        return float(1)
    if x == "f":
        return float(0)
    # Unrecognised flag text: report it as a missing numeric value rather
    # than falling off the end of the function and returning None.
    return float("nan")

def fill_mean(col):
    """Return ``col`` with its missing values replaced by ``col.mean()``.

    Args:
        col: An object exposing ``mean()`` and ``fillna()``; it is applied
            to DataFrame columns via ``DataFrame.apply`` in this notebook.

    Returns:
        The result of ``col.fillna(col.mean())``.
    """
    column_mean = col.mean()
    return col.fillna(column_mean)

In [5]:
# Correlation between features: prepare the numeric data.

# Currency columns: strip '$' and ',' and convert to float.
for money_column in ('price', 'weekly_price', 'monthly_price', 'security_deposit'):
    df_listings[money_column] = df_listings[money_column].map(remove_dollar_sign)

# Percentage columns: strip '%' and convert to a fraction.
for rate_column in ('host_response_rate', 'host_acceptance_rate'):
    df_listings[rate_column] = df_listings[rate_column].map(transform_percent)

# Superhost flag: "t"/"f" become 1.0/0.0.
df_listings['host_is_superhost'] = df_listings['host_is_superhost'].map(transform_boolean)

# Keep only the numeric columns, minus the ones listed below.
columns_to_drop = ['license','latitude','longitude','id','scrape_id','host_id']
df_listings_numeric_only = (
    df_listings
    .select_dtypes(include=[np.number])
    .drop(columns_to_drop, axis=1)
)
# Snapshot before missing values are filled.
df_listings_numeric_only.to_csv(Path('out.csv'))

# Replace missing values in every column with that column's mean.
df_listings_numeric_only = df_listings_numeric_only.apply(fill_mean, axis=0)
# Snapshot after missing values are filled.
df_listings_numeric_only.to_csv(Path('out2.csv'))

corr = df_listings_numeric_only.corr()
# NOTE: a bare expression is only rendered when it is the last one in its cell.
df_listings_numeric_only.head()

# How does the number of amenities influence the price?
# Strip the braces and double quotes from each listing's amenities string so
# that a plain comma split yields the individual amenity names.
res = df_listings.amenities.apply(lambda s: s.replace('{','').replace('}','').replace('"',''))

# The amenities present on each listing, as a set. Empty names (produced when
# the cleaned string is empty) are dropped, and a name repeated within one
# listing still yields exactly one flag for that row, so every column ends up
# with one entry per listing.
amenities_by_listing = [
    {name for name in row.split(',') if name}
    for row in res
]

# Sorted so the column order is reproducible: iteration order of a set of
# strings can change from one interpreter run to the next.
amenities_set = sorted(set().union(*amenities_by_listing))

# One boolean column per amenity, one entry per listing.
dictionary = {
    name: [name in present for present in amenities_by_listing]
    for name in amenities_set
}
print(dictionary.keys())

# Reuse the listings' index so the column-wise concat aligns rows explicitly.
df_amenities = pd.DataFrame(dictionary, index=res.index)
result = pd.concat([df_amenities, df_listings_numeric_only], axis=1, join='inner')
result.to_csv(Path('out3.csv'))

dict_keys(['Cat(s)', 'Hair Dryer', 'Smoke Detector', 'Wireless Internet', 'Other pet(s)', 'Dog(s)', 'Smoking Allowed', 'Heating', '24-Hour Check-in', 'Family/Kid Friendly', 'Safety Card', 'Internet', 'Gym', 'Breakfast', 'Indoor Fireplace', 'Fire Extinguisher', 'Wheelchair Accessible', 'Iron', 'Pool', 'First Aid Kit', 'Laptop Friendly Workspace', 'Doorman', 'Carbon Monoxide Detector', 'Shampoo', 'Dryer', 'Buzzer/Wireless Intercom', 'Free Parking on Premises', 'Hangers', 'Suitable for Events', 'Pets Allowed', 'Washer / Dryer', 'Hot Tub', 'Pets live on this property', 'Kitchen', 'Elevator in Building', 'TV', 'Lock on Bedroom Door', 'Essentials', 'Air Conditioning', 'Washer', 'Cable TV'])


In [6]:
# Target is the listing price; every other column of `result` is a feature.
y = result.price
X = result.drop(columns='price')

# Hold out 30% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_test_pred = lr_model.predict(X_test)
y_train_pred = lr_model.predict(X_train)

# Print model evaluation scores for training and test.
# NOTE(review): the output saved with this cell shows R^2 = 0.631 on the
# training split but -498.276 on the test split, so the fitted model does not
# generalise -- investigate before relying on these scores.
for split_label, actual, predicted in (
    ('Traning Data:', y_train, y_train_pred),
    ('Test Data:', y_test, y_test_pred),
):
    print(split_label)
    print('r-sqaured score: %.3f' % r2_score(actual, predicted))
    print('mean-sqaured error: %.3f' % mean_squared_error(actual, predicted))
    print('---')

Traning Data:
r-sqaured score: 0.631
mean-sqaured error: 2937.751
---
Test Data:
r-sqaured score: -498.276
mean-sqaured error: 4275574.754
---
