In [None]:
import pandas as pd 
import numpy as np 

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
import sklearn.metrics as metrics
from collections import Counter


import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import math
import pylab
import scipy.stats as stats


In [None]:
# Load the raw listings CSV into a pandas DataFrame; df is the unmodified
# frame that the cleaning cells below select their columns from.
df = pd.read_csv("data/listings.csv") 

In [None]:
# Show every column label in the raw frame so the columns to keep can be chosen
df.columns

In [None]:
# Keep only the listing attributes considered useful for pricing an airbnb.
# DataFrame.filter silently skips any label that is absent from df, and
# .copy() makes df_cleaned independent of the raw frame.
df_cleaned = df.filter([
    'id',
    'space',
    'experiences_offered',
    'host_since',
    'host_is_superhost',
    'host_total_listings_count',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
    'street',
    'neighbourhood',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'city',
    'state',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'square_feet',
    'price',
    'weekly_price',
    'monthly_price',
    'security_deposit',
    'guests_included',
    'last_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'instant_bookable',
    'is_business_travel_ready',
    'cancellation_policy',
]).copy()

In [None]:
# Inspect the dtype of each column kept in df_cleaned
df_cleaned.dtypes

In [None]:
# Column labels that survived the filter
df_cleaned.columns

In [None]:
# Preview the first rows of the filtered frame
df_cleaned.head()

In [None]:
# Narrow the working frame to the modelling columns ('space' and 'host_since'
# are left out here).
# 'zipcode' is included because later cells index df_cleansed['zipcode']
# (null checks, dropna, dummies) and would otherwise raise KeyError.
# The columns are selected from the raw df, since 'zipcode' only exists there;
# this also keeps the cell idempotent when it is re-run. .copy() gives an
# independent frame so later column assignments never touch df.
df_cleaned = df.filter([
    'id', 'experiences_offered', 'host_is_superhost',
    'host_total_listings_count', 'host_verifications',
    'host_has_profile_pic', 'host_identity_verified', 'street',
    'neighbourhood', 'neighbourhood_cleansed',
    'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'latitude',
    'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
    'bedrooms', 'beds', 'bed_type', 'square_feet', 'price', 'weekly_price',
    'monthly_price', 'security_deposit', 'guests_included', 'last_review',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'instant_bookable', 'is_business_travel_ready',
    'cancellation_policy',
]).copy()

In [None]:
#make dummies of categorical columns
#columns for dummies = host_response_time, host_is_superhost, room_type,
#cancellation_policy, experiences_offered

# host_response_time is not among the columns kept in df_cleaned, so
# df_cleaned.host_response_time raises AttributeError. Fall back to the raw
# frame, which has the same rows and index at this point.
# NOTE(review): assumes the raw listings file has a host_response_time column - confirm.
host_response_time_source = df_cleaned if 'host_response_time' in df_cleaned.columns else df
host_response_time_dummies = pd.get_dummies(host_response_time_source['host_response_time'])

# The remaining columns are all present in df_cleaned.
host_is_superhost_dummies = pd.get_dummies(df_cleaned['host_is_superhost'])
room_type_dummies = pd.get_dummies(df_cleaned['room_type'])
cancellation_policy_dummies = pd.get_dummies(df_cleaned['cancellation_policy'])
experiences_offered_dummies = pd.get_dummies(df_cleaned['experiences_offered'])

In [None]:
# Preview the indicator columns generated for cancellation_policy
cancellation_policy_dummies.head()

In [None]:
# Preview the indicator columns generated for experiences_offered
experiences_offered_dummies.head()

In [None]:
#change price column to float for future use

# Strip every character that is not a digit, sign or decimal point, then cast.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the original text
# untouched and make astype(float) fail on any non-numeric character.
df_cleaned['price'] = (
    df_cleaned['price']
    .str.replace(r'[^-+\d.]', '', regex=True)
    .astype(float)
)


In [None]:
# Count rows whose price is null
df_cleaned['price'].isnull().sum()

In [None]:
# Column labels currently present in df_cleaned

df_cleaned.columns

In [None]:
#drop more columns that are not needed

# Several of these labels (e.g. host_response_time, host_response_rate,
# host_listings_count, country_code, country, amenities, cleaning_fee,
# first_review) are not in the column list df_cleaned was filtered to.
# Without errors='ignore', DataFrame.drop raises KeyError for them; with it,
# the labels that are present are dropped and the rest are skipped.
df_cleansed = df_cleaned.drop(
    columns=[
        'host_response_time', 'host_response_rate',
        'host_listings_count', 'host_total_listings_count',
        'host_verifications', 'host_has_profile_pic', 'street', 'neighbourhood',
        'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city',
        'state', 'country_code', 'country',
        'amenities',
        'monthly_price', 'security_deposit', 'cleaning_fee',
        'first_review', 'last_review', 'review_scores_rating',
    ],
    errors='ignore',
)




In [None]:
# Column labels left in df_cleansed after the drop
df_cleansed.columns

In [None]:
# Null count per column, used to spot columns that are too sparse to keep
df_cleansed.isnull().sum()

In [None]:
# square_feet and weekly_price have too many null/NaN values, so remove both
sparse_columns = ["square_feet", "weekly_price"]
df_cleansed = df_cleansed.drop(sparse_columns, axis=1)

In [None]:
# Quick look at the remaining columns in df_cleansed
df_cleansed.head()

In [None]:
# Number of entries in the zipcode column (len counts null entries too).
# Requires 'zipcode' to be a column of df_cleansed; raises KeyError otherwise.
len(df_cleansed['zipcode'])

In [None]:
# Visualise how listing capacity is distributed: histogram of 'accommodates'
accommodates_values = df_cleansed['accommodates']
plt.hist(accommodates_values, bins=50)
plt.gca().set(
    title="Histogram of Accommodations",
    xlabel="Number of Accommodations",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Report the distinct bed counts, then how many listings have 1..16 beds
distinct_beds = np.unique(df_cleansed['beds'])
print ('Number of Unique Beds: ', distinct_beds)
for bed_count in range(1, 17):
    listings_with_count = (df_cleansed['beds'] == bed_count).sum()
    print ('Beds {}:'.format(bed_count), listings_with_count)



In [None]:
# One-hot encode the beds column: one indicator column per distinct bed count
beds_dummies = pd.get_dummies(df_cleansed.beds)

In [None]:
# Preview the first rows of the beds indicator columns
beds_dummies.head()

In [None]:
# Histogram showing how the number of beds is distributed across listings
beds_values = df_cleansed['beds']
plt.hist(beds_values, bins=50)
plt.gca().set(
    title="Histogram of beds",
    xlabel="Number of beds",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Remove rows with a missing zipcode while keeping the whole frame.
# df_cleansed['zipcode'].dropna() would rebind df_cleansed to a Series holding
# only the zipcodes, breaking every later cell that reads other columns.
df_cleansed = df_cleansed.dropna(subset=['zipcode'])

In [None]:
# dtypes is a property, not a method; calling it raises TypeError
df_cleansed.dtypes

In [None]:
# One-hot encode the remaining categorical columns, one indicator frame each
zipcode_dummies, property_dummies, room_dummies, bed_dummies = (
    pd.get_dummies(df_cleansed[column_name])
    for column_name in ('zipcode', 'property_type', 'room_type', 'bed_type')
)

In [None]:
# Quick look at the remaining dataset
df_cleansed.head()

In [None]:
# Quick look at the zipcode_dummies columns
zipcode_dummies.head()


In [None]:
zipcode_dummies.isna().sum()
# Author's note: the zipcode labels show up as a mix of ints and floats and
# still need to be converted to a single integer representation.

In [None]:
# Display the raw zipcode column cast to strings. The result is not assigned,
# so neither df nor df_cleansed is modified by this cell.
df['zipcode'].astype(str)


In [None]:
# Report missing values: first for zipcode alone, then for every column
null_counts = df_cleansed.isnull().sum()
print ("zipcode null values:", null_counts['zipcode'])
print ("null values in df:", null_counts)

In [None]:
# Keep only the rows where zipcode, bedrooms and bathrooms are all present
required_columns = ['zipcode', 'bedrooms', 'bathrooms']
df_cleansed = df_cleansed.dropna(axis=0, how='any', subset=required_columns)

In [None]:
# Re-check the null count per column after dropping incomplete rows
df_cleansed.isnull().sum()

In [None]:
# Display zipcode cast to int. The result is not assigned, so the column in
# df_cleansed keeps its original dtype.
# NOTE(review): astype(int) raises if any zipcode is not a plain integer value - confirm the data.
df_cleansed['zipcode'].astype(int)

In [None]:
# zipcode is no longer needed as a raw column; remove it from the frame
df_cleansed = df_cleansed.drop("zipcode", axis=1)


In [None]:
# Null count per column after removing zipcode
df_cleansed.isnull().sum()

In [None]:

# Null count per property_type indicator column
property_dummies.isna().sum()

In [None]:

# Null count per room_type indicator column
room_dummies.isna().sum()

In [None]:

# Null count per bed_type indicator column
bed_dummies.isna().sum()

In [None]:

# Renumber the rows of each indicator frame from 0.
# reset_index(..., inplace=True) returns None, so assigning its result replaced
# every frame with None; assign the returned frame instead.
#zipcode_dummies = zipcode_dummies.reset_index(drop=True)
property_dummies = property_dummies.reset_index(drop=True)
room_dummies = room_dummies.reset_index(drop=True)
bed_dummies = bed_dummies.reset_index(drop=True)


In [None]:

# Join the indicator frames side by side (axis=1) so each row keeps one set of
# property / room / bed indicators. The default axis=0 stacks the frames as
# extra rows and fills the non-matching columns with NaN.
dummies_df = pd.concat((property_dummies, room_dummies, bed_dummies), axis=1)

In [None]:

# Null count per column of the combined indicator frame
dummies_df.isnull().sum()

In [None]:
# One-hot encode all five categorical columns in a single pass.
# Previously each column was encoded in its own cell, every time starting again
# from df_cleansed, so each assignment overwrote the one before it and only the
# cancellation_policy indicators survived.
# Passing columns= replaces each listed column with its indicators and prefixes
# them with the source column name, so the numeric beds / accommodates levels
# cannot produce clashing column labels.
df_dummies_cleansed = pd.get_dummies(
    df_cleansed,
    columns=['property_type', 'room_type', 'beds', 'accommodates', 'cancellation_policy'],
)


In [None]:
# Null count per column of the encoded frame
df_dummies_cleansed.isnull().sum()

In [None]:
# Build the full table: the raw frame with its categorical columns replaced by
# integer indicator columns.
# The indicators are derived from df inside this cell because the original
# referenced `beds` and `accommodates_dummies`, which are never defined
# (NameError), and because frames built in other cells may no longer share
# df's row index.
# prefix=column keeps the numeric beds / accommodates levels from yielding
# duplicate column labels. astype(int) replaces astype(str).astype(int), which
# fails when get_dummies returns booleans ('True' cannot be parsed as an int).
# NOTE(review): this starts from the raw df (all original columns), as the
# original cell did, rather than from df_cleansed - confirm that is intended.
encoded_columns = ['property_type', 'room_type', 'beds', 'accommodates', 'cancellation_policy']
dummy_frames = [
    pd.get_dummies(df[column], prefix=column).astype(int)
    for column in encoded_columns
]
alldata = pd.concat([df.drop(encoded_columns, axis=1)] + dummy_frames, axis=1)
allcols = alldata.columns