In [171]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind 

In [172]:
df = pd.read_csv('listings.csv', low_memory=False)

In [173]:
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,958,https://www.airbnb.com/rooms/958,20191204162709,2019-12-04,"Bright, Modern Garden Unit - 1BR/1B",New update: the house next door is under const...,"Newly remodeled, modern, and bright garden uni...",New update: the house next door is under const...,none,*Quiet cul de sac in friendly neighborhood *St...,...,f,f,moderate,f,f,1,1,0,0,1.79
1,3850,https://www.airbnb.com/rooms/3850,20191204162709,2019-12-04,Charming room for two,Your own private room plus access to a shared ...,This room can fit two people. Nobody else will...,Your own private room plus access to a shared ...,none,"This is a quiet, safe neighborhood on a substa...",...,f,f,strict_14_with_grace_period,f,f,3,0,3,0,1.38
2,5858,https://www.airbnb.com/rooms/5858,20191204162709,2019-12-05,Creative Sanctuary,,We live in a large Victorian house on a quiet ...,We live in a large Victorian house on a quiet ...,none,I love how our neighborhood feels quiet but is...,...,f,f,strict_14_with_grace_period,f,f,1,1,0,0,0.86
3,7918,https://www.airbnb.com/rooms/7918,20191204162709,2019-12-04,A Friendly Room - UCSF/USF - San Francisco,Nice and good public transportation. 7 minute...,"Settle down, S.F. resident, student, hospital,...",Nice and good public transportation. 7 minute...,none,"Shopping old town, restaurants, McDonald, Whol...",...,f,f,strict_14_with_grace_period,f,f,9,0,9,0,0.14
4,8142,https://www.airbnb.com/rooms/8142,20191204162709,2019-12-04,Friendly Room Apt. Style -UCSF/USF - San Franc...,Nice and good public transportation. 7 minute...,"Settle down, S.F. resident, student, hospital,...",Nice and good public transportation. 7 minute...,none,,...,f,f,strict_14_with_grace_period,f,f,9,0,9,0,0.13


In [174]:
df.shape #get the number of columns and rows in dataset

(8533, 106)

In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8533 entries, 0 to 8532
Columns: 106 entries, id to reviews_per_month
dtypes: float64(23), int64(21), object(62)
memory usage: 6.9+ MB


In [176]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8533 entries, 0 to 8532
Data columns (total 106 columns):
id                                              int64
listing_url                                     object
scrape_id                                       int64
last_scraped                                    object
name                                            object
summary                                         object
space                                           object
description                                     object
experiences_offered                             object
neighborhood_overview                           object
notes                                           object
transit                                         object
access                                          object
interaction                                     object
house_rules                                     object
thumbnail_url                                   float64
medium_url   

Step 1: Drop obvious non-useful columns

In [177]:
df = df.drop(['scrape_id','last_scraped','thumbnail_url','medium_url','picture_url','xl_picture_url','host_name',
         'host_thumbnail_url','neighbourhood_group_cleansed'], axis=1)

In [178]:
df.shape #output: dropped 9 columns 

(8533, 97)

Step 2: Investigating NULL columns

In [179]:
df.dropna(how = 'any').shape #would result in these many rows if we drop all rows that has any null columns 

(0, 97)

In [180]:
df.columns[df.isna().any()].tolist() #selecting the columns with null values

['summary',
 'space',
 'description',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'city',
 'state',
 'zipcode',
 'market',
 'bathrooms',
 'bedrooms',
 'beds',
 'square_feet',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'first_review',
 'last_review',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'license',
 'jurisdiction_names',
 'reviews_per_month']

In [181]:
len(df.columns[df.isna().any()].tolist())#counting how many columns have NULL values

47

In [182]:
cols_num_missing.sort_values(ascending = False).head(30) #checking the number of NULL values per column

thumbnail_url                   8533
neighbourhood_group_cleansed    8533
host_acceptance_rate            8533
xl_picture_url                  8533
medium_url                      8533
square_feet                     8407
monthly_price                   7507
weekly_price                    7480
notes                           3317
access                          3244
license                         3243
interaction                     2804
transit                         2606
house_rules                     2283
host_about                      2155
neighborhood_overview           2111
review_scores_value             1938
review_scores_location          1938
review_scores_checkin           1938
review_scores_cleanliness       1937
review_scores_accuracy          1937
review_scores_communication     1936
review_scores_rating            1932
reviews_per_month               1891
first_review                    1891
last_review                     1891
security_deposit                1759
s

In [183]:
#2.1) Drop the columns with many NULL values, there are 8533 rows in total 

### first drop the obvious one
df2 = df.drop(['host_acceptance_rate'], axis = 1)

In [184]:
df2.shape

(8533, 96)

In [185]:
### second drop the columns with majority null
null_perc = cols_num_missing/len(df2) 

In [186]:
null_perc

id                                              0.00000
listing_url                                     0.00000
scrape_id                                       0.00000
last_scraped                                    0.00000
name                                            0.00000
                                                 ...   
calculated_host_listings_count                  0.00000
calculated_host_listings_count_entire_homes     0.00000
calculated_host_listings_count_private_rooms    0.00000
calculated_host_listings_count_shared_rooms     0.00000
reviews_per_month                               0.22161
Length: 106, dtype: float64

In [187]:
for col in df2.loc[:, null_perc > .7]: #80% of data is null 
    print(col)

square_feet
weekly_price
monthly_price


In [188]:
mostly_null_cols = list(df.loc[:, null_perc > .8]) #70% of data is null

In [189]:
df2 = df.drop(columns = mostly_null_cols)

In [190]:
df2.shape

(8533, 93)

Step 3: Check validity of certain data points

In [191]:
#3.1: check validity of NULL zip codes 
df2[df2['zipcode'].isnull()]['neighbourhood'].value_counts()

SoMa                     50
Nob Hill                 35
Western Addition/NOPA    31
Downtown                 19
Mission District         10
Potrero Hill              9
Financial District        8
Russian Hill              8
The Castro                8
Dogpatch                  7
Pacific Heights           6
South Beach               6
Hayes Valley              6
Cole Valley               5
Richmond District         4
Bernal Heights            4
Duboce Triangle           3
Noe Valley                3
Lakeshore                 3
Bayview                   2
Civic Center              2
North Beach               2
Alamo Square              2
Telegraph Hill            2
Lower Haight              1
Chinatown                 1
Marina                    1
Haight-Ashbury            1
Parkside                  1
Crocker Amazon            1
Presidio Heights          1
Fisherman's Wharf         1
Outer Sunset              1
Twin Peaks                1
Name: neighbourhood, dtype: int64

In [192]:
pd.pivot_table(df2[['zipcode','neighbourhood','availability_365']],index=['neighbourhood','zipcode'])
#each neighborhood as multiple zip codes so didn't assign the null zipcodes zipcodes according to their neighborhood

Unnamed: 0_level_0,Unnamed: 1_level_0,availability_365
neighbourhood,zipcode,Unnamed: 2_level_1
Alamo Square,94115,121.333333
Alamo Square,94117,196.392157
Balboa Terrace,94112,199.115385
Balboa Terrace,94127,266.444444
Bayview,94110,365.000000
...,...,...
Western Addition/NOPA,94121,327.000000
Western Addition/NOPA,CA 94102,237.250000
Western Addition/NOPA,CA 94109,240.250000
Western Addition/NOPA,CA 94115,239.000000


In [193]:
#3.2: check validity of reviews count of 0
len(df2[df2['number_of_reviews'] == 0]) #1891 with 0 reviews 

1891

In [194]:
#for number_of_reviews = 0, check to see if the other review counts are also 0 
df2[df2['number_of_reviews'] == 0]['reviews_per_month'].value_counts() # null

Series([], Name: reviews_per_month, dtype: int64)

In [195]:
df2[df2['number_of_reviews'] == 0]['number_of_reviews_ltm'].value_counts() # null

0    1891
Name: number_of_reviews_ltm, dtype: int64

In [196]:
#removing rows with 0 number of reviews 
df3 = df2[df2['number_of_reviews'] != 0]

In [197]:
df3.shape

(6642, 93)

Step 4: Reformatting Columns

In [198]:
#formatting the numeric columns 
numeric_cols = ['price','security_deposit','cleaning_fee','extra_people']

In [199]:
#reformat zip_code

#strip out the $ in the numeric columns

for col in numeric_cols: 
    df3[col] = df3[col].str.lstrip('$')
    df3[col] = df3[col].str.replace(',','').astype(float)
#     df2[col] = df2[col].strip(',')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [200]:
df3[numeric_cols].head(10)

Unnamed: 0,price,security_deposit,cleaning_fee,extra_people
0,170.0,100.0,100.0,25.0
1,99.0,0.0,10.0,20.0
2,235.0,,100.0,0.0
3,65.0,200.0,50.0,12.0
4,65.0,200.0,50.0,12.0
5,585.0,0.0,175.0,300.0
6,139.0,0.0,50.0,60.0
7,135.0,,50.0,0.0
8,199.0,500.0,100.0,0.0
9,120.0,500.0,75.0,0.0


In [201]:
#changing object to integers
df3[numeric_cols].apply(pd.to_numeric, errors = 'coerce') #making it into a numeric 

Unnamed: 0,price,security_deposit,cleaning_fee,extra_people
0,170.0,100.0,100.0,25.0
1,99.0,0.0,10.0,20.0
2,235.0,,100.0,0.0
3,65.0,200.0,50.0,12.0
4,65.0,200.0,50.0,12.0
...,...,...,...,...
8393,30.0,0.0,15.0,35.0
8426,99.0,750.0,60.0,0.0
8428,75.0,750.0,60.0,0.0
8436,100.0,200.0,80.0,50.0


In [202]:
df3[numeric_cols].dtypes

price               float64
security_deposit    float64
cleaning_fee        float64
extra_people        float64
dtype: object

Step 5: Checking the Security Deposit & Cleaning Fee

In [203]:
## mean of price with $0 security deposit
df3[['price']][df3['security_deposit'] == 0].mean()

price    179.402738
dtype: float64

In [204]:
## mean of price with > $0 security deposit
df3[['price']][df3['security_deposit'] != 0].mean()

price    228.860912
dtype: float64

In [205]:
## mean of price with NULL security despoit 
df3[['price']][df3['security_deposit'].isnull()].mean()

price    201.651163
dtype: float64

In [206]:
## mean of price with $0 cleaning fee
df3[['price']][df3['cleaning_fee'] == 0].mean()

price    199.472492
dtype: float64

In [207]:
## mean of price with > $0 security deposit
df3[['price']][df3['cleaning_fee'] != 0].mean()

price    216.604611
dtype: float64

In [208]:
## mean of price with NULL security despoit 
df3[['price']][df3['cleaning_fee'].isnull()].mean()

price    214.332117
dtype: float64

Security Deposit

In [209]:
## testing $0 & NULL 
ttest_ind(df3[['price']][df3['security_deposit'] == 0], df3[['price']][df3['security_deposit'].isnull()])

Ttest_indResult(statistic=array([-2.20873034]), pvalue=array([0.02727218]))

In [210]:
# testing $0 & NonZero
ttest_ind(df3[['price']][df3['security_deposit'] == 0],df3[['price']][df3['security_deposit'] != 0])

Ttest_indResult(statistic=array([-5.89242532]), pvalue=array([3.9920435e-09]))

In [211]:
# testing NonZero & NULL
ttest_ind(df3[['price']][df3['security_deposit'].isnull()],df3[['price']][df3['security_deposit'] != 0])

Ttest_indResult(statistic=array([-2.52360744]), pvalue=array([0.01164134]))

Cleaning Fee

In [212]:
## testing $0 & NULL 
ttest_ind(df3[['price']][df3['cleaning_fee'] == 0], df3[['price']][df3['cleaning_fee'].isnull()])

Ttest_indResult(statistic=array([-0.51296771]), pvalue=array([0.60810641]))

In [213]:
# testing $0 & NonZero
ttest_ind(df3[['price']][df3['cleaning_fee'] == 0],df3[['price']][df3['cleaning_fee'] != 0])

Ttest_indResult(statistic=array([-0.97285594]), pvalue=array([0.33066034]))

In [214]:
# testing NonZero & NULL
ttest_ind(df3[['price']][df3['cleaning_fee'].isnull()],df3[['price']][df3['cleaning_fee'] != 0])

Ttest_indResult(statistic=array([-0.15878713]), pvalue=array([0.87384125]))

In [215]:
df3.shape

(6642, 93)

In [216]:
df3.head()

Unnamed: 0,id,listing_url,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,958,https://www.airbnb.com/rooms/958,"Bright, Modern Garden Unit - 1BR/1B",New update: the house next door is under const...,"Newly remodeled, modern, and bright garden uni...",New update: the house next door is under const...,none,*Quiet cul de sac in friendly neighborhood *St...,Due to the fact that we have children and a do...,*Public Transportation is 1/2 block away. *Ce...,...,f,f,moderate,f,f,1,1,0,0,1.79
1,3850,https://www.airbnb.com/rooms/3850,Charming room for two,Your own private room plus access to a shared ...,This room can fit two people. Nobody else will...,Your own private room plus access to a shared ...,none,"This is a quiet, safe neighborhood on a substa...",House Rule footnotes: 1.\tI don’t allow check ...,Public transit service to my house is outstand...,...,f,f,strict_14_with_grace_period,f,f,3,0,3,0,1.38
2,5858,https://www.airbnb.com/rooms/5858,Creative Sanctuary,,We live in a large Victorian house on a quiet ...,We live in a large Victorian house on a quiet ...,none,I love how our neighborhood feels quiet but is...,All the furniture in the house was handmade so...,The train is two blocks away and you can stop ...,...,f,f,strict_14_with_grace_period,f,f,1,1,0,0,0.86
3,7918,https://www.airbnb.com/rooms/7918,A Friendly Room - UCSF/USF - San Francisco,Nice and good public transportation. 7 minute...,"Settle down, S.F. resident, student, hospital,...",Nice and good public transportation. 7 minute...,none,"Shopping old town, restaurants, McDonald, Whol...",Wi-Fi signal in common areas. Large eat in k...,N Juda Muni and bus stop. Street parking.,...,f,f,strict_14_with_grace_period,f,f,9,0,9,0,0.14
4,8142,https://www.airbnb.com/rooms/8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,Nice and good public transportation. 7 minute...,"Settle down, S.F. resident, student, hospital,...",Nice and good public transportation. 7 minute...,none,,Wi-Fi signal in common areas. Large eat in k...,"N Juda Muni, Bus and UCSF Shuttle. small shopp...",...,f,f,strict_14_with_grace_period,f,f,9,0,9,0,0.13
