# Part 4 - Dimensionality Reduction
To ready the data for analysis, we will reduce the number of fields to a more manageable number before applying data science techniques to it. The technique we will use is Principal Component Analysis (PCA), applied to groups of fields that fall into similar categories.

In [186]:
%matplotlib inline
import pandas as pd
import numpy as np

# Data sourced from Inside Airbnb (http://insideairbnb.com/get-the-data.html)
# A million thanks to Murray Cox
# Load the listings table that every field group below is derived from.
abnb_listings = pd.read_csv("airbnb-sf-data/listings2.csv")

## Full list of available data files (only listings2.csv is loaded in this part)
# calendar = pd.read_csv("airbnb-sf-data/calendar.csv")
# calendar2 = pd.read_csv("airbnb-sf-data/calendar2.csv")
# listings = pd.read_csv("airbnb-sf-data/listings.csv")
# listings2 = pd.read_csv("airbnb-sf-data/listings2.csv")
# nhoods = pd.read_csv("airbnb-sf-data/neighbourhoods.csv")
# reviews = pd.read_csv("airbnb-sf-data/reviews.csv")
# reviews2 = pd.read_csv("airbnb-sf-data/reviews2.csv")

The objective of this section is to reduce the listings data to the following eight groups of fields:
1. ID, Lat and Long
2. Location
3. Space
4. Price
5. Description
6. Host
7. Rating
8. Reviews Per Month

In [187]:
# Show every column name available in the raw listings table.
abnb_listings.columns.tolist()

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',


## 1. ID, Lat, and Long

In [188]:
# The listing id and raw coordinates are carried through unchanged so the
# reduced rows can still be identified and placed on a map.
listingid = abnb_listings.loc[:, 'id']
latlong = abnb_listings.loc[:, ['latitude', 'longitude']]

## 2. Location

In [189]:
# Columns that feed the single "location" component.
locationfields = [
    'latitude',
    'longitude',
    'neighbourhood_cleansed',
]

## Other location-related fields, and why each one is omitted:
# 'street': proxied by zipcode
# 'zipcode': too many levels
# 'neighbourhood': proxied by neighbourhood_cleansed
# 'neighbourhood_group_cleansed': is NaN
# 'city', 'state': equivalent to smart_location
# 'market': equivalent to city
# 'country': equivalent to country_code
# 'is_location_exact': omitted
# 'smart_location': provisions for multi-city sets, same for entire dataset
# 'country_code': provisions for multi-country sets, same for entire dataset

# Take an explicit copy: this frame has a column overwritten later, and
# writing into a bare slice of abnb_listings raises SettingWithCopyWarning.
location = abnb_listings[locationfields].copy()

In [190]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Turn neighbourhood names into integer codes so PCA can consume them.
# astype('str') makes missing names encode as the literal string 'nan'.
numerify = LabelEncoder()
neighbourhood_codes = numerify.fit_transform(
    location['neighbourhood_cleansed'].astype('str'))

# assign() returns a new frame rather than writing into the slice taken from
# abnb_listings, which is what triggered SettingWithCopyWarning.
location_features = location.assign(neighbourhood_cleansed=neighbourhood_codes)

# Collapse latitude, longitude and the neighbourhood code into the first
# principal component. `pca` stays a module-level name because later cells
# reuse it.
# NOTE(review): the inputs are not standardised, so the component is likely
# dominated by the integer neighbourhood code rather than the coordinates --
# confirm this is intended.
pca = PCA(n_components=1)
location = pd.DataFrame(
    {"location": pca.fit_transform(location_features)[:, 0]})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## 3. Space

In [191]:
# Columns describing the physical space on offer.
spacefields = [
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
]

# Ordinal encodings for the categorical columns (larger = more space/comfort).
room_type_rank = {
    "Shared room": 0,
    "Private room": 1,
    "Entire home/apt": 2,
}
property_type_rank = {
    "Other": 0,
    "Dorm": 1,
    "Bed & Breakfast": 1,
    "Cabin": 1,
    "Camper/RV": 1,
    "Boat": 1,
    "Yurt": 1,
    "Tent": 1,
    "Treehouse": 1,
    "Plane": 1,
    "Apartment": 2,
    "Condominium": 2,
    "Loft": 2,
    "Townhouse": 2,
    "House": 2,
    "Bungalow": 3,
    "Villa": 3,
    "Island": 4,
    "Castle": 4,
}
bed_type_rank = {
    "Couch": 0,
    "Airbed": 1,
    "Futon": 2,
    "Pull-out Sofa": 2,
    "Real Bed": 3,
}

# Work on an explicit copy: the columns below are overwritten, and doing that
# on a bare slice of abnb_listings raises SettingWithCopyWarning.
space = abnb_listings[spacefields].copy()

# 1 when the amenities text mentions wireless internet, else 0.
# na=False makes a missing amenities value count as 0 instead of raising.
space['wireless'] = abnb_listings['amenities'].str.contains(
    "Wireless Internet", regex=False, na=False).astype(int)

space['room_type'] = space['room_type'].map(room_type_rank)
space['property_type'] = space['property_type'].map(property_type_rank)
space['bed_type'] = space['bed_type'].map(bed_type_rank)

# Categories absent from the mappings come back as NaN from map(); together
# with genuinely missing numeric values they are treated as 0.
space = space.fillna(0)

# Reuses the single-component `pca` estimator created in the Location cell
# and keeps only the first (and only) principal component.
space = pd.DataFrame({"space": pca.fit_transform(space)[:, 0]})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## 4. Price

In [239]:
# Monetary columns that go into the combined price figure.
# 'weekly_price' and 'monthly_price' are left out: too many NaNs.
pricefields = [
    'price',
    'security_deposit',
    'cleaning_fee',
    'extra_people',
]

# Parse currency strings such as "$1,200.00" into floats:
#   - a missing amount is treated as "$0"
#   - '$', ',' and ')' are stripped
#   - '(' becomes '-', so a parenthesised amount parses as negative
# Every step returns a new frame, so nothing is written into a slice of
# abnb_listings (the cause of SettingWithCopyWarning), and the raw-string
# patterns avoid the invalid '\$' escape sequence.
price = (
    abnb_listings[pricefields]
    .fillna("$0")
    .replace(r'[$,)]', '', regex=True)
    .replace(r'[(]', '-', regex=True)
    .astype(float)
)

# PCA is deliberately not used here; the nightly price is instead adjusted
# by a weighted share of the add-on costs.
# NOTE(review): the 1/5 and 1/3 weights are heuristic; their rationale is not
# recorded in this notebook.
price = (
    price['price']
    + (price['security_deposit'] / 5)
    + (price['cleaning_fee'] / 3)
    + (price['extra_people'] / 3)
).to_frame("price")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## 5. Description

In [242]:
# Free-text columns merged into one searchable description per listing.
descfields = [
    'name',
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
]

# Missing text becomes an empty string so concatenation never yields NaN.
text_columns = abnb_listings[descfields].fillna("")

# Join the columns left to right, separated by single spaces.
combined_text = text_columns[descfields[0]]
for text_field in descfields[1:]:
    combined_text = combined_text + " " + text_columns[text_field]

description = combined_text.to_frame("description")

## 6. Host

In [264]:
# Only the two boolean host flags are used. The remaining host_* columns
# (host_name, host_since, host_location, host_about, host_response_time,
# host_response_rate, host_acceptance_rate, host_neighbourhood,
# host_listings_count, host_total_listings_count, host_verifications,
# host_has_profile_pic) are left out.
hostfields = [
    'host_is_superhost',
    'host_identity_verified',
]

# The source encodes booleans as "t"/"f"; any other value maps to NaN.
flag_to_int = {"f": 0, "t": 1}

# Map into a new frame instead of assigning onto a slice of abnb_listings,
# which would be a chained assignment (SettingWithCopyWarning).
host_flags = abnb_listings[hostfields].apply(
    lambda column: column.map(flag_to_int))

# Host score: 3 points for superhost status plus 1 for a verified identity,
# giving 0, 1, 3 or 4. A missing flag leaves the score as NaN.
host = (
    host_flags['host_is_superhost'] * 3
    + host_flags['host_identity_verified']
).to_frame("host")

## 7. Rating

In [305]:
# Only the overall score is kept -- no need to overcomplicate things.
# The sub-scores (accuracy, cleanliness, checkin, communication, location,
# value) are left out, and no scaling or PCA is applied to a single column.
ratingfields = ['review_scores_rating']

raw_rating = abnb_listings.loc[:, ratingfields]

# Listings without a score receive the mean score of the rated listings.
mean_rating = raw_rating.mean()
rating = raw_rating.fillna(value=mean_rating)

In [306]:
# Preview the first ten ratings after mean-imputation.
rating.head(n=10)

Unnamed: 0,review_scores_rating
0,89.0
1,93.777344
2,92.0
3,93.777344
4,100.0
5,94.0
6,92.0
7,100.0
8,94.0
9,97.0


## 8. Reviews Per Month

In [308]:
# Carried over as-is in a one-column frame; missing values are kept.
reviews_per_month = abnb_listings["reviews_per_month"].to_frame()

In [310]:
# Missing values are deliberately not filled: after the merge, rows that
# have no reviews_per_month are removed instead.
reviews_per_month.head(n=5)

Unnamed: 0,reviews_per_month
0,0.28
1,
2,2.33
3,
4,0.36


# Concatenate Dataset

In [318]:
# Pieces of the reduced dataset, in output column order.
frames = [
    listingid,
    latlong,
    location,
    description,
    space,
    price,
    host,
    rating,
    reviews_per_month,
]

# Column-wise concatenation aligns rows on the index.
# NOTE(review): assumes every piece shares the same default row index as
# abnb_listings -- confirm if the load step ever sets a custom index.
abnb_listings_redcd = pd.concat(frames, axis="columns")

In [319]:
# Preview the first ten rows of the reduced dataset.
abnb_listings_redcd.head(n=10)

Unnamed: 0,id,latitude,longitude,location,description,space,price,host,review_scores_rating,reviews_per_month
0,1162609,37.785217,-122.488655,11.875956,Lovely One Bedroom Apartment This is a big wel...,-1.319244,386.666667,1,89.0,0.28
1,6032828,37.783658,-122.489398,11.875956,Historic Seacliff Home Beautiful charmng multi...,3.393851,300.0,0,93.777344,
2,6938818,37.781505,-122.504754,11.875956,Best Secret in Town Convenience is the key for...,-1.416978,142.333333,1,92.0,2.33
3,8087607,37.775318,-122.511621,11.875954,Single Room Beautiful Beach Condo When I trave...,-1.404158,79.0,1,93.777344,
4,4781448,37.781797,-122.492492,11.875955,3 Bd 2.5 Ba Full Flat Condo w Views Our family...,5.781405,940.0,1,100.0,0.36
5,1931937,37.781564,-122.494424,11.875955,Blocks from GG Park & Ocean Beach Large 2BD ap...,-1.404158,136.666667,1,94.0,1.13
6,2134100,37.783888,-122.508557,11.875958,Ocean Beach hilltop cottage Newly built cottag...,-1.582193,178.333333,1,92.0,6.09
7,4688930,37.782034,-122.494214,11.875955,Modern remodel + period charm A view of the Go...,-0.484353,241.666667,1,100.0,0.18
8,2316478,37.779118,-122.513065,11.875956,2+BR Beach Cottage-w/Parking+Views Experience ...,4.411968,348.333333,1,94.0,4.95
9,3168359,37.779319,-122.509635,11.875956,"Charming & Private Ideal location Spacious, qu...",1.268373,300.666667,1,97.0,3.94


# Write reduced dataset to a CSV file

In [320]:
# Keep only rows with no missing values, then write them out. The row index
# is written too, as the first column of the file.
complete_rows = abnb_listings_redcd.dropna()
complete_rows.to_csv("abnb_listings_rdcd.csv")