# Import packages

In [1]:
# For basic data manipulation.
import pandas as pd
import numpy as np

# For regex'ing html tags out of description
import re

# For analyzing text.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from readability import Readability

# for converting strings of JSONs and lists back into their data type.
import ast

# Show all columns.
pd.set_option('display.max_columns', None)

# Read in search page webscrape.

In [2]:
search_df = pd.read_csv('scraped_listings BACKUP.csv')

In [3]:
search_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186808 entries, 0 to 186807
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0.9      186808 non-null  int64  
 1   listing_id        186808 non-null  int64  
 2   listing_url       186808 non-null  object 
 3   is_superhost      98256 non-null   object 
 4   rating            164670 non-null  object 
 5   n_reviews         150731 non-null  float64
 6   listing_city      186570 non-null  object 
 7   listing_title     186808 non-null  object 
 8   n_pictures        186808 non-null  float64
 9   room_type         186808 non-null  object 
 10  latitude          186808 non-null  float64
 11  longitude         186808 non-null  float64
 12  beds              185383 non-null  object 
 13  price             138116 non-null  object 
 14  discounted_price  48692 non-null   object 
 15  original_price    48692 non-null   object 
 16  price_qualifier   18

In [4]:
# Drop columns.
search_df.drop(columns=['Unnamed: 0.9','Unnamed: 0.8','Unnamed: 0.7','Unnamed: 0.6','Unnamed: 0.5','Unnamed: 0.4',
                        'Unnamed: 0.3','Unnamed: 0.2','Unnamed: 0.1','Unnamed: 0', 'price_qualifier'], inplace=True)
# Check info again.
search_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186808 entries, 0 to 186807
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   listing_id        186808 non-null  int64  
 1   listing_url       186808 non-null  object 
 2   is_superhost      98256 non-null   object 
 3   rating            164670 non-null  object 
 4   n_reviews         150731 non-null  float64
 5   listing_city      186570 non-null  object 
 6   listing_title     186808 non-null  object 
 7   n_pictures        186808 non-null  float64
 8   room_type         186808 non-null  object 
 9   latitude          186808 non-null  float64
 10  longitude         186808 non-null  float64
 11  beds              185383 non-null  object 
 12  price             138116 non-null  object 
 13  discounted_price  48692 non-null   object 
 14  original_price    48692 non-null   object 
 15  image_1           186808 non-null  object 
 16  image_2           18

In [5]:
# If superhost or premium listing, assume superhost. Else, False.
def superhost(x):
    if str(x) == 'Luxe' or str(x) == 'Plus' or str(x) == 'Superhost':
        return 1
    else:
        return 0

# If Plus or Luxe, then superhost. Else, False.
def premium_listing(x):
    if str(x) == 'Luxe' or str(x) == 'Plus':
        return 1
    else:
        return 0

In [6]:
# Create two variables: is_premium and is_superhost.
search_df['is_premium'] = search_df['is_superhost'].apply(premium_listing)
search_df['is_superhost'] = search_df['is_superhost'].apply(superhost)

# Sort to keep like variables by each other.
search_df = search_df[['listing_id', 'listing_url', 'is_superhost', 'is_premium', 'rating', 'n_reviews',
       'listing_city', 'listing_title', 'n_pictures', 'room_type', 'latitude','longitude', 'beds', 
       'price', 'discounted_price', 'original_price', 'image_1', 'image_2', 'image_3', 
       'image_4','image_5']]


In [7]:
def entire_home(x):
    if str(x) == 'entire_home':
        return 1
    else:
        return 0

def hotel_room(x):
    if str(x) == 'hotel_room':
        return 1
    else:
        return 0

def private_room(x):
    if str(x) == 'private_room':
        return 1
    else:
        return 0
    
def shared_room(x):
    if str(x) == 'shared_room':
        return 1
    else:
        return 0

In [8]:
search_df['entire_home'] = search_df['room_type'].apply(entire_home)
search_df['hotel_room'] = search_df['room_type'].apply(hotel_room)
search_df['private_room'] = search_df['room_type'].apply(private_room)
search_df['shared_room'] = search_df['room_type'].apply(shared_room)

search_df.drop(columns=['room_type'],inplace=True)

search_df = search_df[['listing_id', 'listing_url', 'is_superhost', 'is_premium', 'rating',
       'n_reviews', 'listing_city', 'listing_title', 'n_pictures', 'entire_home', 'hotel_room', 
       'private_room', 'shared_room','latitude', 'longitude', 'beds', 'price', 'discounted_price',
       'original_price', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5']]

In [9]:
def final_price(price, discounted_price):
    if '$' in str(price):
        return price.replace('$','').replace(',','')
    else:
        return discounted_price.replace('$','').replace(',','')
    
def discount(discounted_price, original_price):
    if '$' in str(discounted_price):
        
        original_price = original_price.replace('$','').replace(',','')
        discounted_price = discounted_price.replace('$','').replace(',','')
        
        return int(discounted_price)/int(original_price)
    else:
        return 0

In [10]:
search_df['price'] = search_df.apply(lambda x: final_price(x.price, x.discounted_price), axis=1)
search_df['perc_discount'] = search_df.apply(lambda x: discount(x.discounted_price, x.original_price), axis=1)

search_df = search_df[['listing_id', 'listing_url', 'is_superhost', 'is_premium', 'rating',
       'n_reviews', 'listing_city', 'listing_title', 'n_pictures', 'entire_home', 'hotel_room', 
       'private_room', 'shared_room','latitude', 'longitude', 'beds', 'price','perc_discount', 
        'image_1', 'image_2', 'image_3', 'image_4', 'image_5']]

search_df.head()

Unnamed: 0,listing_id,listing_url,is_superhost,is_premium,rating,n_reviews,listing_city,listing_title,n_pictures,entire_home,hotel_room,private_room,shared_room,latitude,longitude,beds,price,perc_discount,image_1,image_2,image_3,image_4,image_5
0,47924385,https://www.airbnb.com/rooms/47924385,1,0,4.86,207.0,Flagler,Home in Flagler,9.0,1,0,0,0,39.29294,-103.06429,1 king bed,77,0.0,https://a0.muscache.com/im/pictures/40a53028-5...,https://a0.muscache.com/im/pictures/66d3a0d5-b...,https://a0.muscache.com/im/pictures/a7339783-c...,https://a0.muscache.com/im/pictures/f8ea448c-d...,https://a0.muscache.com/im/pictures/1a2a5d6b-f...
1,12964075,https://www.airbnb.com/rooms/12964075,1,0,4.87,419.0,Flagler,Private room in Flagler,11.0,0,0,1,0,39.29492,-103.06372,1 queen bed,65,0.0,https://a0.muscache.com/im/pictures/b6b6703f-b...,https://a0.muscache.com/im/pictures/49b235b7-7...,https://a0.muscache.com/im/pictures/478c7fd6-7...,https://a0.muscache.com/im/pictures/1b6947f8-2...,https://a0.muscache.com/im/pictures/ce671a50-b...
2,50379619,https://www.airbnb.com/rooms/50379619,1,0,4.93,42.0,Bridgeport,Home in Bridgeport,15.0,1,0,0,0,41.665535,-103.095772,3 beds,75,0.882353,https://a0.muscache.com/im/pictures/9ee93475-6...,https://a0.muscache.com/im/pictures/b18ca7d5-0...,https://a0.muscache.com/im/pictures/ecc78c28-b...,https://a0.muscache.com/im/pictures/05172838-6...,https://a0.muscache.com/im/pictures/f30c7acb-2...
3,710231964358460529,https://www.airbnb.com/rooms/710231964358460529,0,0,,,Bridgeport,Private room in Bridgeport,5.0,0,0,1,0,41.66877,-103.10166,2 beds,40,0.0,https://a0.muscache.com/im/pictures/7a8d2d04-6...,https://a0.muscache.com/im/pictures/42c9020c-7...,https://a0.muscache.com/im/pictures/52bd72d4-d...,https://a0.muscache.com/im/pictures/35c9f58a-d...,https://a0.muscache.com/im/pictures/d39ad521-3...
4,723852070242986749,https://www.airbnb.com/rooms/723852070242986749,0,0,,,Bridgeport,Private room in Bridgeport,7.0,0,0,1,0,41.67005,-103.10162,2 beds,43,0.0,https://a0.muscache.com/im/pictures/miso/Hosti...,https://a0.muscache.com/im/pictures/miso/Hosti...,https://a0.muscache.com/im/pictures/miso/Hosti...,https://a0.muscache.com/im/pictures/58f6e95a-6...,https://a0.muscache.com/im/pictures/miso/Hosti...


In [11]:
search_df.to_csv('CLEANED scraped_listings BACKUP.csv')

# Read in Listings page webscrape.

In [12]:
listings_df = pd.read_csv('scraped_pages BACKUP.csv')
listings_df.drop(columns=['Unnamed: 0'], inplace=True)

  listings_df = pd.read_csv('scraped_pages BACKUP.csv')


# Extract the # of Amenities

In [13]:
def n_amenities(x):
    try:
        x = str(x).replace('amenity details','')
    except:
        pass
    if x != None or str(x) != 'amenity details':
        return str(x).replace('Show all ','').replace(' amenities','')
    else:
        return None

In [14]:
listings_df['n_amenities'] = listings_df['n_amenities'].apply(n_amenities)

# Extract the number of each type of amenity.

In [15]:
def amenity_type_count(x, value):
    try:
        dict_ = ast.literal_eval(x)
        if dict_[value]:
            return dict_[value]
        else:
            return 0
    except:
        return 0
    
listings_df['amenities_services'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Services'), axis=1)
listings_df['amenities_heat_cool'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Heating and cooling'), axis=1)
listings_df['amenities_bedroom_laundry'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Bedroom and laundry'), axis=1)
listings_df['amenities_kitchen_dining'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Kitchen and dining'), axis=1)
listings_df['amenities_safety'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Home safety'), axis=1)
listings_df['amenities_parking_facilities'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Parking and facilities'), axis=1)
listings_df['amenities_internet_office'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Internet and office'), axis=1)
listings_df['amenities_not_included'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Not included'), axis=1)
listings_df['amenities_entertainment'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Entertainment'), axis=1)
listings_df['amenities_bathroom'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Bathroom'), axis=1)
listings_df['amenities_outdoor'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Outdoor'), axis=1)
listings_df['amenities_location_features'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Location features'), axis=1)
listings_df['amenities_family'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Family'), axis=1)
listings_df['amenities_scenic_views'] = listings_df.apply(lambda x: amenity_type_count(x.amenities, 'Scenic views'), axis=1)

In [16]:
listings_df.drop(columns=['amenities'],inplace=True)

# Extract the number of each type of bed.

In [17]:
def bed_type_count(x, key_words):
    
    count = 0
    try:
        sleeping_arrangements = ast.literal_eval(x)
        for arrangement in sleeping_arrangements:
                beds = arrangement[1].replace(' bed','').split(', ')
                beds = [bed.split(' ') for bed in beds]

                for bed in beds:
                    bed_count = int(bed[0])
                    bed_type = bed[1]
                    if bed_type in key_words:
                        count += bed_count
        return count
    except:
        return None            
    

In [18]:
listings_df['bed_king'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['king','kings']), axis=1)
listings_df['bed_queen'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['queen', 'queens']), axis=1)
listings_df['bed_double'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['double', 'doubles'] ), axis=1)
listings_df['bed_single'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['single', 'singles', 'small']), axis=1)
listings_df['bed_sofa'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['sofa', 'sofas', 'couch', 'couches', 'futon', 'futons', 'trundle']), axis=1)
listings_df['bed_air'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['air']), axis=1)
listings_df['bed_floor'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['floor']), axis=1)
listings_df['bed_bunkbed'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['bunks', 'bunk']), axis=1)
listings_df['bed_hammock'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['hammock', 'hammocks']), axis=1)
listings_df['bed_baby'] = listings_df.apply(lambda x: bed_type_count(x.sleeping, ['crib', 'cribs', 'toddler', 'toddlers']), axis=1)


In [19]:
listings_df.drop(columns=['sleeping'],inplace=True)

# Extract insights about the description.

* Using `vaderSentiment`, extract the sentiment of the description. A positive sentiment is 1.00, and negative sentiment is -1.00.
* Fetch the length of the description.
* Using `readability`, extract the reading level of the description (in Flesch-Kincaid scores, returns grade level number).

In [20]:
def remove_html_tags(text):
    try:
        clean = re.compile('<.*?>')
        return re.sub(clean, '', text)
    except:
        return text
    
def description_sentiment_score(x):
    
    try:
        sentiment_analyzer = SentimentIntensityAnalyzer()
        sentiment_dict = sentiment_analyzer.polarity_scores(x)
        score = sentiment_dict['compound']
        return score
    except:
        return None

def description_length(x):
    
    try:
        length = len(x.split(' '))
        return length
    except:
        return None

def description_reading_level(x):
    try:
        r = Readability(x)
        level = r.flesch_kincaid().grade_level
        return level
    except:
        return None


In [21]:
listings_df['description'] = listings_df['description'].apply(remove_html_tags)
listings_df['description_sentiment_score'] = listings_df['description'].apply(description_sentiment_score)
listings_df['description_length'] = listings_df['description'].apply(description_length)
listings_df['description_reading_level'] = listings_df['description'].apply(description_reading_level)

In [22]:
listings_df.head()

Unnamed: 0,listing_id,n_guests,n_bedrooms,n_beds,n_baths,n_amenities,rating_cleanliness,rating_communication,rating_checkin,rating_accuracy,rating_location,rating_value,description,amenities_services,amenities_heat_cool,amenities_bedroom_laundry,amenities_kitchen_dining,amenities_safety,amenities_parking_facilities,amenities_internet_office,amenities_not_included,amenities_entertainment,amenities_bathroom,amenities_outdoor,amenities_location_features,amenities_family,amenities_scenic_views,bed_king,bed_queen,bed_double,bed_single,bed_sofa,bed_air,bed_floor,bed_bunkbed,bed_hammock,bed_baby,description_sentiment_score,description_length,description_reading_level
0,39368050.0,2.0,1.0,1.0,1.0,24.0,3.8,5.0,5.0,4.2,5.0,4.2,Relax and Comfortable Room,2,2,5,4,4,1,2,1,1,2,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7351,4.0,
1,5.535775e+17,8.0,4.0,4.0,2.5,60.0,4.8,5.0,5.0,4.9,4.9,4.7,Your family will be close to everything when y...,4,4,9,15,5,3,2,2,1,7,4,0,6,0,,,,,,,,,,,0.7269,83.0,
2,6.711613e+17,4.0,1.0,2.0,1.0,54.0,5.0,5.0,5.0,5.0,5.0,5.0,"Nestled on the Kiley Ranch trail, relax in a b...",3,2,8,15,5,3,2,0,2,7,2,2,1,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.8959,141.0,9.0
3,5.834106e+17,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,
4,5.82654e+17,5.0,2.0,2.0,1.0,45.0,4.5,4.8,4.9,4.8,4.8,4.7,Enjoy your stay and explore the wonderful city...,2,2,6,16,2,3,2,1,1,6,4,1,0,0,,,,,,,,,,,0.9819,74.0,


In [23]:
listings_df.to_csv('CLEANED scraped_pages BACKUP.csv')

# Appendix

In [24]:
# Used to discover the types of beds during the exploration phase.
"""
list_of_tuples = []

for i, row in listings_df[1:].iterrows():
    
    try:
        sleeping_rooms = ast.literal_eval(row['sleeping'])
        for room in sleeping_rooms:
            beds = room[1].replace(' bed','').split(', ')
            beds = [bed.split(' ') for bed in beds]
            for bed in beds:
                bed_count = int(bed[0])
                bed_type = bed[1]
                try:
                    list_of_tuples.append((bed_type, int(bed_count)))
                except Exception as e:
                    print(e)
                    
    except:
        pass

dict_ = {}

for bed in list_of_tuples:
    bed_type, count = bed
    if bed_type in dict_.keys():
        dict_[bed_type] += count
    else:
        dict_[bed_type] = count
            
dict_
"""

"\nlist_of_tuples = []\n\nfor i, row in listings_df[1:].iterrows():\n    \n    try:\n        sleeping_rooms = ast.literal_eval(row['sleeping'])\n        for room in sleeping_rooms:\n            beds = room[1].replace(' bed','').split(', ')\n            beds = [bed.split(' ') for bed in beds]\n            for bed in beds:\n                bed_count = int(bed[0])\n                bed_type = bed[1]\n                try:\n                    list_of_tuples.append((bed_type, int(bed_count)))\n                except Exception as e:\n                    print(e)\n                    \n    except:\n        pass\n\ndict_ = {}\n\nfor bed in list_of_tuples:\n    bed_type, count = bed\n    if bed_type in dict_.keys():\n        dict_[bed_type] += count\n    else:\n        dict_[bed_type] = count\n            \ndict_\n"