Here we will try to identify 3 most frequent and 3 least frequent characteristics of the properties rented. This is really interesting in order to get the understanding of the nature of rented properties.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Let's read the listings file and look at the columns to find out what column contains the
# list of charachteristics of the property.
df = pd.read_csv('archive/listings.csv')
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', '

We can see that the column 'amenities' is probably what do we need.

In [3]:
# Below I create the list of features presented in all records of the listing dataframe.
# In order to avoid duplicate records I'll made this in for set.
set_of_features = set()
for i in range(0, df.shape[0]):
    for j in range(0, len(df.amenities[i][1:-1].split(','))):
        set_of_features.add(df.amenities[i][1:-1].split(',')[j])

In [4]:
# Here is our set:
set_of_features

{'',
 '"24-Hour Check-in"',
 '"Air Conditioning"',
 '"Buzzer/Wireless Intercom"',
 '"Cable TV"',
 '"Carbon Monoxide Detector"',
 '"Elevator in Building"',
 '"Family/Kid Friendly"',
 '"Fire Extinguisher"',
 '"First Aid Kit"',
 '"Free Parking on Premises"',
 '"Hair Dryer"',
 '"Hot Tub"',
 '"Indoor Fireplace"',
 '"Laptop Friendly Workspace"',
 '"Lock on Bedroom Door"',
 '"Other pet(s)"',
 '"Pets Allowed"',
 '"Pets live on this property"',
 '"Safety Card"',
 '"Smoke Detector"',
 '"Smoking Allowed"',
 '"Suitable for Events"',
 '"Washer / Dryer"',
 '"Wheelchair Accessible"',
 '"Wireless Internet"',
 'Breakfast',
 'Cat(s)',
 'Dog(s)',
 'Doorman',
 'Dryer',
 'Essentials',
 'Gym',
 'Hangers',
 'Heating',
 'Internet',
 'Iron',
 'Kitchen',
 'Pool',
 'Shampoo',
 'TV',
 'Washer'}

In [5]:
# Now let's count how many time each feature from the set is mentioned in the property list.
# We will record it in the form of dictionary.
counter = {}
for i in range(0, df.shape[0]):
    for j in range(0, len(df.amenities[i][1:-1].split(','))):
        if df.amenities[i][1:-1].split(',')[j] in counter:
            counter[df.amenities[i][1:-1].split(',')[j]] += 1
        else:
            counter[df.amenities[i][1:-1].split(',')[j]] = 1

In [6]:
# Here is our counter 
counter

{'TV': 2574,
 '"Cable TV"': 1446,
 'Internet': 2811,
 '"Wireless Internet"': 3667,
 '"Air Conditioning"': 677,
 'Kitchen': 3423,
 'Heating': 3627,
 '"Family/Kid Friendly"': 1963,
 'Washer': 2992,
 'Dryer': 2997,
 '"Free Parking on Premises"': 2167,
 '"Buzzer/Wireless Intercom"': 538,
 '"Smoke Detector"': 3281,
 '"Carbon Monoxide Detector"': 2485,
 '"First Aid Kit"': 1680,
 '"Safety Card"': 727,
 '"Fire Extinguisher"': 2196,
 'Essentials': 3237,
 '"Pets Allowed"': 472,
 '"Pets live on this property"': 883,
 'Dog(s)': 509,
 'Cat(s)': 382,
 '"Hot Tub"': 303,
 '"Indoor Fireplace"': 886,
 'Shampoo': 2670,
 'Breakfast': 291,
 '"24-Hour Check-in"': 616,
 'Hangers': 846,
 '"Hair Dryer"': 774,
 'Iron': 742,
 '"Laptop Friendly Workspace"': 745,
 '"Suitable for Events"': 209,
 '"Elevator in Building"': 785,
 '"Lock on Bedroom Door"': 100,
 '"Wheelchair Accessible"': 300,
 'Gym': 442,
 '': 45,
 'Pool': 159,
 '"Smoking Allowed"': 82,
 '"Other pet(s)"': 51,
 'Doorman': 85,
 '"Washer / Dryer"': 2}

In [7]:
# In order to obtain most and least frequent features we need to sort the dictionary by values.
# So the most frequent features are:
list(dict(sorted(counter.items(), key=lambda item: item[1], reverse=True)))[0:3]

['"Wireless Internet"', 'Heating', 'Kitchen']

In [8]:
# And the most rare feature are (4 items since one of them is empty record):
list(dict(sorted(counter.items(), key=lambda item: item[1], reverse=True)))[-4:]

['"Smoking Allowed"', '"Other pet(s)"', '', '"Washer / Dryer"']