### Read the data set

In [38]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
#read the data set

df = pd.read_csv('Dataset/Airbnb_Data.csv')

#display the first 5 rows of the data set
print(df.head())

#display the last 5 rows of the data set
print(df.tail())

#display the shape of the data set
print(df.shape)

#display the columns of the data set
print(df.columns)

#display the data types of the columns
print(df.dtypes)

#display the info of the data set
print(df.info())

#display the summary statistics of the data set
print(df.describe())



         id  log_price property_type        room_type  \
0   6901257   5.010635     Apartment  Entire home/apt   
1   6304928   5.129899     Apartment  Entire home/apt   
2   7919400   4.976734     Apartment  Entire home/apt   
3  13418779   6.620073         House  Entire home/apt   
4   3808709   4.744932     Apartment  Entire home/apt   

                                           amenities  accommodates  bathrooms  \
0  {"Wireless Internet","Air conditioning",Kitche...             3        1.0   
1  {"Wireless Internet","Air conditioning",Kitche...             7        1.0   
2  {TV,"Cable TV","Wireless Internet","Air condit...             5        1.0   
3  {TV,"Cable TV",Internet,"Wireless Internet",Ki...             4        1.0   
4  {TV,Internet,"Wireless Internet","Air conditio...             2        1.0   

   bed_type cancellation_policy  cleaning_fee  ...   latitude   longitude  \
0  Real Bed              strict          True  ...  40.696524  -73.991617   
1  Real Bed     

In [40]:
# for each row, we need to put the ammenities in a list 

df['amenities'] = df['amenities'].apply(lambda x: x.replace('{','').replace('}','').replace('"','').split(','))

#display the first 5 rows of the data set
print(df.head())




         id  log_price property_type        room_type  \
0   6901257   5.010635     Apartment  Entire home/apt   
1   6304928   5.129899     Apartment  Entire home/apt   
2   7919400   4.976734     Apartment  Entire home/apt   
3  13418779   6.620073         House  Entire home/apt   
4   3808709   4.744932     Apartment  Entire home/apt   

                                           amenities  accommodates  bathrooms  \
0  [Wireless Internet, Air conditioning, Kitchen,...             3        1.0   
1  [Wireless Internet, Air conditioning, Kitchen,...             7        1.0   
2  [TV, Cable TV, Wireless Internet, Air conditio...             5        1.0   
3  [TV, Cable TV, Internet, Wireless Internet, Ki...             4        1.0   
4  [TV, Internet, Wireless Internet, Air conditio...             2        1.0   

   bed_type cancellation_policy  cleaning_fee  ...   latitude   longitude  \
0  Real Bed              strict          True  ...  40.696524  -73.991617   
1  Real Bed     

In [41]:
#replace the log price column with the price column

df['log_price'] = df['log_price'].apply(lambda x: np.exp(x))

#rename the log price column to price

df.rename(columns={'log_price':'price'}, inplace=True)

#display the first 5 rows of the data set
print(df.head())


         id  price property_type        room_type  \
0   6901257  150.0     Apartment  Entire home/apt   
1   6304928  169.0     Apartment  Entire home/apt   
2   7919400  145.0     Apartment  Entire home/apt   
3  13418779  750.0         House  Entire home/apt   
4   3808709  115.0     Apartment  Entire home/apt   

                                           amenities  accommodates  bathrooms  \
0  [Wireless Internet, Air conditioning, Kitchen,...             3        1.0   
1  [Wireless Internet, Air conditioning, Kitchen,...             7        1.0   
2  [TV, Cable TV, Wireless Internet, Air conditio...             5        1.0   
3  [TV, Cable TV, Internet, Wireless Internet, Ki...             4        1.0   
4  [TV, Internet, Wireless Internet, Air conditio...             2        1.0   

   bed_type cancellation_policy  cleaning_fee  ...   latitude   longitude  \
0  Real Bed              strict          True  ...  40.696524  -73.991617   
1  Real Bed              strict         

In [42]:
# get a set of all unique amenities
amenities = set()
for index, row in df.iterrows():
    for amenity in row['amenities']:
        amenities.add(amenity)

#display the unique amenities
print(amenities)


{'', 'Doorman Entry', 'Keypad', 'Dryer', 'translation missing: en.hosting_amenity_49', 'Wide hallway clearance', 'Private entrance', 'Wide entryway', 'Pets allowed', 'Hair dryer', 'Crib', 'Toilet paper', 'Stove', 'Bathtub with shower chair', 'Table corner guards', 'Luggage dropoff allowed', 'Firm matress', 'Dishes and silverware', 'Children’s books and toys', 'Essentials', 'Free parking on premises', 'Safety card', 'Doorman', '24-hour check-in', 'BBQ grill', 'Laptop friendly workspace', 'Ground floor access', 'Beachfront', 'Oven', 'Pets live on this property', 'Smoke detector', 'Carbon monoxide detector', 'Lake access', 'Microwave', 'Window guards', 'Wide clearance to shower and toilet', 'Internet', 'Smart lock', 'Wide doorway', 'High chair', 'Pack ’n Play/travel crib', 'Lock on bedroom door', 'Accessible-height bed', 'Indoor fireplace', 'Bed linens', 'Cable TV', 'Fireplace guards', 'Flat smooth pathway to front door', 'Room-darkening shades', 'Air conditioning', 'Wide clearance to bed

In [43]:
amenities.remove('')
print(len(amenities), 'amenities length')

130 amenities length


In [44]:
#get unique intger value of the prices in the data set

prices = set()
for index, row in df.iterrows():
    prices.add(int(row['price']))

#display the unique prices
# 6901257  150.0     Apartment  Entire home/apt   
# 1   6304928  169.0     Apartment  Entire home/apt   
# 2   7919400  145.0     Apartment  Entire home/apt   
# 3  13418779  750.0         House  Entire home/apt   
# 4   3808709  115.0     Apartment  Entire home/apt 


In [45]:
print(prices)

#get the number of unique prices
print(len(prices), 'prices length')


{1, 5, 10, 11, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 34, 36, 37, 38, 39, 41, 42, 44, 45, 46, 48, 50, 51, 52, 53, 54, 55, 56, 57, 59, 61, 62, 64, 65, 67, 68, 69, 70, 71, 72, 74, 76, 78, 79, 81, 82, 83, 85, 87, 88, 89, 90, 92, 94, 95, 97, 98, 100, 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 113, 114, 115, 117, 118, 119, 121, 122, 124, 125, 126, 127, 128, 129, 130, 132, 133, 135, 136, 137, 138, 139, 141, 142, 144, 146, 147, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 166, 167, 169, 171, 172, 173, 175, 177, 179, 181, 182, 184, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 209, 211, 213, 214, 216, 217, 219, 220, 221, 223, 224, 225, 227, 228, 229, 230, 231, 232, 234, 235, 236, 237, 239, 241, 242, 244, 245, 247, 248, 249, 251, 252, 253, 254, 255, 257, 258, 260, 262, 263, 264, 266, 267, 268, 270, 271, 272, 273, 274, 275, 277, 278, 279, 280, 281, 282, 284, 285, 286, 288, 289, 291, 

In [46]:
#we have 640 unique prices, with minimum price of 1 USD and maximum price of 2000USD
#we can categorize the prices into 13 categories
#0-50, 51-100, 101-150, 151-200, 201-250, 251-300, 301-350, 351-400, 401-450, 451-500, 501-550, 551-600, 601-650
#650-700, 701-750, 751-800, 801-850, 851-900, 901-950, 951-1000, 1001-1050, 1051-1100, 1101-1150, 1151-1200, 1201-1250
#1251-1300, 1301-1350, 1351-1400, 1401-1450, 1451-1500, 1501-1550, 1551-1600, 1601-1650, 1651-1700, 1701-1750, 1751-1800
#1801-1850, 1851-1900, 1901-1950, 1951-2000

stringo = '''0-50, 51-100, 101-150, 151-200, 201-250, 251-300, 301-350, 351-400, 401-450, 451-500, 501-550, 551-600, 601-650, 650-700, 701-750, 751-800, 801-850, 851-900, 901-950, 951-1000, 1001-1050, 1051-1100, 1101-1150, 1151-1200, 1201-1250, 1251-1300, 1301-1350, 1351-1400, 1401-1450, 1451-1500, 1501-1550, 1551-1600, 1601-1650, 1651-1700, 1701-1750, 1751-1800, 1801-1850, 1851-1900, 1901-1950, 1951-2000'''

prices = stringo.split(',')
print(len(prices), 'prices length')
print(prices)


40 prices length
['0-50', ' 51-100', ' 101-150', ' 151-200', ' 201-250', ' 251-300', ' 301-350', ' 351-400', ' 401-450', ' 451-500', ' 501-550', ' 551-600', ' 601-650', ' 650-700', ' 701-750', ' 751-800', ' 801-850', ' 851-900', ' 901-950', ' 951-1000', ' 1001-1050', ' 1051-1100', ' 1101-1150', ' 1151-1200', ' 1201-1250', ' 1251-1300', ' 1301-1350', ' 1351-1400', ' 1401-1450', ' 1451-1500', ' 1501-1550', ' 1551-1600', ' 1601-1650', ' 1651-1700', ' 1701-1750', ' 1751-1800', ' 1801-1850', ' 1851-1900', ' 1901-1950', ' 1951-2000']


In [47]:
#change the prices to integers where the first value is the lower bound and the second value is the upper bound of the 40 categories
prices = [(int(price.split('-')[0]), int(price.split('-')[1])) for price in prices]
print(prices)



[(0, 50), (51, 100), (101, 150), (151, 200), (201, 250), (251, 300), (301, 350), (351, 400), (401, 450), (451, 500), (501, 550), (551, 600), (601, 650), (650, 700), (701, 750), (751, 800), (801, 850), (851, 900), (901, 950), (951, 1000), (1001, 1050), (1051, 1100), (1101, 1150), (1151, 1200), (1201, 1250), (1251, 1300), (1301, 1350), (1351, 1400), (1401, 1450), (1451, 1500), (1501, 1550), (1551, 1600), (1601, 1650), (1651, 1700), (1701, 1750), (1751, 1800), (1801, 1850), (1851, 1900), (1901, 1950), (1951, 2000)]


IndexError: list index out of range

In [50]:
#add a new column to the data set called price_category
#the price category will be the category of the price of the listing

df['price_category'] = df['price'].apply(lambda x: [i for i in range(len(prices)) if prices[i][0] <= x <= prices[i][1]])


print(df['price_category'])

KeyboardInterrupt: 