### Read the data set

In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from frozenlist import FrozenList


In [None]:
# Load the raw data set from disk.
df = pd.read_csv('Dataset/Airbnb_Data.csv')

# Preview the first and the last 5 rows.
print(df.head())
print(df.tail())

# Dimensions of the data set as (rows, columns).
print(df.shape)

# Column names, followed by the dtype of every column.
print(df.columns)
print(df.dtypes)

# DataFrame.info() writes its report to stdout on its own and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
df.info()

# Summary statistics of the data set.
print(df.describe())



In [None]:
# Turn the raw amenities text of every row into a list of amenity names:
# braces and double quotes are deleted, then the text is split on commas.
_AMENITY_PUNCTUATION = str.maketrans('', '', '{}"')


def _split_amenities(raw):
    """Return the comma-separated parts of `raw` with '{', '}' and '"' removed."""
    return raw.translate(_AMENITY_PUNCTUATION).split(',')


df['amenities'] = df['amenities'].map(_split_amenities)

# Preview the first 5 rows with the parsed column.
print(df.head())




In [None]:
# Undo the logarithm: price = exp(log_price). np.exp is applied to the whole
# column at once instead of calling a Python lambda once per row.
df['log_price'] = np.exp(df['log_price'])

# The column now holds plain prices, so rename it accordingly.
df.rename(columns={'log_price': 'price'}, inplace=True)

# Export the converted data set to a new csv file (without the index column).
df.to_csv('Dataset/Airbnb_Data1.csv', index=False)

# Preview the first 5 rows of the converted data set.
print(df.head())

# Independent copy used by the "Preprocessing" section further down, where
# rows with a price greater than 500 are removed.
newdf = df.copy()


In [None]:
# Summary statistics of the price column
price_stats = df['price'].describe()
print(price_stats)

In [None]:
# Collect every distinct amenity name across all listings.
# Only the 'amenities' column is needed, so iterate over it directly rather
# than materialising every full row with DataFrame.iterrows().
amenities = {
    amenity
    for listing_amenities in df['amenities']
    for amenity in listing_amenities
}

# Display the unique amenities.
print(amenities)


In [None]:
# Drop the empty-string entry (presumably produced by listings whose
# amenities text was empty). discard() is used instead of remove() so that
# re-running this cell, or data without an empty entry, does not raise KeyError.
amenities.discard('')
print(len(amenities), 'amenities length')

In [None]:
# Collect the distinct prices truncated to whole numbers.
# Only the 'price' column is needed, so iterate over it directly rather than
# over full rows with DataFrame.iterrows().
prices = {int(price) for price in df['price']}

# For reference, sample rows after the price conversion (this looks like
# pasted df.head() output, truncated to the first few columns):
# 6901257  150.0     Apartment  Entire home/apt
# 1   6304928  169.0     Apartment  Entire home/apt
# 2   7919400  145.0     Apartment  Entire home/apt
# 3  13418779  750.0         House  Entire home/apt
# 4   3808709  115.0     Apartment  Entire home/apt


In [None]:
# Show the distinct prices, followed by how many of them there are
print(prices)

unique_price_count = len(prices)
print(unique_price_count, 'prices length')


In [None]:
# We have 640 unique prices, with a minimum price of 1 USD and a maximum
# price of 2000 USD.
# The prices are grouped into 40 categories that are 50 USD wide:
# 0-50, 51-100, 101-150, ..., 1901-1950, 1951-2000
#
# The labels are generated instead of typed by hand so that neighbouring
# ranges cannot overlap (a hand-written list is easy to get wrong, e.g.
# '650-700' next to '601-650').
_bin_labels = ['0-50'] + [f'{lower}-{lower + 49}' for lower in range(51, 2000, 50)]

# Keep the comma-separated string form: the name `stringo` and the
# ', '-separated layout are part of what this cell produces.
stringo = ', '.join(_bin_labels)

# Splitting on ',' keeps the leading space of every label but the first;
# int() tolerates that whitespace when the labels are parsed later.
prices = stringo.split(',')
print(len(prices), 'prices length')
print(prices)


In [None]:
# Convert every 'lower-upper' label into a (lower bound, upper bound) tuple of
# integers, one tuple per category
prices = [
    (int(bounds[0]), int(bounds[1]))
    for bounds in (label.split('-') for label in prices)
]
print(prices)



In [None]:
# Add a new column to the data set called price_category: the position (in
# `prices`) of the bin that contains the listing's price.


def _price_to_category(price):
    """Return the index of the first (lower, upper) bin in `prices` holding int(price).

    The price is truncated to a whole number once, and the scan stops at the
    first matching bin instead of building a list of all matches.

    Raises:
        IndexError: if the truncated price lies outside every bin.
    """
    whole_price = int(price)
    for position, (lower, upper) in enumerate(prices):
        if lower <= whole_price <= upper:
            return position
    raise IndexError(f'price {price!r} does not fall into any price bin')


df['price_category'] = df['price'].apply(_price_to_category)

# Display the new price_category column.
print(df['price_category'])

In [None]:
# See the trend between the price category and the city.

# Distinct cities, taken straight from the column (no row-by-row iteration).
unique_city = set(df['city'].unique())

print(unique_city)

# Plot the price category against the city.
# The hue is given by column name so seaborn reads it from `data`.
# (FrozenList.freeze() returns None, so routing the column through a frozen
# list would end up passing hue=None and draw the plot without any hue.)
plt.figure(figsize=(20,10))
sns.countplot(x='city', hue='price_category', data=df)
plt.title('Price Category vs City')
plt.show()

# For each city we can see the distribution of the price categories.
# Both x and hue come from the same filtered frame, so no implicit index
# alignment between a full-length Series and the subset is involved.
for city in unique_city:
    plt.figure(figsize=(20,10))
    sns.countplot(x='city', hue='price_category', data=df[df['city'] == city])
    plt.title('Price Category vs City')
    plt.show()




In [None]:
# Count of listings in each price category
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=df, x='price_category', ax=ax)
ax.set_title('Price Category')
plt.show()

In [None]:
# Plot the price category against the room type.
plt.figure(figsize=(20,10))
sns.countplot(x='room_type', hue='price_category', data=df)
plt.title('Price Category vs Room Type')
plt.show()

# For each room type we can see the distribution of the price categories.
# The hue is passed by column name so that it is read from the same filtered
# frame as x, instead of aligning a full-length Series against the subset.
for room_type in set(df['room_type']):
    plt.figure(figsize=(20,10))
    sns.countplot(x='room_type', hue='price_category', data=df[df['room_type'] == room_type])
    plt.title('Price Category vs Room Type')
    plt.show()

In [None]:
# Plot the price category against the bed type.
# The hue splits every bed type by price category, which is what the title
# announces; without it the chart only counts listings per bed type.
plt.figure(figsize=(20,10))
sns.countplot(x='bed_type', hue='price_category', data=df)
plt.title('Price Category vs Bed Type')
plt.show()

In [None]:
# Histogram of the listing prices
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(data=df, x='price', ax=ax)
ax.set_title('Price Distribution')
plt.show()

In [None]:
# For each bed type we can see the distribution of the price categories.
for bed in set(df['bed_type']):
    plt.figure(figsize=(20,10))
    # Hue by column name: read from the same filtered frame as x.
    sns.countplot(x='bed_type', hue='price_category', data=df[df['bed_type'] == bed])
    # These charts are grouped by bed type, so the title says so.
    plt.title('Price Category vs Bed Type')
    plt.show()

In [None]:
# Count listings per cancellation policy, split by price category
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=df, x='cancellation_policy', hue='price_category', ax=ax)
ax.set_title('Price Category vs Cancellation Policy')
plt.show()

In [None]:
# Get the unique accommodates values.
# Series.unique() works on the single column that is needed (no row-by-row
# iteration); tolist() converts the values to plain Python numbers so the
# printed set looks the same as before.
accomodates = set(df['accommodates'].unique().tolist())
print(accomodates)

In [None]:
# Count listings per accommodates value, split by price category
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=df, x='accommodates', hue='price_category', ax=ax)
ax.set_title('Price Category vs Accomodates')
plt.show()

In [None]:
# Get the unique bathrooms values.
# Series.unique() reports a missing value (NaN) once. Adding the values to a
# set row by row would keep every NaN as a separate element, because NaN does
# not compare equal to itself. tolist() yields plain Python floats.
bathrooms = set(df['bathrooms'].unique().tolist())
print(bathrooms)

# Preprocessing

In [None]:
# Preprocess: remove the following columns from the working copy, one at a time
for column_name in ('host_has_profile_pic', 'bed_type', 'zipcode', 'longitude', 'latitude'):
    newdf.drop(column_name, axis=1, inplace=True)

In [None]:
# Keep only the listings priced at 500 or less. The explicit copy() makes
# newdf an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
newdf = newdf[newdf['price'] <= 500].copy()

In [None]:
# Histogram of the prices that remain in newdf
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(data=newdf, x='price', ax=ax)
ax.set_title('Price Distribution')
plt.show()

In [None]:
# Bucket the prices into 50 categories that are 10 USD wide, labelled 1..50:
# (0, 10] -> 1, (10, 20] -> 2, ..., (490, 500] -> 50
# Explicit edges are used because an integer `bins` makes pd.cut derive
# equal-width bins from the minimum and maximum of the data, which would not
# line up with these fixed 10 USD ranges.
newdf['price_category'] = pd.cut(newdf['price'], bins=np.arange(0, 501, 10), labels=range(1, 51))

# Price categories

In [None]:
# Count of listings in each price category of newdf
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=newdf, x='price_category', ax=ax)
ax.set_title('Price Category')
plt.show()

# Price and room type relations

In [None]:
# Plot the price category against the room type.
plt.figure(figsize=(20,10))
sns.countplot(x='room_type', hue='price_category', data=newdf)
plt.title('Price Category vs Room Type')
plt.show()

# For each room type we can see the distribution of the price categories.
# The room types and the row mask are both taken from newdf itself: a mask
# built from df has a different index than the filtered newdf, and pandas has
# to re-align it (with a warning) before it can be applied.
for room_type in set(newdf['room_type']):
    plt.figure(figsize=(20,10))
    sns.countplot(x='room_type', hue='price_category', data=newdf[newdf['room_type'] == room_type])
    plt.title('Price Category vs Room Type')
    plt.show()

# Property type and price relations

In [None]:
# Plot the price category against the property type.
plt.figure(figsize=(20,10))
sns.countplot(x='property_type', hue='price_category', data=newdf)
plt.title('Price Category vs Property Type')
plt.show()

# For each property type we can see the distribution of the price categories.
# The hue is passed by column name so that it is read from the same filtered
# frame as x.
for property_type in set(newdf['property_type']):
    plt.figure(figsize=(20,10))
    sns.countplot(x='property_type', hue='price_category', data=newdf[newdf['property_type'] == property_type])
    plt.title('Price Category vs Property Type')
    plt.show()

# City and price relations

In [None]:
# For each city we can see the distribution of the price categories.
for city in unique_city:
    plt.figure(figsize=(20,10))
    # Hue by column name: x and hue are both read from the same filtered
    # frame, so no full-length Series has to be aligned against the subset.
    sns.countplot(x='city', hue='price_category', data=newdf[newdf['city'] == city])
    plt.title('Price Category vs City')
    plt.show()