<div align="center">

# RIO Airbnb - Data Cleaning Jupyter Notebook 
**Latest Update:** _17th November 2022_

</div>

#### _Import required libraries_

In [80]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statistics as sts
import datetime as dt

#### *Import initial dataset*

In [82]:
listings_df = pd.read_csv("dataset/listings.csv", index_col=False)
listings_df.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,45815.0,45815.0,45815.0,45802.0,45802.0,45815.0,45815.0,45815.0,0.0,42055.0,...,34968.0,34961.0,34967.0,34958.0,34957.0,45815.0,45815.0,45815.0,45815.0,35236.0
mean,1.641198e+17,20220900000000.0,149639400.0,86.630758,116.893957,33.990937,-118.239983,4.1374,,1.864749,...,4.704121,4.84075,4.823367,4.798635,4.676207,22.099596,17.293878,4.399607,0.363789,1.577828
std,2.798853e+17,0.0,147096900.0,494.785019,640.543978,0.190502,0.236575,2.916169,,1.224218,...,0.498639,0.410545,0.438933,0.407368,0.493803,81.801488,76.83389,30.262457,3.982576,1.89404
min,109.0,20220900000000.0,521.0,1.0,1.0,33.33848,-118.96188,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.01
25%,23924760.0,20220900000000.0,24132480.0,1.0,2.0,33.897055,-118.39636,2.0,,1.0,...,4.64,4.86,4.83,4.76,4.6,1.0,1.0,0.0,0.0,0.27
50%,45205690.0,20220900000000.0,99696110.0,3.0,4.0,34.04406,-118.31492,4.0,,1.0,...,4.86,4.97,4.97,4.92,4.81,2.0,1.0,0.0,0.0,0.89
75%,5.492515e+17,20220900000000.0,250312600.0,11.0,18.0,34.100115,-118.086906,6.0,,2.0,...,5.0,5.0,5.0,5.0,4.96,9.0,5.0,1.0,0.0,2.35
max,7.11848e+17,20220900000000.0,478604900.0,4031.0,20000.0,34.82206,-117.62779,16.0,,24.0,...,5.0,5.0,5.0,5.0,5.0,638.0,638.0,342.0,76.0,86.21


#### *Remove unnecessary data columns*

In [68]:
REDUNDANT_COLUMNS = [
    'listing_url', 'scrape_id', 'last_scraped', 'source', 'host_id', 'host_url', 'host_name', 'host_location', 'host_thumbnail_url', 'host_picture_url', 'host_verifications', 
    'host_identity_verified', 'neighbourhood', 'property_type', 'bathrooms', 'calendar_updated', 'calendar_last_scraped'
]
for col in REDUNDANT_COLUMNS:
    listings_df.pop(col)   

#### *Fill in & replace selected value columns*

In [69]:
TRUE_FALSE = ['has_availability', 'instant_bookable', "host_is_superhost"]
PERCENTAGES = ['host_response_rate', "host_acceptance_rate"]
ONE_HOT_VECTORS = ['license']
NULL_VALUES = {
    'license': 'none',
    'host_response_time': 'does not respond',
    'reviews_per_month': 0,
    'bathrooms_text': "0 private baths"
}

# Fill in null values in certain columns
for nul in NULL_VALUES:
    listings_df.loc[listings_df[nul].isnull(), nul] = NULL_VALUES[nul]

# Convert all values in 'license' to categorical values
listings_df.loc[listings_df["license"] == "Exempt", "license"] = "exempted"
listings_df.loc[~listings_df['license'].str.contains('none|exempted'), 'license'] = "have"

# Convert all t/f columns to true/false & add them to the ONE_HOT_VECTORS list
for tf in TRUE_FALSE:
    listings_df.loc[listings_df[tf] == 't', tf] = f"{tf}_true"
    listings_df.loc[listings_df[tf] == 'f', tf] = f"{tf}_false"
    ONE_HOT_VECTORS.append(tf)

# Clean the 'price' column by removing the '$' and the ',' symbols as well as converting all values to "float" data type
listings_df['price'] = listings_df['price'].map(lambda x: float(x.lstrip('$').replace(",", "")))

#### *Convert categorical columns to one-hot vectors*

In [70]:
# Convert all categorical columns into one-hot vectors
for column in ONE_HOT_VECTORS:
    listings_df = listings_df.join(pd.get_dummies(listings_df[column]))
    listings_df.pop(column)

#### *Convert values from `bathrooms_text` to individual columns of `bathrooms_count` and `bathrooms_type`*

In [71]:
def bath_separator(string: str) -> dict:
    """
    Convert strings from `bathrooms_text` to counts and type  
    
    ### Parameters:
        `string`: the string we want to convert
    """
    bath_arr = string.split(" ")
    try:
        count = float(bath_arr[0])
    except ValueError:
        bath_arr = ["1"] + bath_arr
        count = float(bath_arr[0])
    
    type = " ".join(bath_arr[1:]).lower()
    
    return {
        "bathrooms_count": count,
        "bathrooms_type": type,
    }


listings_df["bathrooms_count"] = listings_df['bathrooms_text'].map(lambda x: bath_separator(x)["bathrooms_count"])
listings_df["bathrooms_type"] = listings_df['bathrooms_text'].map(lambda x: bath_separator(x)["bathrooms_type"].rstrip("s"))


#### *Drop any missing rows*

In [72]:
# Remove all listings with price of $0
listings_df = listings_df[listings_df['price'] != 0]

# Add a 'price_zscore' column
listings_df['price_zscore'] = stats.zscore(listings_df["price"])

# Remove all rows with a price z-score of more than +-2.5
listings_df = listings_df[abs(listings_df["price_zscore"]) < 2.5]

listings_df.describe()

Unnamed: 0,id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,price,minimum_nights,...,have,none,has_availability_false,has_availability_true,instant_bookable_false,instant_bookable_true,host_is_superhost_false,host_is_superhost_true,bathrooms_count,price_zscore
count,45214.0,45201.0,45201.0,45214.0,45214.0,45214.0,41484.0,44445.0,45214.0,45214.0,...,45214.0,45214.0,45214.0,45214.0,45214.0,45214.0,45214.0,45214.0,45214.0,45214.0
mean,1.637819e+17,87.481582,117.915997,33.990167,-118.237441,4.067789,1.825523,2.280031,232.262858,17.878113,...,0.237471,0.739129,0.07856,0.92144,0.619498,0.380502,0.702327,0.296921,1.586964,-0.080546
std,2.796691e+17,497.968456,644.653461,0.190991,0.23564,2.831597,1.164331,1.71696,259.203331,30.248155,...,0.425538,0.439114,0.269053,0.269053,0.485516,0.485516,0.45724,0.456907,0.998839,0.359869
min,109.0,1.0,1.0,33.33848,-118.9617,1.0,1.0,1.0,10.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.389128
25%,23810520.0,1.0,2.0,33.892572,-118.394588,2.0,1.0,1.0,90.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.278058
50%,45173650.0,3.0,4.0,34.04369,-118.312715,3.0,1.0,2.0,150.0,7.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,-0.194757
75%,5.487822e+17,11.0,18.0,34.099947,-118.080682,6.0,2.0,3.0,264.0,30.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,-0.036483
max,7.11848e+17,4031.0,20000.0,34.82206,-117.62779,16.0,24.0,32.0,2085.0,1124.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0,2.491728


<div align="center">

# *Dataset Abstraction*

#### Run the code below the view some abstract info about the data

</div>

In [65]:

prices = listings_df["price"].to_list()
col_num = len(listings_df.columns)
row_num = len(listings_df['name'])
max_revs = max(listings_df["number_of_reviews"])

max_price = max(prices)
min_price = min(prices)
mean = sts.mean(prices)
std_dev = sts.stdev(prices)

print(f"Maximum price in dataset: ${max_price:.02f}")
print(f"Minimum price in dataset: ${min_price:.02f}")
print(f"Mean of price in dataset: ${mean:.02f}")
print(f"Standard deviation of price in dataset: ${std_dev:.02f}\n")

print(f"Total # of columns after rough cleaning: {col_num}")
print(f"Total # of rows after rough cleaning: {row_num:,}\n")

print(f"Highest # of reviews for a listing: {max_revs:,}")

print("Max z-score: " + str(min(listings_df['price_zscore'])))

Maximum price in dataset: $2085.00
Minimum price in dataset: $10.00
Mean of price in dataset: $232.26
Standard deviation of price in dataset: $259.20

Total # of columns after rough cleaning: 66
Total # of rows after rough cleaning: 45,214

Highest # of reviews for a listing: 1,702
Max z-score: -0.3891275699182149


In [None]:
city_of_la = listings_df[listings_df['neighbourhood_group_cleansed'] == "City of Los Angeles"]

<div align="center">

# *References*

</div>

#### **Export DataFrame to CSV**
Please close Excel or any other application that's opening the CSV before exporting, otherwise you'll get `[Errno 13] Permission Denied`

In [73]:
listings_df.to_csv("dataset/cleaned_listings.csv", index=False)

#### **Check for unique variables in each column**

In [74]:
COLUMN_NAME = input("Enter column name here: ")


display_list = []
for row in listings_df[COLUMN_NAME]: 
    if row not in display_list:
        display_list.append(row)


display_list

['bath',
 'shared bath',
 'private bath',
 'half-bath',
 'shared half-bath',
 'private half-bath']

#### **Find percentage in of certain values**

In [84]:
PERCENTAGES = {
    "sum": 0
}

listings_df.loc[listings_df["host_identity_verified"].isnull(), "host_identity_verified"] = "na"

for row in listings_df['host_identity_verified']:
    if row not in PERCENTAGES:
        PERCENTAGES[row] = 0
    PERCENTAGES[row] += 1
    PERCENTAGES["sum"] += 1
    
print("Percentage: " + str(
    PERCENTAGES["t"] / PERCENTAGES['sum']
))

Percentage: 0.8579067990832697
