<div align="center">

# RIO Airbnb - Data Cleaning Jupyter Notebook 
**Latest Update:** _25th November 2022_

</div>

#### _Import required libraries_

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import OneHotEncoder

import airbnblib.cleaning as cln

#### *Import initial dataset*

In [2]:
# Read the raw listings export. `index_col=False` tells pandas not to use the
# first CSV column as the row index.
listings_df = pd.read_csv(
    "dataset/listings.csv",
    index_col=False,
)

# Last expression of the cell: summary statistics of the numeric columns.
listings_df.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,45815.0,45815.0,45815.0,45802.0,45802.0,45815.0,45815.0,45815.0,0.0,42055.0,...,34968.0,34961.0,34967.0,34958.0,34957.0,45815.0,45815.0,45815.0,45815.0,35236.0
mean,1.641198e+17,20220900000000.0,149639400.0,86.630758,116.893957,33.990937,-118.239983,4.1374,,1.864749,...,4.704121,4.84075,4.823367,4.798635,4.676207,22.099596,17.293878,4.399607,0.363789,1.577828
std,2.798853e+17,0.0,147096900.0,494.785019,640.543978,0.190502,0.236575,2.916169,,1.224218,...,0.498639,0.410545,0.438933,0.407368,0.493803,81.801488,76.83389,30.262457,3.982576,1.89404
min,109.0,20220900000000.0,521.0,1.0,1.0,33.33848,-118.96188,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.01
25%,23924760.0,20220900000000.0,24132480.0,1.0,2.0,33.897055,-118.39636,2.0,,1.0,...,4.64,4.86,4.83,4.76,4.6,1.0,1.0,0.0,0.0,0.27
50%,45205690.0,20220900000000.0,99696110.0,3.0,4.0,34.04406,-118.31492,4.0,,1.0,...,4.86,4.97,4.97,4.92,4.81,2.0,1.0,0.0,0.0,0.89
75%,5.492515e+17,20220900000000.0,250312600.0,11.0,18.0,34.100115,-118.086906,6.0,,2.0,...,5.0,5.0,5.0,5.0,4.96,9.0,5.0,1.0,0.0,2.35
max,7.11848e+17,20220900000000.0,478604900.0,4031.0,20000.0,34.82206,-117.62779,16.0,,24.0,...,5.0,5.0,5.0,5.0,5.0,638.0,638.0,342.0,76.0,86.21


#### *Remove unnecessary data columns*

In [None]:
# Columns discarded before any cleaning takes place.
# (`bathrooms` shows a non-null count of 0 in the describe() output above.)
REDUNDANT_COLUMNS = [
    'listing_url', 'scrape_id', 'last_scraped', 'source', 'host_id', 'host_url', 'host_name', 'host_location',
    'host_thumbnail_url', 'host_picture_url', 'host_verifications', 'host_identity_verified', 'neighbourhood',
    'property_type', 'bathrooms', 'calendar_updated', 'calendar_last_scraped'
]

# Remove all of them in a single call. Unlike popping one column at a time,
# `drop` is all-or-nothing: if any label is missing it raises KeyError before
# anything has been removed, so the frame is never left half-stripped.
listings_df = listings_df.drop(columns=REDUNDANT_COLUMNS)

#### *Fill in & replace selected value columns*

In [None]:
# NOTE(review): this list is not read by any visible cell (a later cell rebinds
# the name PERCENTAGES to a dict), so the two rate columns are left untouched.
# Confirm whether a percentage conversion step is missing.
PERCENTAGES = ['host_response_rate', "host_acceptance_rate"]

# Replacement used for missing entries, keyed by column name
NULL_VALUES = {
    'license': 'none',
    'host_response_time': 'does not respond',
    'reviews_per_month': 0,
    'bathrooms_text': "0 private baths"
}

# Fill in null values in certain columns
for column, fill_value in NULL_VALUES.items():
    listings_df[column] = listings_df[column].fillna(fill_value)

# Collapse 'license' into exactly three categories: 'none', 'exempted', 'have'
listings_df.loc[listings_df["license"] == "Exempt", "license"] = "exempted"
# Exact membership test rather than a regex substring match: a raw license text
# that merely *contains* "none" or "exempted" must still be classified as "have",
# otherwise it would survive as its own category (and its own dummy column later).
listings_df.loc[~listings_df['license'].isin(['none', 'exempted']), 'license'] = "have"

# Clean the 'price' column by removing the '$' and the ',' symbols as well as converting all values to "float" data type.
# Every price is expected to be a string here; a missing (NaN) price raises instead of
# being passed on silently to the z-score filtering further down.
listings_df['price'] = listings_df['price'].map(
    lambda raw_price: float(raw_price.lstrip('$').replace(",", ""))
)

#### *Convert values from `bathrooms_text` to individual columns of `bathroom_num (float)` and `bathroom_type (vectors)`*

In [None]:
# Divide `bathrooms_text` column into `num` & `type`
NEW_COLUMNS = ['bathroom_type', 'bathroom_num']

# Parse every `bathrooms_text` value once and reuse the result for both new
# columns, instead of re-running the parser for each column.
# NOTE(review): assumes cln.bath_clean returns the same result for the same input — confirm.
parsed_baths = listings_df["bathrooms_text"].map(cln.bath_clean)
for col in NEW_COLUMNS:
    # `key=col` pins the current column name inside the lambda
    listings_df[col] = parsed_baths.map(lambda parsed, key=col: parsed[key])

# Append one indicator column per distinct bathroom type; the textual
# `bathroom_type` column itself stays in the frame.
listings_df = listings_df.join(pd.get_dummies(listings_df["bathroom_type"]))

# The source column is no longer needed. `drop` (rather than `pop` as the last
# expression) avoids echoing the whole removed column as the cell output.
listings_df = listings_df.drop(columns="bathrooms_text")

#### *Convert categorical columns to one-hot vectors*

In [None]:
TRUE_FALSE = ['has_availability', 'instant_bookable', "host_is_superhost"]
ONE_HOT_VECTORS = ['license']

# Relabel the 't' / 'f' markers as "<column>_true" / "<column>_false" so the
# indicator columns created below carry self-describing names.
for flag_column in TRUE_FALSE:
    flags = listings_df[flag_column]
    true_rows, false_rows = flags == 't', flags == 'f'
    listings_df.loc[true_rows, flag_column] = flag_column + "_true"
    listings_df.loc[false_rows, flag_column] = flag_column + "_false"

# The relabelled flag columns are encoded together with 'license'
ONE_HOT_VECTORS.extend(TRUE_FALSE)

# Swap each categorical column for its indicator (one-hot) columns
for categorical in ONE_HOT_VECTORS:
    indicator_columns = pd.get_dummies(listings_df[categorical])
    listings_df = listings_df.join(indicator_columns)
    del listings_df[categorical]

#### *Remove all $0 listings and all listings whose price z-score is beyond ±2.5*

In [None]:
# Largest absolute price z-score a listing may have and still be kept
PRICE_ZSCORE_LIMIT = 2.5

# Remove all listings with price of $0.
# `.copy()` makes the filtered rows an independent frame, so adding a column
# below does not raise pandas' SettingWithCopyWarning.
listings_df = listings_df[listings_df['price'] != 0].copy()

# Add a 'price_zscore' column (computed after the $0 listings are gone)
listings_df['price_zscore'] = stats.zscore(listings_df["price"])

# Keep only rows whose price z-score lies strictly inside +-PRICE_ZSCORE_LIMIT
listings_df = listings_df[listings_df["price_zscore"].abs() < PRICE_ZSCORE_LIMIT]

listings_df.describe()

#### *Convert all `amenities` rows to `list[str]` & remove from each list every amenity whose frequency is below 50%*

In [None]:
# Both helpers are documented in `airbnblib/cleaning.py`.
# Step 1: run every raw `amenities` cell through the cleaner, row by row.
cleaned_amenities = []
for raw_amenities in listings_df['amenities']:
    cleaned_amenities.append(cln.amenities_clean(raw_amenities))
listings_df['amenities'] = cleaned_amenities

# Step 2: hand the whole cleaned column to the frequency filter.
listings_df['amenities'] = cln.amenities_freq(listings_df['amenities'])

#### *Extract each listing's zip code using the `latitude` and `longitude` columns*

In [4]:
# For more documentation on these functions, check `airbnblib/cleaning.py`
# Materialise each coordinate column once, then walk them in lockstep. This
# follows the frame's current length, so it still works after the earlier
# cells have removed rows (no hard-coded row count), and it avoids rebuilding
# both lists on every iteration.
latitudes = listings_df['latitude'].to_list()
longitudes = listings_df['longitude'].to_list()

# The result of cln.get_zip_code is stored under the column name "address"
listings_df["address"] = [
    cln.get_zip_code(latitude, longitude)
    for latitude, longitude in zip(latitudes, longitudes)
]

<h2 align="center"><em>Tools for Referencing</em></h2>

#### **Export DataFrame to CSV**
Please close Excel or any other application that has the destination CSV open before exporting; otherwise you'll get `[Errno 13] Permission Denied`.

In [None]:
# Write the current state of the frame to disk; the row index is not exported.
listings_df.to_csv(
    path_or_buf="dataset/cleaned_listings.csv",
    index=False,
)

#### **Check for unique values in a column**

In [None]:
col_name = input("Enter column name here: ")
selected_column = listings_df[col_name]

try:
    # Distinct values in order of first appearance. Missing values are
    # reported once instead of once per row.
    display_list = selected_column.drop_duplicates().to_list()
except TypeError:
    # Cells holding unhashable objects (e.g. lists) cannot be de-duplicated by
    # pandas; fall back to a plain membership scan for those columns.
    display_list = []
    for row in selected_column:
        if row not in display_list:
            display_list.append(row)

display_list

#### **Find the percentage of certain values**

In [None]:
# Column to inspect and the value whose share is reported
PERCENTAGE_COLUMN = "host_identity_verified"
PERCENTAGE_VALUE = "t"

# Occurrence count per distinct value, plus the total row count under "sum"
PERCENTAGES = {
    "sum": 0
}

if PERCENTAGE_COLUMN not in listings_df.columns:
    # The cleaning step lists this column in REDUNDANT_COLUMNS, so it is gone
    # once that cell has run; report that instead of failing with a bare KeyError.
    print(f"Column '{PERCENTAGE_COLUMN}' is not in listings_df; run this cell before the column is removed.")
else:
    # Label missing entries as "na" on a derived Series only, so this
    # reference tool does not modify the frame that gets exported.
    labelled = listings_df[PERCENTAGE_COLUMN].fillna("na")

    for value, count in labelled.value_counts(sort=False).items():
        PERCENTAGES[value] = int(count)
    PERCENTAGES["sum"] = len(labelled)

    # Reported as a fraction between 0 and 1. A value that never occurs counts
    # as 0, and an empty frame yields nan rather than a division error.
    total = PERCENTAGES["sum"]
    share = PERCENTAGES.get(PERCENTAGE_VALUE, 0) / total if total else float("nan")
    print("Percentage: " + str(share))

#### **Find data type of certain columns**

In [None]:
# Ask for a column, then report the Python type of its first value
# (taken from the list form of the column).
col_name = input("Enter column name here: ")
first_value = listings_df[col_name].to_list()[0]
print(type(first_value))