<div align="center">

# RIO Airbnb - Data Cleaning Jupyter Notebook 
**Latest Update:** _25th November 2022_

</div>

#### _Import required libraries_

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import statistics as sts
import datetime as dt

import cleaninglib.functions as cln

#### *Import initial dataset*

In [None]:
# Location of the raw listings export, relative to the notebook.
RAW_LISTINGS_CSV = "dataset/listings.csv"

# index_col=False tells pandas not to use the first CSV column as the index.
listings_df = pd.read_csv(RAW_LISTINGS_CSV, index_col=False)

# Summary statistics of the freshly loaded frame (rendered as the cell output).
listings_df.describe()

#### *Remove unnecessary data columns*

In [None]:
# Columns treated as unnecessary for the cleaned dataset.
REDUNDANT_COLUMNS = [
    'listing_url',
    'scrape_id',
    'last_scraped',
    'source',
    'host_id',
    'host_url',
    'host_name',
    'host_location',
    'host_thumbnail_url',
    'host_picture_url',
    'host_verifications',
    'host_identity_verified',
    'neighbourhood',
    'property_type',
    'bathrooms',
    'calendar_updated',
    'calendar_last_scraped',
]

# Remove every listed column in a single call; a missing column raises KeyError.
listings_df = listings_df.drop(columns=REDUNDANT_COLUMNS)

#### *Fill in & replace selected value columns*

In [None]:
# NOTE(review): this list is not read anywhere in the visible notebook, so the two
# rate columns it names are left exactly as loaded -- confirm whether a
# percentage-to-number conversion was intended.
PERCENTAGES = ['host_response_rate', "host_acceptance_rate"]

# Placeholder written into each column wherever the value is missing.
NULL_VALUES = {
    'license': 'none',
    'host_response_time': 'does not respond',
    'reviews_per_month': 0,
    'bathrooms_text': "0 private baths"
}

# Fill in null values in certain columns
for column_name, placeholder in NULL_VALUES.items():
    listings_df[column_name] = listings_df[column_name].fillna(placeholder)

# Collapse 'license' into three categories: 'none', 'exempted' and 'have'.
# Exact matching (isin) is used instead of a substring/regex search so that a
# real licence value that merely contains "none" or "exempted" is still counted
# as 'have', and so that non-string cells cannot break the boolean mask.
listings_df.loc[listings_df["license"] == "Exempt", "license"] = "exempted"
has_license = ~listings_df["license"].isin(["none", "exempted"])
listings_df.loc[has_license, "license"] = "have"

# Clean the 'price' column: strip the '$' and ',' symbols and convert to float.
# The dtype guard makes the cell safe to re-run once the column is already numeric.
# Missing prices stay NaN rather than raising.
if not pd.api.types.is_numeric_dtype(listings_df['price']):
    listings_df['price'] = (
        listings_df['price']
        .str.replace(r'[$,]', '', regex=True)
        .astype(float)
    )

#### *Convert values from `bathrooms_text` to individual columns of `bathroom_num` and `bathroom_type`*

In [None]:
# Divide
NEW_COLUMNS = ['bathroom_type', 'bathroom_num']

for col in NEW_COLUMNS:
    listings_df[col] = listings_df["bathrooms_text"].map(lambda x: cln.bath_clean(x)[col])

#### *Convert categorical columns to one-hot vectors*

In [None]:
# Flag columns whose values are the strings 't' / 'f'.
TRUE_FALSE = ['has_availability', 'instant_bookable', "host_is_superhost"]
# Columns to expand into indicator columns; the flag columns are appended below.
ONE_HOT_VECTORS = ['license']

# Relabel 't'/'f' as '<column>_true'/'<column>_false' so the indicator columns
# created later carry the source column's name, then queue the column for encoding.
for flag_col in TRUE_FALSE:
    for raw_value, suffix in (('t', 'true'), ('f', 'false')):
        matches = listings_df[flag_col] == raw_value
        listings_df.loc[matches, flag_col] = f"{flag_col}_{suffix}"
    ONE_HOT_VECTORS.append(flag_col)

# Replace each categorical column with its indicator columns: the dummies are
# joined onto the frame first, then the source column is removed.
for source_col in ONE_HOT_VECTORS:
    indicators = pd.get_dummies(listings_df[source_col])
    listings_df = listings_df.join(indicators).drop(columns=source_col)

#### *Remove all $0 listings and listings whose price z-score is beyond ±2.5*

In [None]:
# Remove all listings with price of $0
listings_df = listings_df[listings_df['price'] != 0]

# Add a 'price_zscore' column
listings_df['price_zscore'] = stats.zscore(listings_df["price"])

# Remove all rows with a price z-score of more than +-2.5
listings_df = listings_df[abs(listings_df["price_zscore"]) < 2.5]

listings_df.describe()

<div align="center">

# *Dataset Abstraction*

#### Run the code below to view some summary information about the data

</div>

In [None]:

# Print headline statistics of the cleaned dataset.
price_col = listings_df["price"]
zscore_col = listings_df["price_zscore"]

col_num = len(listings_df.columns)
row_num = len(listings_df)  # row count of the frame itself, not of one specific column
max_revs = listings_df["number_of_reviews"].max()

max_price = price_col.max()
min_price = price_col.min()
mean = price_col.mean()
std_dev = price_col.std()  # sample standard deviation (pandas default ddof=1)

print(f"Maximum price in dataset: ${max_price:.02f}")
print(f"Minimum price in dataset: ${min_price:.02f}")
print(f"Mean of price in dataset: ${mean:.02f}")
print(f"Standard deviation of price in dataset: ${std_dev:.02f}\n")

print(f"Total # of columns after rough cleaning: {col_num}")
print(f"Total # of rows after rough cleaning: {row_num:,}\n")

print(f"Highest # of reviews for a listing: {max_revs:,}")

# Report both extremes so each label matches the value printed next to it.
print("Max z-score: " + str(float(zscore_col.max())))
print("Min z-score: " + str(float(zscore_col.min())))

In [None]:
# Select the rows whose 'neighbourhood_group_cleansed' equals "City of Los Angeles".
# NOTE(review): the notebook title describes a Rio dataset and `city_of_la` is not
# used anywhere else in the visible notebook -- this looks like a leftover from
# another city's analysis; confirm and remove if so.
city_of_la = listings_df[listings_df['neighbourhood_group_cleansed'] == "City of Los Angeles"]

<div align="center">

# *References*

</div>

#### **Export DataFrame to CSV**
Please close Excel or any other application that has the CSV open before exporting; otherwise you'll get `[Errno 13] Permission Denied`.

In [None]:
# Destination of the cleaned dataset, relative to the notebook.
CLEANED_LISTINGS_CSV = "dataset/cleaned_listings.csv"

# index=False keeps the DataFrame index out of the written file.
listings_df.to_csv(CLEANED_LISTINGS_CSV, index=False)

#### **Check for unique values in a column**

In [None]:
# Ask which column to inspect; surrounding whitespace is removed so a stray
# space does not cause a KeyError on lookup.
COLUMN_NAME = input("Enter column name here: ").strip()

# Series.unique() returns the distinct values in order of first appearance and
# reports a missing value only once; .tolist() converts them to plain Python
# objects for display.
display_list = listings_df[COLUMN_NAME].unique().tolist()

display_list

#### **Find the percentage of certain values**

In [None]:
# Share of listings whose host identity is verified ('t').
VERIFIED_COLUMN = "host_identity_verified"

# The cleaning step lists this column in REDUNDANT_COLUMNS, so after a full run
# it is no longer part of listings_df. Use the frame when the column is still
# there; otherwise read just that column from the raw CSV.
if VERIFIED_COLUMN in listings_df.columns:
    verified = listings_df[VERIFIED_COLUMN]
else:
    verified = pd.read_csv("dataset/listings.csv", usecols=[VERIFIED_COLUMN])[VERIFIED_COLUMN]

# Count each distinct value, treating missing entries as their own "na" category
# so they stay in the denominator. The source frame itself is not modified.
verified_counts = verified.fillna("na").value_counts()
total_rows = int(verified_counts.sum())

# .get avoids a KeyError when no row is marked 't'; an empty column yields 0.0.
# The printed value is a fraction between 0 and 1, not multiplied by 100.
verified_share = float(verified_counts.get("t", 0)) / total_rows if total_rows else 0.0

print("Percentage: " + str(verified_share))