In [35]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# statistical analysis
from scipy.stats import ttest_ind
import re
from scipy import stats

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [36]:
# Read the Airbnb listings CSV into a pandas DataFrame, then display it
df = pd.read_csv(filepath_or_buffer='airbnb_listings.csv')
df

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,4.407700e+04,https://www.airbnb.com/rooms/44077,2.020000e+13,12/09/2022,city scrape,cosy comfortable Irish cottage twin,Our house was built in 1937 when there was ple...,I like our neighbourhood as there is no shorta...,https://a0.muscache.com/pictures/525706/050a3a...,193005,...,4.93,4.66,4.82,,f,2,0,2,0,1.85
1,8.515600e+04,https://www.airbnb.com/rooms/85156,2.020000e+13,12/09/2022,city scrape,Cosy Comfortable Irish Cottage 1 Double Bed,Our Cottage is a charming light filled cottage...,"I love Dundrum and its surrounding areas ,<br ...",https://a0.muscache.com/pictures/1749253/9ed2a...,193005,...,4.88,4.64,4.78,,f,2,0,2,0,1.53
2,1.598890e+05,https://www.airbnb.com/rooms/159889,2.020000e+13,12/09/2022,city scrape,Friendly Single Room,Washing can be done at a cost of €5 per load....,Plenty of buses into the city and the area is ...,https://a0.muscache.com/pictures/3031697/a8259...,766611,...,4.90,4.63,4.74,,f,3,0,3,0,2.78
3,1.628090e+05,https://www.airbnb.com/rooms/162809,2.020000e+13,11/09/2022,city scrape,5.5 miles south of Dublin City :^),A nice place to relax after the bustle of the ...,"Close to the sea, hill walks and the city - vi...",https://a0.muscache.com/pictures/86694529/c07b...,777681,...,4.97,4.77,4.85,,f,2,0,2,0,3.68
4,1.658280e+05,https://www.airbnb.com/rooms/165828,2.020000e+13,11/09/2022,city scrape,Pádraig Pearse apt. Kilmainham,"Don't just visit Dublin, experience Dublin in ...","Enjoy a walk along the grand canal, or just ob...",https://a0.muscache.com/pictures/34311419/b3f6...,790601,...,4.75,4.34,4.55,,t,5,5,0,0,0.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7561,7.130000e+17,https://www.airbnb.com/rooms/712909401324393615,2.020000e+13,12/09/2022,city scrape,Comfortable double room with private bathroom,Comfortable double room with private bathroom ...,,https://a0.muscache.com/pictures/miso/Hosting-...,38652370,...,,,,,f,1,0,1,0,
7562,7.130000e+17,https://www.airbnb.com/rooms/712914322433199115,2.020000e+13,12/09/2022,city scrape,Welcome 1-bedroom available. Free parking & Wi-Fi,Single room available in family home. Bus rout...,,https://a0.muscache.com/pictures/4c044921-f6fd...,89156390,...,,,,,f,2,0,2,0,
7563,7.130000e+17,https://www.airbnb.com/rooms/712914393022526732,2.020000e+13,11/09/2022,city scrape,Cheerful 5-bedroom Home Friendly Irish Welcome,Forget your worries in this spacious and seren...,"Fantastic transport routes, 5 mins from luas g...",https://a0.muscache.com/pictures/53ed618c-3c14...,128213034,...,,,,,t,2,0,2,0,
7564,7.130000e+17,https://www.airbnb.com/rooms/712966369602352444,2.020000e+13,12/09/2022,city scrape,Double Bedroom in Private Home,Large double room with panoramic views of Fr. ...,Clongriffin dart station and 15 bus close by. ...,https://a0.muscache.com/pictures/miso/Hosting-...,256847071,...,,,,,t,1,0,1,0,


In [37]:
# Drop the empty columns.
# A new frame is assigned instead of mutating in place, and errors='ignore'
# skips names that are already gone, so re-running this cell does not raise
# a KeyError.
df = df.drop(
    columns=['neighbourhood_group_cleansed', 'bathrooms', 'license'],
    errors='ignore',
)

In [38]:
# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_vars = list(df.select_dtypes(include=['object']).columns)
numerical_vars = list(df.select_dtypes(include=['int64', 'float64']).columns)

# Print each heading followed by one column name per line
print("Categorical Variables:", *categorical_vars, sep="\n")
print("\nNumerical Variables:", *numerical_vars, sep="\n")

Categorical Variables:
listing_url
last_scraped
source
name
description
neighborhood_overview
picture_url
host_url
host_name
host_since
host_location
host_about
host_response_time
host_response_rate
host_acceptance_rate
host_is_superhost
host_thumbnail_url
host_picture_url
host_neighbourhood
host_verifications
host_has_profile_pic
host_identity_verified
neighbourhood
neighbourhood_cleansed
property_type
room_type
bathrooms_text
amenities
price
has_availability
calendar_last_scraped
first_review
last_review
instant_bookable

Numerical Variables:
id
scrape_id
host_id
host_listings_count
host_total_listings_count
latitude
longitude
accommodates
bedrooms
beds
price_cleaned
minimum_nights
maximum_nights
minimum_minimum_nights
maximum_minimum_nights
minimum_maximum_nights
maximum_maximum_nights
minimum_nights_avg_ntm
maximum_nights_avg_ntm
calendar_updated
availability_30
availability_60
availability_90
availability_365
number_of_reviews
number_of_reviews_ltm
number_of_reviews_l30d
review_sc

In [45]:
# Step 1: Data Cleaning for Location-related Variables
location_variables = ['neighbourhood', 'latitude', 'longitude']
# Only the text column gets the 'Unknown' placeholder. latitude and longitude
# are numeric; writing a string into them would turn them into object columns
# (or trigger an incompatible-dtype warning), so their gaps are left as NaN.
df['neighbourhood'] = df['neighbourhood'].fillna('Unknown')

# Step 2: Outlier removal on the 'price_cleaned' column using z-scores
PRICE_Z_THRESHOLD = 20  # same cutoff as before: rows with |z| >= 20 are removed
# nan_policy='omit' stops a single missing price from turning every z-score
# into NaN, which would make the mask all-False and empty the DataFrame.
z_scores = stats.zscore(df['price_cleaned'].to_numpy(dtype=float), nan_policy='omit')
abs_z_scores = abs(z_scores)
# Remove only confirmed outliers: a NaN z-score (missing price) compares False
# against the threshold, so those rows are kept rather than silently dropped.
filtered_entries = ~(abs_z_scores >= PRICE_Z_THRESHOLD)
# .copy() yields an independent frame, so the column assignment in Step 3 does
# not raise SettingWithCopyWarning.
df = df[filtered_entries].copy()

# Step 3: Data Cleaning for review_scores_rating
# Missing ratings are replaced by the mean rating of the rows that remain
# after the price filter.
mean_review_score = df['review_scores_rating'].mean()
df['review_scores_rating'] = df['review_scores_rating'].fillna(mean_review_score)

In [46]:
# Count nulls per column and show only the columns that still have gaps
missing_values = df.isna().sum()
print("Columns with Missing Values:")
print(missing_values.loc[missing_values.gt(0)])

Columns with Missing Values:
description                     155
neighborhood_overview          3370
host_location                  1332
host_about                     4008
host_response_time             3634
host_response_rate             3634
host_acceptance_rate           3375
host_neighbourhood             2331
bathrooms_text                    4
bedrooms                        205
beds                             94
calendar_updated               7562
first_review                   1357
last_review                    1357
review_scores_accuracy         1481
review_scores_cleanliness      1480
review_scores_checkin          1485
review_scores_communication    1481
review_scores_location         1485
review_scores_value            1487
reviews_per_month              1357
dtype: int64


In [41]:
# Keep only the rows that have a value in the "review_scores_rating" column
# NOTE(review): when the notebook runs top to bottom, the cleaning cell above
# has already filled this column with its mean, so this is expected to remove
# nothing - confirm whether dropping or imputing is the intended treatment.
df = df.loc[df['review_scores_rating'].notna()]

In [42]:
# Re-count nulls per column and show only the columns that still have gaps
missing_values = df.isna().sum()
print("Columns with Missing Values:")
print(missing_values.loc[missing_values.gt(0)])

Columns with Missing Values:
description                     155
neighborhood_overview          3372
host_location                  1334
host_about                     4009
host_response_time             3634
host_response_rate             3634
host_acceptance_rate           3375
host_neighbourhood             2332
bathrooms_text                    4
bedrooms                        205
beds                             94
calendar_updated               7565
first_review                   1357
last_review                    1357
review_scores_accuracy         1481
review_scores_cleanliness      1480
review_scores_checkin          1485
review_scores_communication    1481
review_scores_location         1485
review_scores_value            1487
reviews_per_month              1357
dtype: int64


In [47]:
# Write the cleaned DataFrame to disk; index=False leaves the row index out of the CSV
df.to_csv(path_or_buf='cleaned_data.csv', index=False)