In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import re
import ast

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the dataset below is read from /content/listings.csv, which is not
# under the mount point — confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read dataset
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Raw listings table; `listing` is the source for all later cleaning steps
listing = pd.read_csv('/content/listings.csv')

In [None]:
# 1. Exploratory Data Analysis (EDA)
print("EDA - Basic Info")
# DataFrame.info() writes its report itself and returns None, so call it directly;
# wrapping it in print() appends a stray "None" line to the output.
listing.info()

In [None]:

# listing.shape is a (rows, columns) tuple, printed as-is inside the sentence
print(f'This dataset has {listing.shape} rows and columns respectively.')

In [None]:
# Keep only the rows that have a price; rows without one are discarded
listing = listing.dropna(subset=['price'])

In [None]:
# Check for missing values
print("Missing Values:")
# Number of null entries in each column
print(listing.isnull().sum())


In [None]:
# Drop columns with more than 20% of values missing

def drop_high_missing(df, threshold=20):
    """Return the labels of columns whose share of missing values exceeds ``threshold``.

    Despite the name, nothing is dropped here: the caller passes the returned
    index to ``DataFrame.drop``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 20
        Percentage (0-100). Columns with strictly more than this percentage
        of null values are reported.

    Returns
    -------
    pandas.Index
        Labels of the columns that exceed the threshold.
    """
    missing_percentage = df.isnull().sum() / df.shape[0] * 100
    # Compare against the caller-supplied threshold (previously hard-coded to 20,
    # which silently ignored the argument).
    return missing_percentage[missing_percentage > threshold].index

# Find the sparse columns, report them, then build the working frame without them
missing_20_per = drop_high_missing(listing, threshold=20)
print(missing_20_per)

listing_updated = listing.drop(columns=missing_20_per)

In [None]:
# List the columns that survived the missing-value filter
listing_updated.columns.tolist()

In [None]:
# Since there are too many variables, removing some of them which might not be relevant to the data

columns_to_remove = ['listing_url','scrape_id','last_scraped','source','picture_url','host_url','host_has_profile_pic',
 'host_name','host_picture_url','host_thumbnail_url','calendar_last_scraped','minimum_nights','maximum_nights',
 'minimum_minimum_nights','maximum_minimum_nights','minimum_maximum_nights','maximum_maximum_nights',
 'minimum_nights_avg_ntm','maximum_nights_avg_ntm',]
# Drop only the columns that are still present. A listed column may already have been
# removed by the missing-value filter, and re-running this cell would otherwise raise
# KeyError because the columns are gone after the first run.
columns_present = [col for col in columns_to_remove if col in listing_updated.columns]
listing_updated = listing_updated.drop(columns=columns_present)

In [None]:
# listing_updated.shape is a (rows, columns) tuple, printed as-is inside the sentence
print(f'Updated dataset has {listing_updated.shape} rows and columns respectively.')

In [None]:
# Looking at the kind of data
# (first five rows of the reduced frame)

listing_updated.head()

In [None]:
# Removing text columns, which will be dealt with in part 2
# NOTE(review): 'id' is presumably a listing identifier rather than free text — confirm
# it is intentionally dropped together with the text fields.

text_columns = ['id','name','description','amenities']


In [None]:
# Remove the columns listed in text_columns from the working frame
listing_updated = listing_updated.drop(columns=text_columns)

In [None]:
# Converting true or false value to 0 and 1
def process_boolean_columns(df, columns):
    """Encode 't'/'f' flag columns numerically, in place.

    't' becomes 1, 'f' becomes 0, and missing values become 2. Any other
    value is left as it is. ``df`` itself is modified and also returned.
    """
    flag_codes = {'t': 1, 'f': 0}
    encoded = df[columns].replace(flag_codes)
    # 2 acts as a separate "unknown" code for missing flags
    df[columns] = encoded.fillna(2)
    return df

# Encode the four t/f flag columns as 1/0, with 2 for missing values
listing_updated = process_boolean_columns(listing_updated, ['host_is_superhost', 'host_identity_verified', 'instant_bookable','has_availability'])

In [None]:
# Removing % from columns to convert it to a number
def remove_percentage_sign(df, columns):
    """Convert percentage strings such as ``'95%'`` to floats, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is changed in place and also returned.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column cast to float.

    Notes
    -----
    Columns that are already numeric (for example because this cell was run
    before, or because every value is NaN) are only cast to float. They have
    no ``.str`` accessor, so stripping would raise ``AttributeError``.
    """
    for col in columns:
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values):
            values = values.str.rstrip('%')
        df[col] = values.astype(float)
    return df
# Turn the two host rate columns from 'NN%' strings into floats
listing_updated = remove_percentage_sign(
    listing_updated,
    ['host_response_rate', 'host_acceptance_rate'],
)

In [None]:
# Cleaning price to convert it to a number

# Strip '$' and ',' before casting to float. The pattern is a raw string so that `\$`
# reaches the regex engine unchanged instead of being parsed as an invalid Python
# string escape (which triggers a warning on recent Python versions).
listing_updated['price'] = listing_updated['price'].replace(r'[\$,]', '', regex=True).astype(float)

In [None]:
# Tabulate how often each price occurs, to spot outliers / incorrect entries
price_counts = listing_updated['price'].value_counts().sort_values(ascending=False)

# Two-column table: the price and the number of listings at that price
price_counts_df = price_counts.rename_axis('price').reset_index(name='count')

# Order by price so the most and least expensive values sit at the two ends
price_counts_df_sorted = price_counts_df.sort_values(by='price', ascending=False)

for extreme_prices in (price_counts_df_sorted.head(20), price_counts_df_sorted.tail(20)):
    print(extreme_prices)

In [None]:
# Price bounds used for the outlier analysis
upper_threshold = 2500
lower_threshold = 25

# Listings priced at or below the lower bound, or at or above the upper bound
is_low_priced = listing_updated['price'] <= lower_threshold
is_high_priced = listing_updated['price'] >= upper_threshold
filtered_listing = listing_updated[is_low_priced | is_high_priced]
print(filtered_listing)

In [None]:
# Clipping the prices to be within the specified range
# (values below lower_threshold become lower_threshold, values above upper_threshold
# become upper_threshold; no rows are removed)
listing_updated['price'] = listing_updated['price'].clip(lower=lower_threshold, upper=upper_threshold)

In [None]:
# Replace the host_since date with just its year; unparseable dates become NaN
listing_updated['host_since'] = (
    pd.to_datetime(listing_updated['host_since'], errors='coerce')
    .dt.year
)


In [None]:
# Summary statistics of the numeric columns after cleaning
listing_updated.describe()

In [None]:
# Remaining null count per column, before imputation
listing_updated.isnull().sum()

In [None]:
# Correlations between the numeric columns, used to pick grouping columns
# for imputing missing values
numeric_cols = listing_updated.select_dtypes(include=['number']).columns
correlation_matrix = listing_updated[numeric_cols].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
#Impute missing values based on the group median.
def impute_by_group_median(df, group_columns, target_column):
    """Fill nulls in ``target_column`` with the median of the matching group.

    Groups are defined by ``group_columns``. The per-group medians are joined
    back with a left merge, so the returned frame is a new object with a fresh
    0..n-1 index; ``df`` itself is not modified. Rows whose group has no median
    stay null.
    """
    median_column = f'median_{target_column}'
    medians = (
        df.groupby(group_columns)[target_column]
        .median()
        .reset_index()
        .rename(columns={target_column: median_column})
    )
    merged = df.merge(medians, on=group_columns, how='left')
    # Take the helper column out of the frame and use it only where the target is null
    fallback = merged.pop(median_column)
    merged[target_column] = np.where(merged[target_column].isnull(), fallback, merged[target_column])
    return merged

In [None]:
# impute missing values
# The order matters: each step may group by a column that an earlier step has filled.
# beds: median of listings with the same capacity, room type and property type
listing_updated = impute_by_group_median(listing_updated,['accommodates','room_type','property_type'],'beds')
# bedrooms: same grouping plus the (now imputed) beds
listing_updated = impute_by_group_median(listing_updated,['accommodates','room_type','property_type','beds'],'bedrooms')
# bathrooms: grouped by capacity, bedrooms, beds and room type
listing_updated = impute_by_group_median(listing_updated,['accommodates','bedrooms','beds','room_type',],'bathrooms')
# host_acceptance_rate: grouped by superhost flag, host_since year and instant_bookable flag
listing_updated = impute_by_group_median(listing_updated,['host_is_superhost','host_since','instant_bookable'],'host_acceptance_rate')
# host_response_rate: median of listings with the same host_acceptance_rate
listing_updated = impute_by_group_median(listing_updated,['host_acceptance_rate'],'host_response_rate')


In [None]:
# Response rates that are still missing after group imputation are set to 0.
# The result is assigned back to the column rather than calling fillna(inplace=True)
# on the selection: that form is chained assignment, which pandas deprecates and which
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
listing_updated['host_response_rate'] = listing_updated['host_response_rate'].fillna(0)

In [None]:
# Impute missing categorical values (e.g. bathrooms_text)
# with the most frequent value (mode) of the matching group
def impute_by_group_mode(df,group_columns,target_column):
    """Fill nulls in ``target_column`` with the most frequent value of the matching group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_columns : list of str
        Columns that define the groups.
    target_column : str
        Column whose nulls are filled.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh 0..n-1 index from the left merge) with nulls in
        ``target_column`` replaced by the group mode. When several values tie,
        the first one returned by ``Series.mode`` is used. Rows whose group has
        no non-null value stay null.
    """
    def _first_mode(values):
        # Series.mode() ignores nulls and is empty for an all-null group;
        # indexing it with [0] would raise, so fall back to NaN instead.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    group_mode = df.groupby(group_columns)[target_column].agg(_first_mode).reset_index()
    group_mode = group_mode.rename(columns={target_column: f'mode_{target_column}'})
    df = df.merge(group_mode, on=group_columns, how='left')
    df[target_column] = np.where(df[target_column].isnull(), df[f'mode_{target_column}'], df[target_column])
    df = df.drop(columns=[f'mode_{target_column}'])
    return df

# Fill the remaining categorical gaps with the group mode:
# bathrooms_text from listings with the same capacity, room type, bedrooms, beds and bathrooms;
# host_response_time from listings with the same host_response_rate
listing_updated = impute_by_group_mode(listing_updated,['accommodates', 'room_type', 'bedrooms','beds', 'bathrooms'],'bathrooms_text')
listing_updated = impute_by_group_mode(listing_updated,['host_response_rate'],'host_response_time')

In [None]:
# Null count per column after imputation
listing_updated.isnull().sum()

In [None]:
# Categorical columns
# (columns stored with the object dtype)
categorical_columns = listing_updated.select_dtypes(include=['object']).columns
print(categorical_columns)

In [None]:
# Analyse the unique values of categorical columns

def get_unique_values(df,cols):
  """Print the distinct values found in each of the given columns of ``df``."""
  for column_name in cols:
    distinct_values = df[column_name].unique()
    print('Unique Values for', column_name, distinct_values)

# Print the distinct values of every object-dtype column
get_unique_values(listing_updated,categorical_columns)


In [None]:
# Grouping the property types into categories
# Define a function to categorize property types
def categorize_property_type(property_type):
    """Map a raw property-type label to a coarse category.

    Matching is case-insensitive and based on substrings. The rules are
    checked in order and the first match wins:

    * contains 'entire'        -> 'Entire Place'
    * contains 'private room'  -> 'Private Room'
    * contains 'shared room'   -> 'Shared Room'
    * contains 'room in'       -> 'Room in Building'
    * contains 'boat', 'treehouse', 'castle' or 'tower' -> 'Unique Stay'
    * anything else            -> 'Other'

    Non-string input (for example NaN for a missing value, or None) is
    returned as 'Other' instead of raising ``AttributeError``.
    """
    if not isinstance(property_type, str):
        return 'Other'

    # Lower-case once instead of once per rule
    lowered = property_type.lower()

    # Ordered (keywords, category) rules; the order reproduces the if/elif precedence
    rules = (
        (('entire',), 'Entire Place'),
        (('private room',), 'Private Room'),
        (('shared room',), 'Shared Room'),
        (('room in',), 'Room in Building'),
        (('boat', 'treehouse', 'castle', 'tower'), 'Unique Stay'),
    )
    for keywords, category in rules:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Other'

# Derive the coarse category for every listing from its property_type
property_types = listing_updated['property_type']
listing_updated['property_category'] = property_types.map(categorize_property_type)

# Show the raw label next to its category for the first rows
preview_columns = ['property_type', 'property_category']
print(listing_updated[preview_columns].head())

listing_updated['property_category'].unique()



In [None]:
# First rows of the cleaned frame, now including property_category
listing_updated.head()

In [None]:
# Summary statistics of the numeric columns after imputation
listing_updated.describe()

In [None]:
def create_histogram(df, column, bins=30, color='blue'):
    """Plot a histogram of ``df[column]`` and show it.

    NOTE(review): ``create_histogram`` was called here without being defined in
    any earlier cell, so the notebook raised NameError on a fresh kernel. This is
    a minimal stand-in matching the call below — replace it if the project has a
    canonical helper.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Numeric column to plot; null values are ignored.
    bins : int, default 30
        Number of histogram bins.
    color : str, default 'blue'
        Bar colour.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df[column].dropna(), bins=bins, color=color, edgecolor='black')
    label = column.replace('_', ' ').title()
    ax.set_title(f'Distribution of {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Number of Listings')
    plt.show()


create_histogram(listing_updated,'price',bins=40,color='blue')


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# Function to plot all categories normally
def _plot_category_counts(counts, title, column):
    """Draw and show a teal bar chart of pre-computed category counts.

    ``counts`` is a Series of counts indexed by category, ``title`` is the
    figure title and ``column`` is the source column name, used for the
    x-axis label.
    """
    plt.figure(figsize=(14, 8))
    ax = counts.plot(kind='bar', color='teal')

    plt.title(title)
    plt.xlabel(column.replace('_', ' ').title())
    plt.ylabel('Number of Listings')

    # Keep the category labels horizontal
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

    # Show the y-axis in thousands, e.g. 2500 -> '2.5k'
    ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{x/1000:.1f}k'))

    plt.show()


def plot_categories(column):
    """Bar chart of the number of listings for every value of ``column``.

    Reads the module-level ``listing_updated`` frame.
    """
    title = f"Distribution of {column.replace('_', ' ').title()}"
    _plot_category_counts(listing_updated[column].value_counts(), title, column)


# Function to plot top 10 categories
def plot_top_10_categories(column):
    """Bar chart of the ten most frequent values of ``column``.

    Reads the module-level ``listing_updated`` frame.
    """
    title = f"Top 10 {column.replace('_', ' ').title()} Distribution"
    top_counts = listing_updated[column].value_counts().nlargest(10)
    _plot_category_counts(top_counts, title, column)


In [None]:
# Number of listings per host_response_time value
plot_categories('host_response_time')


In [None]:
# Number of listings per host_verifications value
plot_categories('host_verifications')


In [None]:
# Number of listings per derived property_category
plot_categories('property_category')


In [None]:
# Number of listings per room_type
plot_categories('room_type')


In [None]:
# The ten neighbourhoods with the most listings
plot_top_10_categories('neighbourhood_cleansed')


In [None]:


# Custom hex palette (teal and related colors); also used by later plots
teal_palette = ['#62BEB6', '#0B9A8D', '#077368', '#abdda4', '#e6f598']

# Share of listings in each neighbourhood group, as a pie chart
fig, ax = plt.subplots(figsize=(10, 6))
group_counts = listing_updated['neighbourhood_group_cleansed'].value_counts()
group_counts.plot(kind='pie', autopct='%1.1f%%', colors=teal_palette, ax=ax)
ax.set_title('Listings by Neighborhood Group')
ax.set_ylabel('')  # Hides default y-label for pie chart
plt.show()


In [None]:
# Correlation matrix of the numeric columns, recomputed on the imputed data
numeric_cols = listing_updated.select_dtypes(include=['number']).columns
correlation_matrix = listing_updated[numeric_cols].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Host response rate plotted against the host response time category
fig, ax = plt.subplots(figsize=[16, 9])
sns.scatterplot(x='host_response_time', y='host_response_rate', data=listing_updated, ax=ax)

ax.set_title('Host_Response_Time vs Host_Response_Rate')
plt.show()



In [None]:
# Function for multivariate analysis of categories

def plot_multi_categories(x,hue):
  """Box plot of price per ``x`` category, split by ``hue``.

  Reads the module-level ``listing_updated`` frame and ``teal_palette``.
  """
  x_label = x.replace('_', ' ').title()
  hue_label = hue.replace('_', ' ').title()

  fig, ax = plt.subplots(figsize=(14, 8))
  sns.boxplot(x=x, y='price', hue=hue, data=listing_updated, palette=teal_palette, ax=ax)
  ax.set_title(f"Price Distribution by {x_label} and {hue_label}")
  ax.set_xlabel(x_label)
  ax.set_ylabel('Price')
  # Slant the category labels by 45 degrees
  plt.xticks(rotation=45)
  plt.show()

In [None]:
# Price by property category, split by neighbourhood group
plot_multi_categories('property_category','neighbourhood_group_cleansed')

In [None]:
# Price by host response time, split by host verifications
plot_multi_categories('host_response_time','host_verifications')

In [None]:
# Price by neighbourhood group, split by room type
plot_multi_categories('neighbourhood_group_cleansed','room_type')

In [None]:
# Price by number of bedrooms, split by room type
plot_multi_categories('bedrooms','room_type')

In [None]:
# 1. How does host response time vary with property size (accommodates) and room type?
# One facet per room_type; points are coloured by host_response_time
g = sns.FacetGrid(listing_updated, col='room_type', hue='host_response_time', height=5, palette= teal_palette)
# x axis: host_response_time, y axis: accommodates
g.map(sns.scatterplot, 'host_response_time', 'accommodates')
g.add_legend()
g.set_axis_labels('Host Response Time', 'Number of Guests')
plt.show()

In [None]:
# Pairwise scatter plots of price and number_of_reviews, coloured by the host_is_superhost code
sns.pairplot(listing_updated, vars=['price', 'number_of_reviews'], hue='host_is_superhost', kind='scatter', height=3)
plt.show()


In [None]:
# One colour per host_is_superhost code (0 = 'f', 1 = 't', 2 = originally missing)
custom_palette = {0.0: '#e6f598', 1.0: '#62BEB6', 2.0:'#077368'}

# Price against the year the host joined, coloured by superhost code
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x='host_since', y='price', hue='host_is_superhost', data=listing_updated, palette=custom_palette, ax=ax)
ax.set_title('Price vs. Host Experience (Superhost Status)')
ax.set_ylabel('Price')
ax.set_xlabel('Host Since (Year)')
plt.show()


In [None]:
import folium
from folium.plugins import MarkerCluster


def _price_color(price):
    """Return the marker colour for a price: below 100 blue, below 500 green, else red."""
    if price < 100:
        return 'blue'
    if price < 500:
        return 'green'
    return 'red'


# Bounds (in pixels) for the marker radius. host_listings_count is used as the radius,
# and without bounds a host with hundreds of listings yields a marker that hides the map.
MIN_MARKER_RADIUS = 2
MAX_MARKER_RADIUS = 20

# Create a base map centered around New York City
m = folium.Map(location=[40.7128, -74.0060], zoom_start=11)

# Create a marker cluster object
marker_cluster = MarkerCluster().add_to(m)

# Only the columns the map needs; rows without coordinates cannot be placed on the map
map_columns = ['latitude', 'longitude', 'price', 'host_listings_count', 'neighbourhood_cleansed']
map_rows = listing_updated[map_columns].dropna(subset=['latitude', 'longitude'])

# itertuples avoids building a Series per row, which iterrows does
for row in map_rows.itertuples(index=False):
    marker_color = _price_color(row.price)  # computed once, used for outline and fill

    # Marker size follows the host's number of listings, clamped to the radius bounds;
    # a missing count falls back to the minimum radius instead of passing NaN to folium
    if pd.isna(row.host_listings_count):
        marker_radius = float(MIN_MARKER_RADIUS)
    else:
        marker_radius = float(min(max(row.host_listings_count, MIN_MARKER_RADIUS), MAX_MARKER_RADIUS))

    folium.CircleMarker(
        location=[row.latitude, row.longitude],
        radius=marker_radius,
        popup=f"Price: ${row.price}<br>Host Listings: {row.host_listings_count}<br>Neighborhood: {row.neighbourhood_cleansed}",
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.6
    ).add_to(marker_cluster)

# Show the map inline
m


In [None]:


# Keep only rows that have both coordinates and a price
filtered_df = listing_updated.dropna(subset=['latitude', 'longitude', 'price'])

fig, ax = plt.subplots(figsize=(10, 8))

# One small, semi-transparent dot per listing, positioned by longitude/latitude
# and coloured by price
price_points = ax.scatter(
    filtered_df['longitude'], filtered_df['latitude'],
    c=filtered_df['price'], s=10, cmap='viridis', alpha=0.6
)

ax.set_title('Airbnb Listings in NYC: Price Distribution by Location')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

# Colour bar that maps dot colour to price
fig.colorbar(price_points, ax=ax, label='Price ($)')

plt.show()
