## 01. Import libraries

In [2]:
# Import libraries 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

## 02. Create file path

In [4]:
# Turn Project folder into path
# NOTE: absolute, machine-specific location; the file loads below are built from it
path = r'/Users/tanayawinder/Desktop/Case Study AirBnB'

In [5]:
# Check output
path

'/Users/tanayawinder/Desktop/Case Study AirBnB'

## 03. Import dataset

In [7]:
# Load the cleaned listing.csv data from its pickle in the Prepared Data folder
listing_pkl_path = os.path.join(path, '02 Data', 'Prepared Data', 'listing_csv_cleaned.pkl')
listing_clean = pd.read_pickle(listing_pkl_path)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/tanayawinder/Desktop/Case Study AirBnB\\02 Data\\Prepared Data\\listing_csv_cleaned.pkl'

In [None]:
# Check output
listing_clean.shape

In [None]:
# Preview the first rows of the cleaned listings
listing_clean.head()

## 04. Further cleaning to prepare data for analysis

In [None]:
# Remove the identifying columns ('name', 'host_name') to anonymize the data, together
# with two count columns that are not kept for the analysis
columns_to_remove = ['name', 'host_name', 'number_of_reviews_ltm', 'calculated_host_listings_count']
listing_anon = listing_clean.drop(columns=columns_to_remove)

In [None]:
# Load the cleaned listing.gz data from its pickle in the Prepared Data folder
gz_pkl_path = os.path.join(path, '02 Data', 'Prepared Data', 'listing_gz_cleaned.pkl')
gz_clean = pd.read_pickle(gz_pkl_path)

In [None]:
# Subset listing.gz to the 'id' key plus the extra variables wanted in the merge
gz_merge_columns = ['id', 'host_is_superhost', 'review_scores_rating', 'bathrooms', 'bedrooms', 'accommodates']
gz_for_merge = gz_clean[gz_merge_columns]

### Merge relevant variables in listing_gz with listing_csv

In [None]:
# Inner-join the anonymized listings with the listing.gz subset on the shared 'id' column,
# so only ids present in both DataFrames are kept
merged = pd.merge(listing_anon, gz_for_merge, on='id', how='inner')

In [None]:
# Lift the column display limit so wide DataFrames are shown in full
pd.options.display.max_columns = None

In [None]:
# Inspect the data type of every column in the merged DataFrame
merged.dtypes

In [None]:
# Recode the 't'/'f' flags in host_is_superhost as True/False.
# Values that are not keys of the mapping come out as NaN.
superhost_flags = {'t': True, 'f': False}
merged['host_is_superhost'] = merged['host_is_superhost'].map(superhost_flags)

## 05. Perform correlation matrix on numerical columns

In [None]:
# Correlate the float64/int64 columns of the merged data with one another
numeric_columns = merged.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()
correlation_matrix

### Create heatmap from correlation

In [None]:
# Recompute the correlations between the float64/int64 columns of the merged data
numeric_columns = merged.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()

# Draw an annotated heatmap using the colorblind-friendly 'cividis' palette
heatmap_options = {
    'annot': True,
    'fmt': ".2f",
    'cmap': 'cividis',
    'square': True,
    'cbar_kws': {"shrink": .8},
    'linecolor': 'black',
    'linewidths': 0.5,
}
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap')
plt.show()

### Explore correlations

In [None]:
# Rank every numeric variable by its correlation with price, highest first
correlation_matrix['price'].sort_values(ascending=False)

In [None]:
# Rank every numeric variable by its correlation with review_scores_rating, highest first
correlation_matrix['review_scores_rating'].sort_values(ascending=False)

## 06. Create scatterplot

In [None]:
# Scatterplot of 'accommodates' against 'price' with a fitted regression line (seaborn lmplot)

sns.lmplot(data=merged, x='accommodates', y='price')

### Explore outliers

In [None]:
# Pull up the listing with the highest price to inspect it as a possible outlier

top_price_label = merged['price'].idxmax()
highest_price_row = merged.loc[top_price_label]
highest_price_row

In [None]:
# Check the dimensions of the merged data
merged.shape

In [None]:
# Index label of the most expensive listing
max_price_index = merged['price'].idxmax()

# New DataFrame without that row; `merged` itself is left untouched
merged_dropped = merged.drop(index=max_price_index)

# Check the resulting dimensions
merged_dropped.shape

In [None]:
# Redraw the 'accommodates' vs 'price' scatterplot with regression line, without the top outlier

sns.lmplot(data=merged_dropped, x='accommodates', y='price')

In [None]:
# The five most expensive listings still in the data
top_five_max_price_rows = merged_dropped.nlargest(n=5, columns='price')

# Show them
top_five_max_price_rows

In [None]:
# Select the listings whose neighbourhood is 'SoHo'
is_soho = merged_dropped['neighbourhood'] == 'SoHo'
soho_neighborhood = merged_dropped[is_soho]

# Mean price within that subset
average_price_soho = soho_neighborhood['price'].mean()

# Report the result
print(f"The average price in the Soho neighborhood is: {average_price_soho}")

These three are also outliers to be removed. 

In [None]:
# Index labels of the three most expensive remaining listings
top_three_max_price_indices = merged_dropped.nlargest(n=3, columns='price').index

# Remove those rows from the working DataFrame
merged_dropped = merged_dropped.drop(index=top_three_max_price_indices)

In [None]:
# Check the dimensions after removing the outlier rows
merged_dropped.shape

### Create scatterplots

In [None]:
# Scatterplot of 'accommodates' against 'price' with a fitted regression line, outliers removed

sns.lmplot(data=merged_dropped, x='accommodates', y='price')

In [None]:
# Narrow the data to the three columns used in the scatterplots
scatter_columns = ['accommodates', 'bathrooms', 'price']
selected_columns_df = merged_dropped[scatter_columns]

In [None]:
# Keep only the rows priced below 7000
below_price_cap = selected_columns_df['price'].lt(7000)
filtered_df = selected_columns_df.loc[below_price_cap]

In [None]:
# Scatterplot of 'accommodates' against 'price' with a fitted regression line, prices below 7000 only

sns.lmplot(data=filtered_df, x='accommodates', y='price')

The scatter plot indicates that as the number of guests a property can accommodate increases, the price also tends to increase. This is evidenced by the slight upward trend in the plot, although the relationship appears to be weak to moderate at best. 

In [None]:
# Scatterplot of 'bathrooms' against 'price' with a fitted regression line
sns.lmplot(data=filtered_df, x='bathrooms', y='price')

In [None]:
# Keep only rows with 10 or fewer bathrooms.
# Rows where 'bathrooms' is missing fail the comparison and are removed as well.
at_most_ten_baths = filtered_df['bathrooms'].le(10)
filtered_df = filtered_df.loc[at_most_ten_baths]

In [None]:
# Redraw 'bathrooms' vs 'price' with a regression line after removing the 10+ bathroom rows
sns.lmplot(data=filtered_df, x='bathrooms', y='price')

Positive Trend: The plot shows a positive trend, meaning that as the number of bathrooms increases, the price of the property tends to increase as well. This is indicated by the upward slope of the line that fits the data points. Practically speaking, more bathrooms likely means a higher-end rental space. 

## 07. Create pair plot

In [None]:
# Apply the same price cap to the full working DataFrame: keep rows priced below 7000
below_price_cap = merged_dropped['price'].lt(7000)
merged_dropped = merged_dropped.loc[below_price_cap]

In [None]:
# Keep only rows with 10 or fewer bathrooms.
# Rows where 'bathrooms' is missing fail the comparison and are removed as well.
at_most_ten_baths = merged_dropped['bathrooms'].le(10)
merged_dropped = merged_dropped.loc[at_most_ten_baths]

In [None]:
# Leave out the superhost flag, the coordinates and the identifier columns
# before building the correlation heatmap and pair plot
excluded_columns = ['host_is_superhost', 'latitude', 'longitude', 'id', 'host_id']
merged_without_bool = merged_dropped.drop(columns=excluded_columns)

In [None]:
# Correlate the float64/int64 columns of the outlier-filtered data with one another
numeric_columns = merged_dropped.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()
correlation_matrix

In [None]:
# Correlations between the float64/int64 columns that remain in merged_without_bool
numeric_columns = merged_without_bool.select_dtypes(include=['float64', 'int64'])
correlation_matrix_2 = numeric_columns.corr()

# Draw an annotated heatmap using the colorblind-friendly 'cividis' palette
heatmap_options = {
    'annot': True,
    'fmt': ".2f",
    'cmap': 'cividis',
    'square': True,
    'cbar_kws': {"shrink": .8},
    'linecolor': 'black',
    'linewidths': 0.5,
}
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_2, **heatmap_options)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Rank the remaining numeric variables by their correlation with price, highest first
correlation_matrix_2['price'].sort_values(ascending=False)

In [None]:
# Rank the remaining numeric variables by their correlation with review_scores_rating, highest first
correlation_matrix_2['review_scores_rating'].sort_values(ascending=False)

#### Positive Relationships:

accommodates (0.464984): This is the strongest positive correlation, meaning that as the number of guests a property can accommodate increases, the price also tends to increase. Larger properties typically cost more.

bathrooms (0.311114): More bathrooms generally lead to higher prices, indicating that properties with more bathrooms are likely more spacious or luxurious.

bedrooms (0.248679): Similar to bathrooms, more bedrooms correlate with higher prices, though the relationship is not as strong.
    
availability_365 (0.065684): This weak positive correlation suggests that properties available more days of the year might have slightly higher prices, but the effect is small.

review_scores_rating (0.057604): Higher ratings are associated with slightly higher prices, but the relationship is very weak.

#### Negative Relationships:

number_of_reviews (-0.055883): There's a weak negative correlation, indicating that properties with more reviews might have slightly lower prices. This could be because more affordable properties tend to get booked (and reviewed) more often.
    
minimum_nights (-0.084870): Properties with higher minimum stay requirements tend to have slightly lower prices. Hosts might lower prices to attract longer stays, but this effect is also weak.

#### No Relationship:

reviews_per_month (0.011632): This correlation is almost zero, indicating that the frequency of reviews does not significantly impact pricing.

In [None]:
# Pair plot of the numeric columns with a fitted regression line on every off-diagonal panel.
# kind='reg' lets seaborn draw the regressions itself, instead of reading the scatter
# points back out of each Axes' first collection; line_kws keeps the fitted lines red.
g = sns.pairplot(
    merged_without_bool,
    kind='reg',
    plot_kws={'line_kws': {'color': 'red'}},
)

# Show the plot
plt.show()

## 08. Categorical plot

In [None]:
# Compare the number of listings across the ten most common neighbourhoods.
# 'neighbourhood' holds category labels, so only per-category counts are drawn:
# a KDE curve across unordered labels has no meaning, and a bin count does not
# apply to categorical data.

top_neighbourhoods = merged_dropped['neighbourhood'].value_counts().nlargest(10).index
filtered_data = merged_dropped[merged_dropped['neighbourhood'].isin(top_neighbourhoods)]

sns.histplot(filtered_data['neighbourhood'])
plt.title('Top 10 Neighbourhoods Histogram')
plt.xlabel('Neighbourhood')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

Top 10 neighborhoods split evenly between Manhattan and Brooklyn.

In [None]:
# Listing counts per neighbourhood group. The column is categorical, so no KDE curve
# or bin count is requested: neither is meaningful for unordered labels.
sns.histplot(merged_dropped['neighbourhood_group'])

In [None]:
# Listing counts per room type. The column is categorical, so no KDE curve
# or bin count is requested: neither is meaningful for unordered labels.
sns.histplot(merged_dropped['room_type'])

In [None]:
# Limit the histogram to listings priced at 2000 or below
within_price_cap = merged_dropped['price'].le(2000)
filtered_prices = merged_dropped[within_price_cap]

# 20-bin histogram of price with a KDE curve overlaid
sns.histplot(data=filtered_prices, x='price', bins=20, kde=True)
plt.title('Price Distribution Histogram')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Summary statistics for the outlier-filtered data
merged_dropped.describe()

In [None]:
# Check the final dimensions of the outlier-filtered data
merged_dropped.shape

In [None]:
# Save the pair plot held in `g` to a PNG file
g.savefig("out.png") 

# Again, the image will be saved in the working directory. 

### Variables to Explore Further:


#### Categorical:
price vs neighborhoods, neighborhood groups: this will provide valuable information on how listings are priced in each borough, and then in the neighborhoods within each borough.
 
price vs room type: this will provide key insights, such as whether location or type of rental has more effect on pricing.

#### Numerical:

price vs. accommodates, bathrooms, bedrooms: These variables show a clear positive trend with price, but the relationships are not strong. Exploring these further could reveal more about the specific factors driving higher prices, such as location or luxury features.

minimum_nights: The wide range in minimum nights and its weak relationship with price suggests there may be interesting patterns based on property type or location. Further analysis could uncover whether longer minimum stays are more common in certain markets or for specific property types.

availability_365: The uniform distribution suggests that some properties are available year-round, while others are not. Exploring this in relation to other factors (like price or reviews) could reveal strategies for maximizing bookings.
                                                                                                                                                                                                          

#### Questions

What factors drive price? 

Are there any noticeable seasonal trends? 

How much do ratings affect listings?

#### Hypothesis

Factors such as neighborhood group, size and type of rental property have the most effect on prices. Most listings will be around holidays such as New Year's. 