# Price prediction model of AIRBNB dataset

Airbnb is an online marketplace for arranging or offering homestays and tourism experiences.

###### Import the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

###### Importing the dataset

In [None]:
# Location of the listings CSV; this is an absolute path on the author's machine.
csv_path = "C:/Users/muska/Desktop/Data warehousing Project/AB_NYC_2019.csv"
df = pd.read_csv(csv_path)

###### Shape of the dataset

In [None]:
df.shape

# Descriptive analysis of the data 

Descriptive analysis answers the question, "What happened?"
It is used to summarize and explore the behavior of the data involved in the study.


In [None]:
# List every column label of the loaded DataFrame, one per line.
all_columns = df.columns
print("All the columns in the DataFrame:")
for column_name in all_columns:
    print(column_name)

##### Checking the variable types 
These are divided into two types:

1. Numeric data - Continuous data, Discrete data

2. Categorical data - Ordinal data, Nominal data

###### Categorical Data 

In [None]:
# Distinct values of the three categorical columns, in first-appearance order.
unique_neighbourhood_group = df['neighbourhood_group'].unique()
unique_neighbourhood = df['neighbourhood'].unique()
unique_room_type = df['room_type'].unique()

# Each section is printed as: separator line, numbered heading, then one value per line.
sections = (
    ("1.Neighbourhood groups:", unique_neighbourhood_group),
    ("2.Neighbourhoods:", unique_neighbourhood),
    ("3.Room types:", unique_room_type),
)

print("This is the CATEGORICAL DATA columns")
for heading, values in sections:
    print("----------------------------------------")
    print(heading)
    for value in values:
        print(value)

###### Numerical Data

In [None]:
# Every column stored with a numeric dtype.
numerical_cols = df.select_dtypes(include='number').columns

# Hand-picked discrete columns; every other numeric column is treated as continuous.
discrete_numerical_cols = ['minimum_nights', 'calculated_host_listings_count', 'price']
continuous_numerical_cols = [name for name in numerical_cols if name not in discrete_numerical_cols]

print("This is the NUMERICAL DATA columns")
print("----------------------------------------")
print("Discrete Numerical Data:")
for column in discrete_numerical_cols:
    # All distinct values of the column, comma separated on a single line.
    joined_values = ', '.join(str(value) for value in df[column].unique())
    print(f"{column} = {joined_values}")
    print("----------------------------------------")

# Only the two coordinate columns are summarised; other continuous columns are skipped.
print("Continuous Numerical Data:")
for column in continuous_numerical_cols:
    if column not in ('longitude', 'latitude'):
        continue
    lowest = df[column].min()
    highest = df[column].max()
    print(f"{column}: Min = {lowest}, Max = {highest}")


###### Frequency Distribution
The frequency distribution is an arrangement of values that one or more variables take in a sample. Each entry in the table contains the frequency or count of occurrences of values within a specific group or range, and so the table summarizes the distribution of the sample values.

The Percent column is computed with the following formula, where:

$p$ -> Percent

$\text{freq}_x$ -> Frequency of an element $x$

$\text{total freq}$ -> Sum of all frequencies

$$p = 100 \times \frac{\text{freq}_x}{\text{total freq}}$$

In [None]:
# Frequency table for neighbourhood_group: count and share of all rows,
# ordered from the least to the most frequent group.
freq_area = (
    df['neighbourhood_group']
    .value_counts()
    .rename_axis('neighbourhood_group')
    .reset_index(name='Frequency')
    .assign(Percent=lambda table: (table['Frequency'] / len(df)) * 100)
    .sort_values(by='Frequency')
    .reset_index(drop=True)
)
print(freq_area)

In [None]:
# Frequency table for neighbourhood: count and share of all rows,
# ordered from the least to the most frequent neighbourhood.
freq_area = (
    df['neighbourhood']
    .value_counts()
    .rename_axis('neighbourhood')
    .reset_index(name='Frequency')
    .assign(Percent=lambda table: (table['Frequency'] / len(df)) * 100)
    .sort_values(by='Frequency')
    .reset_index(drop=True)
)

print(freq_area)


In [None]:
# Frequency table for room_type: count and share of all rows,
# ordered from the least to the most frequent room type.
room_type = (
    df['room_type']
    .value_counts()
    .rename_axis('room_type')
    .reset_index(name='Frequency')
    .assign(Percent=lambda table: (table['Frequency'] / len(df)) * 100)
    .sort_values(by='Frequency')
    .reset_index(drop=True)
)

print(room_type)

###### CONCLUSION FROM THE FREQUENCY DISTRIBUTION

Neighbourhood_group/Location

1° Manhattan -> 21661(44.30%)
2° Brooklyn -> 20104(41.11%)
3° Queens -> 5666(11.58%)

neighbourhood/Area

1° Williamsburg -> 3920(8.01%)
2° Bedford-Stuyvesant -> 3714(7.59%)
3° Harlem -> 2658(5.43%)

room_type/listing space type

1° Entire home/apt -> 25409(51.96%)
2° Private room -> 22326(45.66%)
3° Shared room -> 1160(2.37%)



###### Now we check the effect of different factors on the price before predicting it

In [None]:
df.head()

In [None]:
df.describe(include="all")

In [None]:
df.info()

###### Checking Null Values

In [None]:
df.isnull().sum()

In [None]:
def nulls_summary_table(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by the column labels of ``df`` with two
    columns: ``null_count`` (number of null cells) and ``null_pct`` (that
    count divided by the number of rows, i.e. a fraction, not multiplied
    by 100).
    """
    null_counts = df.isnull().sum()
    return pd.DataFrame({
        'null_count': null_counts,
        'null_pct': null_counts / len(df),
    })
nulls_summary_table(df)

###### Drop the unnecessary columns

In [None]:
# Build the working frame without the identifier/free-text columns.
# drop() without inplace returns a new DataFrame, so df_copy is a real,
# independent object; a plain `df_copy = df` would only create a second name
# for the same frame and the drop would also remove these columns from df.
df_copy = df.drop(columns=['name', 'id', 'host_name', 'last_review'])

###### check the changes

In [None]:
df_copy.head()

In [None]:
# Missing 'reviews_per_month' values become 0; the frame is modified in place
# and no other column is touched.
fill_values = {'reviews_per_month': 0}
df_copy.fillna(value=fill_values, inplace=True)

In [None]:
#examing changes 
# df.reviews_per_month.isnull().sum()
df_copy.isnull().sum().sum()  #this is for overall 

###### Remove the NaN values from the dataset

In [None]:
# Per-column NaN counts. The result is discarded because this is not the last
# expression of the cell, so nothing is displayed for it.
df_copy.isnull().sum()
# Remove, in place, every row that still contains at least one NaN value.
df_copy.dropna(how='any',inplace=True)

###### Lets finally check the Result

In [None]:
df_copy.info() 
# all the values are non null now

In [None]:
# Removing where the price is equal to 0.
# .copy() makes the filtered result an independent frame, so the columns that
# are added to df_copy further down (e.g. 'price_log') are written to df_copy
# itself rather than to a slice of the unfiltered data.
df_copy = df_copy[df_copy['price'] != 0].copy()

In [None]:
df_copy.head()
# Count the rows whose price is exactly 0; prints 0 once they have been removed.
is_zero_price = df_copy['price'] == 0
zero_price_count = df_copy[is_zero_price]['price'].count()
print(zero_price_count)  #no value with the price 0 

## Data visualizations

######  Heatmap -> Get Correlation between different variables 

In [None]:
# Kendall rank correlation between the numeric columns only.
# df_copy still holds text columns at this point (neighbourhood_group,
# neighbourhood, room_type are plotted as categories below); correlating them
# raises on pandas >= 2.0, so they are excluded explicitly. select_dtypes is
# used rather than numeric_only= so the cell also runs on older pandas.
numeric_columns = df_copy.select_dtypes(include='number')
corr = numeric_columns.corr(method='kendall')

plt.figure(figsize=(15, 8))
sns.heatmap(corr, annot=True)
df.columns

##### Checking the effect of variables on price (comparing how each variable affects the model and identifying which parameters are important for describing it)

###### Neighbourhood Group

In [None]:
# Number of listings per neighbourhood group.
sns.countplot(data=df_copy, x='neighbourhood_group')
plt.title('Neighbourhood Group')

# Set the figure size
fig = plt.gcf()
fig.set_size_inches(5, 5)
# Show the plot
plt.show()

print("After this we check the relation of the price w.r.t to neighbourhood groups ")

median_color = "red"

# Median price per group. sort=False keeps the groups in first-appearance
# order, and the same order is passed to the box plot below, so the median at
# position i is guaranteed to belong to the box drawn at position i.
# (Previously the medians were sorted alphabetically and looked up by integer
# position, which attached them to the wrong boxes.)
medians = df.groupby('neighbourhood_group', sort=False)['price'].median()
group_order = list(medians.index)

plt.figure(figsize=(12, 6))
# Create the box plot with custom styling
ax = sns.boxplot(data=df, x='neighbourhood_group', y='price',
            order=group_order,
            boxprops={'edgecolor': median_color},  # Colour of the box outline
            flierprops={'marker': 'o', 'markerfacecolor': 'purple', 'markeredgecolor': 'purple'},  # Outlier markers
            medianprops={'color': median_color, 'linewidth': 2}  # Highlight the median line and make it bold
           )
plt.title('Price Distribution by Neighbourhood Group')
plt.xticks(rotation=45)  # To rotate x-axis labels for better visibility

# Write each group's median value at the height of its median line.
for position, group_name in enumerate(group_order):
    group_median = medians[group_name]
    ax.text(position, group_median, f'{group_median:.2f}', horizontalalignment='center', color='black', weight='bold')

plt.show()

import matplotlib.pyplot as plt
import seaborn as sns




In [None]:
import folium
import pandas as pd
import branca.colormap as cm

# Base map centred on New York City.
nyc_center = [40.7128, -74.0060]
m = folium.Map(location=nyc_center, zoom_start=11)

# Keep listings priced at most 1000 (this bound already implies "<= 10000").
filtered_price = df[df['price'] <= 1000]

# Green -> yellow -> red scale spanning the filtered price range.
price_color_scale = cm.LinearColormap(
    ['green', 'yellow', 'red'],
    vmin=filtered_price['price'].min(),
    vmax=filtered_price['price'].max(),
)

# One circle per listing, coloured by its price.
for listing in filtered_price.itertuples(index=False):
    marker_color = price_color_scale(listing.price)
    folium.CircleMarker(
        location=[listing.latitude, listing.longitude],
        radius=5,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.6,
        popup=f"Price: ${listing.price:.2f}<br>Neighborhood: {listing.neighbourhood_group}",
    ).add_to(m)

# Add the colour legend, then display the map as the cell output.
price_color_scale.add_to(m)

m


###### Room type vs Price 

In [None]:
# Number of listings per room type. The column is passed through data=/x=
# (as in the neighbourhood-group count plot) because current seaborn releases
# no longer accept the plotted vector as a bare positional argument.
sns.countplot(data=df_copy, x='room_type', palette="plasma")
fig = plt.gcf()
fig.set_size_inches(5,5)
plt.show()

print("Lets compare the room type effect on the price")

# One point per listing: room type on the x axis, price on the y axis.
plt.figure(figsize=(15,12))
sns.scatterplot(x='room_type', y='price', data=df)

plt.xlabel("Room Type", size=13)
plt.ylabel("Price", size=13)
plt.title("Room Type vs Price",size=15, weight='bold')
plt.show()

print("This shows that the shared room is having less price.Entire room is having the highest followd by private room ")


###### Room type vs price based on neighbourhood

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define room types
room_types = df_copy['room_type'].unique()

# Size the grid from the number of room types (two panels per row) instead of
# a fixed 2x2 layout, which left an empty panel for three room types and would
# raise IndexError for more than four. squeeze=False keeps `axes` 2-D even
# when there is a single row.
n_cols = 2
n_rows = max(1, -(-len(room_types) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False)
fig.suptitle("Room Type vs Price vs Neighbourhood Group", fontsize=16, fontweight='bold')
panels = axes.flatten()

# Create a scatter plot for each room type
for ax, room_type_name in zip(panels, room_types):
    # Filter the data for the specific room type
    data_group = df_copy[df_copy['room_type'] == room_type_name]

    sns.scatterplot(x="neighbourhood_group", y="price", hue="neighbourhood_group",
                    size="neighbourhood_group", sizes=(50, 200),
                    palette="Dark2", data=data_group, ax=ax)

    ax.set_title(f"Room Type: {room_type_name}")
    ax.set_xlabel("Neighbourhood Group")
    ax.set_ylabel("Price")

# Hide any panel that did not receive a room type.
for unused_ax in panels[len(room_types):]:
    unused_ax.set_visible(False)

# Adjust subplot layout (top margin leaves room for the figure title)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()


###### Price vs number of reviews

In [None]:
plt.figure(figsize=(20,20))
sns.set_palette("Set1")

# One line per neighbourhood group, drawn in this fixed order on the same axes.
boroughs = ('Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx')
for borough in boroughs:
    borough_rows = df_copy[df_copy['neighbourhood_group'] == borough]
    sns.lineplot(x='price', y='number_of_reviews', data=borough_rows, label=borough)

plt.xlabel("Price", size=13)
plt.ylabel("Number of Reviews", size=13)
plt.title("Price vs Number of Reviews vs Neighbourhood Group",size=15, weight='bold')
plt.show()

# low price high review

###### price vs availability_365 

In [None]:
# import folium
# import pandas as pd
# import branca.colormap as cm
# # Assuming 'df' is your DataFrame with 'latitude', 'longitude', 'price', and 'availability_365' columns

# # Create a Folium map centered around New York City
# m = folium.Map(location=[40.7128, -74.0060], zom_start=11)

# # Define a custom colormap with a wider range of colors
# colors = ['lightblue', 'blue', 'darkblue', 'purple', 'darkred', 'red', 'orange', 'yellow', 'green', 'darkgreen']
# price_min = df_copy['price'].min()
# price_max = df_copy['price'].max()
# colormap = cm.LinearColormap(colors=colors, vmin=price_min, vmax=price_max)

# # Loop through the DataFrame and add data points to the map
# for index, row in df.iterrows():
#     folium.CircleMarker(
#         location=[row['latitude'], row['longitude']],
#         radius=5,
#         color=colormap(row['price']),  # Color based on 'price'
#         fill=True,
#         fill_color=colormap(row['price']),  # Fill color based on 'price'
#         fill_opacity=0.6,
#         popup=f"Price: ${row['price']:.2f}, Availability: {row['availability_365']}"
#     ).add_to(m)

# # Add the color legend to the map
# colormap.add_to(m)

# # Display the map inside the Jupyter Notebook
# m


DATA MODELLING

In [None]:
df_copy.head()

Does the host listing count affect the price?
Let's dig a little deeper into why we ask this. The host listing count is the number of properties listed by a host. At first this does not seem like a factor worth considering, but a host with more properties is likely to be more experienced and more used to this work. Let's compare the host listings count with the number of reviews.
Let's see whether more experience means a higher price.

In [None]:
import matplotlib.pyplot as plt

# Scatter of price against the host's listing count, built on an explicit
# Axes object; alpha=0.5 makes overlapping points visible.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['calculated_host_listings_count'], df['price'], alpha=0.5)
ax.set_title('Price vs. Host Listing Count')
ax.set_xlabel('Host Listing Count')
ax.set_ylabel('Price')
ax.grid(True)

plt.show()

# no significnat effect shown 


In [None]:

from scipy.stats import norm
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from sklearn.metrics import r2_score

In [None]:
# Distribution of the raw price; `norm` (scipy.stats) is passed as the
# reference distribution to fit and draw over the histogram.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm it is still available in the installed version.
plt.figure(figsize=(10,10))
sns.distplot(df_copy['price'], fit=norm)
plt.title("Price Distribution Plot",size=15, weight='bold')

In [None]:
# New column: natural log of (price + 1); the +1 offset keeps a price of 0 finite.
df_copy['price_log'] = np.log(df_copy['price'] + 1)

In [None]:
# Distribution of the log-transformed price, again with `norm` (scipy.stats)
# passed as the reference distribution to fit.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm it is still available in the installed version.
plt.figure(figsize=(12,10))
sns.distplot(df_copy['price_log'], fit=norm)
plt.title("Log-Price Distribution Plot",size=15, weight='bold')

# Last expression of the cell: previews df (not df_copy).
df.head()

In [None]:
df.head()

In [None]:
df_copy.info()

In [None]:
# Columns that are not used as model inputs; removed from df_copy in place.
unused_features = [
    'host_id',
    'latitude',
    'longitude',
    'neighbourhood',
    'number_of_reviews',
    'reviews_per_month',
]
df_copy.drop(columns=unused_features, inplace=True)

In [None]:
# plt.figure(figsize=(7,7))
# stats.probplot(df_copy['price_log'], plot=plt)
# plt.show()

In [None]:
df_copy.isnull().sum()

###### checking Multicollinearity

In [None]:
# Eigen-decomposition of the correlation matrix computed for the heatmap.
# A correlation matrix is symmetric, so eigh is used instead of the general
# eig: it returns real eigenvalues sorted in ascending order, which puts any
# near-zero eigenvalue (the sign of multicollinearity) first.
# `multicollinearity` holds the eigenvalues, `V` the matching eigenvectors.
# NOTE(review): `corr` was built before the feature columns were dropped, so
# it still covers columns that are no longer in df_copy — confirm this is intended.
multicollinearity, V = np.linalg.eigh(corr)
multicollinearity

###### Encoding the data

In [None]:
def Encode(df_copy):
    """Replace the categorical columns with integer codes.

    'neighbourhood_group' and 'room_type' (whichever of them are present) are
    overwritten with the codes from ``Series.factorize``: values are numbered
    0, 1, 2, ... in order of first appearance. The frame passed in is modified
    in place and is also returned.
    """
    categorical_columns = ('neighbourhood_group', 'room_type')
    for column in df_copy.columns:
        if column not in categorical_columns:
            continue
        codes, _ = df_copy[column].factorize()
        df_copy[column] = codes
    return df_copy
df_copy_en = Encode(df_copy.copy())

In [None]:
df_copy_en.head()

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Predictors are selected by column position and the target by name.
# NOTE(review): position 2 is skipped, presumably the raw 'price' column —
# verify against the column order of df_copy_en.
feature_positions = [0, 1, 3, 4, 5]
x = df_copy_en.iloc[:, feature_positions]
y = df_copy_en['price_log']

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=353)

# Preview the first rows of the training predictors and target.
print(
    "Sample of x_train:",
    x_train.head(),
    "\nSample of y_train:",
    y_train.head(),
    sep="\n",
)

In [None]:
# Create and train a Linear Regression model
# Fit ordinary least squares on the training split and predict the held-out rows.
reg = LinearRegression().fit(x_train, y_train)
y_pred_linear = reg.predict(x_test)

# Test-set metrics; the target is the log-price, so the errors are in log units.
linear_regression_r2 = r2_score(y_test, y_pred_linear)
linear_rmse = np.sqrt(mean_squared_error(y_test, y_pred_linear))
linear_mae = mean_absolute_error(y_test, y_pred_linear)

print("\nR-squared Score for Linear Regression:", linear_regression_r2)
print("RMSE for Linear Regression:", linear_rmse)
print("MAE for Linear Regression:", linear_mae)

# Actual and predicted values side by side, indexed like y_test.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_linear})
print(comparison_df)


In [None]:
# Split the data into training and testing sets for Decision Tree Regression

# The tree is trained and scored on the x_train/x_test/y_train/y_test split
# created when x and y were defined. Re-splitting here with a different
# test_size and seed replaced those variables, so the models trained in the
# following cells were scored on a different test set than Linear Regression
# and their metrics were not comparable.

# Create and train a Decision Tree Regression model.
# min_samples_leaf=0.0001 is a fraction of the training rows required per leaf;
# random_state makes the fitted tree reproducible between runs.
DTree = DecisionTreeRegressor(min_samples_leaf=0.0001, random_state=105)
DTree.fit(x_train, y_train)

# Make predictions using the Decision Tree Regression model
y_pred_tree = DTree.predict(x_test)

# Calculate and print the R-squared score for Decision Tree Regression
decision_tree_r2 = r2_score(y_test, y_pred_tree)
print("\nR-squared Score for Decision Tree Regression:", decision_tree_r2)
tree_rmse = np.sqrt(mean_squared_error(y_test, y_pred_tree))
print("RMSE for Decision Tree Regression:", tree_rmse)

# Calculate MAE for Decision Tree Regression
tree_mae = mean_absolute_error(y_test, y_pred_tree)
print("MAE for Decision Tree Regression:", tree_mae)

# Create a DataFrame to display actual vs predicted values
comparison_df_tree = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_tree})

# Print the DataFrame
print(comparison_df_tree)

In [None]:
# Create and train a Lasso Regression model
# Create and train a Lasso Regression model (default regularisation strength)
lasso = Lasso()
lasso.fit(x_train, y_train)

# Make predictions using the Lasso Regression model
y_pred_lasso = lasso.predict(x_test)

# Calculate and print the R-squared score for Lasso Regression
lasso_r2 = r2_score(y_test, y_pred_lasso)
print("\nR-squared Score for Lasso Regression:", lasso_r2)

lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
print("RMSE for Lasso Regression:", lasso_rmse)

# Calculate MAE for Lasso Regression
lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
print("MAE for Lasso Regression:", lasso_mae)

# Actual vs predicted values, stored under a Lasso-specific name so the
# Decision Tree comparison frame (comparison_df_tree) is not overwritten.
comparison_df_lasso = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_lasso})

# Print the DataFrame
print(comparison_df_lasso)

In [None]:
# Create and train a Ridge Regression model
# Create and train a Ridge Regression model (default regularisation strength)
ridge = Ridge()
ridge.fit(x_train, y_train)

# Make predictions using the Ridge Regression model
y_pred_ridge = ridge.predict(x_test)

# Calculate and print the R-squared score for Ridge Regression
ridge_r2 = r2_score(y_test, y_pred_ridge)
print("\nR-squared Score for Ridge Regression:", ridge_r2)
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
print("RMSE for Ridge Regression:", ridge_rmse)
# Calculate MAE
ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
print("MAE for Ridge Regression:", ridge_mae)

# Actual vs predicted values, stored under a Ridge-specific name so the
# Decision Tree comparison frame (comparison_df_tree) is not overwritten.
comparison_df_ridge = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_ridge})

# Print the DataFrame
print(comparison_df_ridge)


In [None]:
import matplotlib.pyplot as plt

# 'price_log' is added to df_copy, not to df, and 'neighbourhood' is dropped
# from df_copy before modelling, so neither frame holds both columns.
# The log-price is taken from df_copy and the neighbourhood is looked up in df
# for the same row labels.
log_price = df_copy['price_log']
neighbourhood = df.loc[log_price.index, 'neighbourhood']

plt.scatter(log_price, neighbourhood)
# Axis labels describe the plotted columns.
plt.xlabel("Log price")
plt.ylabel("Neighbourhood")

In [None]:
# 