In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:navajowhite;">Importing Necessary Libraries</h1>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from yellowbrick.target import BalancedBinningReference
from yellowbrick.regressor import PredictionError, ResidualsPlot
import os
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the listing CSV into the frame that the rest of the notebook works on.
DATA_PATH = "../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv"
df = pd.read_csv(DATA_PATH)

<h1 style="background-color:navajowhite;">Data Analysis and Visualization</h1>

In [None]:
# Peek at the first two rows.
df.iloc[:2]

In [None]:
# (number of rows, number of columns) of the freshly loaded frame.
df.shape

In [None]:
# Count of missing values per column, shown as a one-column table.
missing_per_column = df.isna().sum()
missing_per_column.to_frame()

In [None]:
# Replace missing review rates with the column's most frequent value.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the in-place form mutates an
# intermediate object, is deprecated in recent pandas releases and has no
# effect on `df` once copy-on-write is enabled.
most_common_rate = df["reviews_per_month"].mode()[0]
df["reviews_per_month"] = df["reviews_per_month"].fillna(most_common_rate)

<h3 style="color:green">Now, let's dive into some of the columns and try to understand the data better.</h3>

<h3 style="color:red">1-Host Id

<p style="color:green">Airbnb is a platform that connects people who want to rent out their homes with people who are looking for accommodations in that locale. So, from "host_id" column, we can easily analyze the hosts who share the most posts.

In [None]:
# Horizontal bars for the five host ids that appear most often, smallest first.
top_host_counts = df["host_id"].value_counts().head().sort_values()
plt.figure(figsize=(10, 6), dpi=300)
top_hosts_ax = top_host_counts.plot(kind="barh", color="goldenrod", hatch="+", edgecolor="blue")
top_hosts_ax.set_xlabel("Number of Post", size=14)
top_hosts_ax.set_ylabel("Host ID", size=14)
top_hosts_ax.set_title("Top 5 Hosts With Most Posts", size=18)

<p style="color:green">From which neighborhoods did "host 219517861" share the most posts?

In [None]:
# Listings per neighbourhood for the rows whose host_id is 219517861.
single_host_rows = df.loc[df['host_id'] == 219517861]
single_host_counts = single_host_rows["neighbourhood"].value_counts()
plt.figure(figsize=(10, 6), dpi=300)
single_host_ax = single_host_counts.plot(
    kind="bar",
    figsize=(10, 6),
    color="skyblue",
    hatch="+",
    edgecolor="blue",
)
single_host_ax.set_xlabel("Neighbourhood", size=14)
single_host_ax.set_ylabel("Number of Post", size=14)
single_host_ax.set_title("Neighborhoods that 'host 219517861' shares the most", size=18)

<h3 style="color:red">2-Neighbourhood Group

<p style="color:green">Let's check how many distinct values we have.

In [None]:
# Report how many different neighbourhood groups occur in the data.
neighbourhood_group_count = df["neighbourhood_group"].nunique()
print("There are", neighbourhood_group_count, "distinct values.")

In [None]:
# Bar chart of listing counts per neighbourhood group, overlaid with the
# cumulative share (in percent, two decimals) on a secondary y-axis.
group_shares = df["neighbourhood_group"].value_counts(normalize=True)
cumulative = group_shares.cumsum().mul(100).round(2)
group_counts = df["neighbourhood_group"].value_counts().sort_values(ascending=False)

plt.figure(figsize=(10, 6), dpi=300)
ax1 = group_counts.plot(kind="bar", color="lightcoral", hatch="+", edgecolor="green")
ax2 = cumulative.plot(kind="line", color="blue", secondary_y=True, marker="o")

ax1.set_xlabel("Neighbourhood Group", size=14)
ax1.set_ylabel("Count", size=14)
ax2.set_ylabel("Cumulative (%)", size=14)
ax2.legend(["Cumulative"], loc=9)
plt.title("Neighbourhood Groups", size=18)

 <p style="color:green">As seen in the graph, the "neighbourhood_group" column is largely made up of Manhattan and Brooklyn.

 <p style="color:green">Let's check the average, min and max prices for each neighbourhood group!

In [None]:
# Mean / max / min price per neighbourhood group, with the smallest value of
# the "min" column highlighted.
# No plt.figure() call here: this cell renders a styled table, not a plot, and
# opening a figure only adds an empty "<Figure ... with 0 Axes>" to the output.
price_by_group = df.groupby("neighbourhood_group")["price"].agg(["mean", "max", "min"])
price_by_group.style.highlight_min(subset=["min"], color='tomato', axis=0)

 <p style="color:green">Some listings have a price of $0. Let's drop them.

In [None]:
# How many rows have a price of exactly 0.
zero_price_rows = df.loc[df["price"] == 0]
print("The price column contains",zero_price_rows.shape[0], "data with 0 values.")

In [None]:
# Keep only the rows whose price differs from 0.
has_nonzero_price = df["price"].ne(0)
df = df.loc[has_nonzero_price]

In [None]:
# Average price per neighbourhood group as horizontal bars, lowest first.
avg_price_by_group = df.groupby("neighbourhood_group")["price"].agg("mean").sort_values()
plt.figure(figsize=(10, 6), dpi=300)
avg_price_ax = avg_price_by_group.plot(kind="barh", color="aliceblue", hatch="+", edgecolor="green")
avg_price_ax.set_xlabel("Avg. Price($)", size=14)
avg_price_ax.set_ylabel("Neighbourhood Group", size=14)
avg_price_ax.set_title("Average Price of Neighbourhood Groups", size=18)

 <p style="color:green">The average price in Manhattan is considerably higher than in the other neighbourhood groups.

<h3 style="color:red">3-Neighbourhood

<p style="color:green">Let's check how many distinct values we have.

In [None]:
# Report how many different neighbourhoods occur in the data.
neighbourhood_count = df["neighbourhood"].nunique()
print("There are", neighbourhood_count, "distinct values.")

In [None]:
# The ten most frequent neighbourhoods as horizontal bars, smallest first.
top_neighbourhood_counts = df["neighbourhood"].value_counts().head(10).sort_values()
plt.figure(figsize=(10, 6), dpi=300)
top_neighbourhood_ax = top_neighbourhood_counts.plot(
    kind="barh", color="seashell", hatch="/", edgecolor="green"
)
top_neighbourhood_ax.set_xlabel("Count", size=14)
top_neighbourhood_ax.set_ylabel("Neighbourhood", size=14)
top_neighbourhood_ax.set_title("Top 10 Neighbourhood", size=18)

In [None]:
# The twenty neighbourhoods with the highest mean price, highest first.
top_20_n_p_mean_df = (
    df.groupby("neighbourhood")["price"]
    .agg("mean")
    .sort_values(ascending=False)
    .head(20)
)
plt.figure(figsize=(10, 6), dpi=300)
top_price_ax = top_20_n_p_mean_df.sort_values(ascending=False).plot(
    kind="bar", color="violet", hatch="/", edgecolor="green"
)
top_price_ax.set_xlabel("Neighbourhood", size=14)
top_price_ax.set_ylabel("Avg. Price($)", size=14)
top_price_ax.set_title("Top 20 Neighbourhood With Highest Avg. Prices", size=18)

<h3 style="color:red">4-Latitude and Longitude

In [None]:
# Listing coordinates, coloured by neighbourhood group.
plt.figure(figsize=(10, 6), dpi=300)
group_map_ax = sns.scatterplot(x="longitude", y="latitude", hue="neighbourhood_group", data=df)
group_map_ax.set_ylabel("Latitude", fontsize=14)
group_map_ax.set_xlabel("Longitude", fontsize=14)
group_map_ax.set_title(
    "Distribution of Neighbourhood Group with Respect to Latitude and Longitude",
    fontsize=18,
)
group_map_ax.legend(prop={"size": 12})

In [None]:
# Listing coordinates, coloured by room type.
plt.figure(figsize=(10, 6), dpi=300)
room_map_ax = sns.scatterplot(x="longitude", y="latitude", hue="room_type", data=df)
room_map_ax.set_ylabel("Latitude", fontsize=14)
room_map_ax.set_xlabel("Longitude", fontsize=14)
room_map_ax.set_title(
    "Distribution of Room Type with Respect to Latitude and Longitude",
    fontsize=18,
)
room_map_ax.legend(prop={"size": 12})

<h3 style="color:red">5-Room Type

In [None]:
# Bar chart of listing counts per room type, overlaid with the cumulative
# share (in percent) on a secondary y-axis.
room_type_shares = df["room_type"].value_counts(normalize=True)
cumulative = room_type_shares.cumsum().mul(100)
room_type_counts = df["room_type"].value_counts().sort_values(ascending=False)

plt.figure(figsize=(10, 6), dpi=300)
ax1 = room_type_counts.plot(kind="bar", color="lemonchiffon", hatch="x", edgecolor="plum")
ax2 = cumulative.plot(kind="line", color="deeppink", secondary_y=True, marker="o")

ax1.set_xlabel("Room Type", size=14)
ax1.set_ylabel("Count", size=14)
ax2.set_ylabel("Cumulative (%)", size=14)
ax2.legend(["Cumulative"], loc=9)
plt.title("Room Types", size=18)

<p style="color:green">In the room type column, only 2.36% of our data belongs to the shared room type.

In [None]:
# Mean price per room type as horizontal bars, lowest first.
r_p_mean_df = df.groupby("room_type")["price"].agg("mean")
plt.figure(figsize=(10, 6), dpi=300)
room_price_ax = r_p_mean_df.sort_values(ascending=True).plot(
    kind="barh", color="orange", hatch="x", edgecolor="black"
)
room_price_ax.set_xlabel("Avg. Price($)", size=14)
room_price_ax.set_ylabel("Room Type", size=14)
room_price_ax.set_title("Room Types With Avg. Prices", size=18)

<p style="color:green">The mean price of Entire home/apt is almost twice that of the others.

<h3 style="color:red">6-Price

<p style="color:green">Price will be our target column in this project. So, let's check the distribution of price.

In [None]:
# Side-by-side histograms (with KDE) of the price: raw on the left,
# log1p-transformed on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 8), dpi=300)
raw_price_ax, log_price_ax = ax

sns.histplot(data=df, x="price", kde=True, ax=raw_price_ax)
raw_price_ax.set_title("Price Distribution Before Log Transformation", size=16)

sns.histplot(np.log1p(df.price), kde=True, ax=log_price_ax)
log_price_ax.set_title("Price Distribution After Log Transformation", size=16)

<p style="color:green">The graph on the left shows a right-skewed distribution. To support better statistical analysis and to get better scores, we will apply a log transformation to the price column.

<h1 style="background-color:navajowhite;">Detection of Outliers</h1>

In [None]:
# One box plot per numeric column, laid out on a 2 x 3 grid.
columns = [
    "price",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
]
fig = plt.figure(figsize=(16, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for position, column_name in enumerate(columns, start=1):
    # Draw on the subplot created for this column instead of the implicit current axes.
    panel = fig.add_subplot(2, 3, position)
    ax = sns.boxplot(x=df[column_name], color='skyblue', ax=panel)

In [None]:
# Remove price outliers with the 1.5 * IQR rule: only rows strictly inside
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR) are kept.
q1_price = df["price"].quantile(0.25)
q3_price = df["price"].quantile(0.75)
iqr_price = q3_price - q1_price
lower_limit_price = q1_price - 1.5 * iqr_price
upper_limit_price = q3_price + 1.5 * iqr_price

within_price_limits = (df["price"] > lower_limit_price) & (df["price"] < upper_limit_price)

# .copy() makes the filtered frame independent of the frame it was cut from.
# Later cells overwrite the "price" column and drop columns in place; on a
# plain boolean-mask selection those writes raise SettingWithCopyWarning
# (hidden here only because warnings are globally silenced).
df_filter_price = df[within_price_limits].copy()

df = df_filter_price

<h1 style="background-color:navajowhite;">Correlation Matrix</h1>

In [None]:
# Lower-triangle heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(20, 10), dpi=300)
# The frame still holds text columns at this point. Since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises instead, so the
# numeric/bool columns are selected explicitly (works on old and new pandas).
df_corr = df.select_dtypes(include=["number", "bool"]).corr()
# Hide the upper triangle (diagonal included): it mirrors the lower one.
mask = np.triu(np.ones_like(df_corr, dtype=bool))
sns.heatmap(df_corr, mask=mask, annot=True, cmap="Oranges")
plt.tight_layout()

<h1 style="background-color:navajowhite;">Data Preparation</h1>

<p style="color:green">Let's apply a log transformation to the price column.

In [None]:
# Replace price with log1p(price). assign() builds a new frame rather than
# writing a column into `df`, which at this point is the result of boolean
# filtering; writing into such a selection triggers SettingWithCopyWarning.
# NOTE: this cell is not idempotent - running it twice applies the log twice.
df = df.assign(price=np.log1p(df["price"]))

In [None]:
# Open a 12 x 8 figure before building the visualizer; no ax is passed to it,
# so it presumably draws on the current figure - NOTE(review): confirm.
plt.figure(figsize=(12,8), dpi=300)
# Yellowbrick BalancedBinningReference fitted on the target column
# (by this cell "price" already holds the log1p-transformed values).
balancedbinning = BalancedBinningReference()
balancedbinning.fit(df["price"])
balancedbinning.show()

<p style="color:green">Let's drop the columns that will not affect the price prediction.

In [None]:
# Drop the identifier / free-text / date columns that are not used as features.
# The result is reassigned instead of using inplace=True: an in-place drop
# mutates a frame that may be a filtered selection of another frame
# (SettingWithCopyWarning) and brings no performance benefit.
unused_columns = ["id", "name", "host_id", "host_name", "last_review"]
df = df.drop(columns=unused_columns)

<h2 style="background-color:navajowhite;">Encoding</h2>

In [None]:
# One-hot encode the three categorical columns; each column's first level is
# dropped and the dummy columns are named with the short prefix given here.
dummy_prefixes = {"neighbourhood_group": "ng", "neighbourhood": "n", "room_type": "rt"}
df = pd.get_dummies(
    df,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
    drop_first=True,
)

In [None]:
# Target is the price column; features are every other column.
y = df["price"]
X = df.drop(columns="price")

<h2 style="background-color:navajowhite;">Feature Scaling</h2>

In [None]:
# Standardise every feature column. The scaler is fitted on all rows of X
# (no train/test separation exists at this point).
ss = StandardScaler().fit(X)
X_scaled = ss.transform(X)

<h2 style="background-color:navajowhite;">Splitting Data as Train Data and Test Data</h2>

In [None]:
# Split the *unscaled* features first, then fit the scaler on the training rows
# only. Splitting an array that was standardised with statistics of the whole
# dataset lets the test rows influence the mean/variance used for training
# (data leakage) and makes the test scores optimistic.
# The same random_state yields the same row assignment as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
ss = StandardScaler()
X_train = ss.fit_transform(X_train_raw)  # learn mean / std from the training rows
X_test = ss.transform(X_test_raw)        # reuse the training statistics on the test rows

<h1 style="background-color:navajowhite;">Model Building and Comparison</h1>

<p style="color:green" >Lasso Regression, Ridge Regression, Decision Tree Regressor and Random Forest Regressor models will be built in the model building section.

In [None]:
def models(X_train, X_test, y_train, y_test):
    """Fit four regressors and show a side-by-side comparison of their metrics.

    Ridge, Lasso, a decision tree and a random forest are each fitted on the
    training split. For every model the function records R^2 on the test and
    training splits, MAE / MSE / RMSE on the test split and a 5-fold
    cross-validated RMSE on the training split. The results are displayed as a
    styled table (best value per column highlighted) and as six line plots.

    Parameters
    ----------
    X_train, X_test
        Feature matrices of the training and test splits.
    y_train, y_test
        Targets matching the rows of ``X_train`` and ``X_test``.

    Returns
    -------
    None
        The table and the figure are displayed; nothing is returned.
    """
    metric_columns = ["Test Sc.", "Train Sc.", "MAE", "MSE", "RMSE", "RMSE CV"]

    # One entry per model; insertion order is the row order of the table.
    estimators = {
        "Ridge": Ridge(alpha=1.0),
        "Lasso": Lasso(alpha=0.0001),
        # Seeded so that ties between equally good splits are broken the same
        # way on every run and the table is reproducible.
        "DTR": DecisionTreeRegressor(min_samples_leaf=60, random_state=42),
        "RFR": RandomForestRegressor(random_state=42,
                                     n_estimators=100,
                                     min_samples_split=10,
                                     min_samples_leaf=1,
                                     max_features='sqrt',
                                     max_depth=30,
                                     bootstrap=True),
    }

    # Collect plain dict rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and row-by-row growth is quadratic anyway.
    rows = []
    for label, estimator in estimators.items():
        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        # The scorer returns negated MSE per fold; flip the sign of the mean
        # before taking the square root.
        cv_neg_mse = cross_val_score(estimator, X_train, y_train,
                                     scoring='neg_mean_squared_error', cv=5)

        rows.append({
            "Model": label,
            "Test Sc.": r2_score(y_test, y_pred),
            "Train Sc.": estimator.score(X_train, y_train),
            "MAE": mean_absolute_error(y_test, y_pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "RMSE CV": np.sqrt(-cv_neg_mse.mean()),
        })

    results = pd.DataFrame(rows, columns=["Model"] + metric_columns)

    # Lower is better for the error columns, higher is better for the scores.
    # `display` is not imported in this notebook; presumably the IPython builtin.
    display(results.style.highlight_min(subset=["MAE", "MSE", "RMSE", "RMSE CV"],
                                        color='springgreen', axis=0).highlight_max(
        subset=["Test Sc.", "Train Sc."], color='springgreen', axis=0))

    panel_titles = {
        "Test Sc.": "Test Scores Comparison",
        "Train Sc.": "Train Scores Comparison",
        "MAE": "MAE Scores Comparison",
        "MSE": "MSE Scores Comparison",
        "RMSE": "RMSE Scores Comparison",
        "RMSE CV": "RMSE CV Scores Comparison",
    }

    fig, axes = plt.subplots(3, 2, figsize=(16, 6), dpi=300)
    plt.subplots_adjust(hspace=1.2)
    # axes.flat walks the grid row by row, matching the order of metric_columns.
    for panel, column in zip(axes.flat, metric_columns):
        sns.lineplot(x=results["Model"], y=results[column], ax=panel, marker="o")
        panel.set_title(panel_titles[column], size=18)
        panel.tick_params(labelrotation=30)
    plt.show()

In [None]:
# Run the model comparison on the prepared train / test splits.
models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Residual plots (with Q-Q plot instead of histogram) for the four regressors,
# one per panel of a 2 x 2 grid.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 12), dpi=300)

fig.subplots_adjust(hspace=0.6, wspace=0.4)

# Estimator / target-axes pairs. The decision tree is seeded so that ties
# between equally good splits do not change the plot from run to run.
residual_panels = [
    (Ridge(alpha=1.0), ax1),
    (Lasso(alpha=0.0001), ax2),
    (DecisionTreeRegressor(min_samples_leaf=60, random_state=42), ax3),
    (RandomForestRegressor(random_state=42,
                           n_estimators=100,
                           min_samples_split=10,
                           min_samples_leaf=1,
                           max_features='sqrt',
                           max_depth=30,
                           bootstrap=True), ax4),
]

residual_visualizers = [
    ResidualsPlot(estimator, hist=False, qqplot=True,
                  train_color="orange", test_color="deepskyblue", ax=panel)
    for estimator, panel in residual_panels
]
ridge_visualizer, lasso_visualizer, dtr_visualizer, rfr_visualizer = residual_visualizers

# Keep the three phases separate and in order: fit every visualizer on the
# training split, score every one on the test split, then finalize them all.
for visualizer in residual_visualizers:
    visualizer.fit(X_train, y_train)

for visualizer in residual_visualizers:
    visualizer.score(X_test, y_test)

for visualizer in residual_visualizers:
    visualizer.finalize()

In [None]:
# Prediction-error plots for the four regressors, one per panel of a 2 x 2 grid.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 12), dpi=300)

fig.subplots_adjust(hspace=0.4, wspace=-0.5)

model_ridge = Ridge(alpha=1.0)

model_lasso = Lasso(alpha=0.0001)

# Seeded so that ties between equally good splits do not change the plot
# from run to run.
model_dtr = DecisionTreeRegressor(min_samples_leaf=60, random_state=42)

model_rfr = RandomForestRegressor(random_state=42,
                                  n_estimators=100,
                                  min_samples_split=10,
                                  min_samples_leaf=1,
                                  max_features='sqrt',
                                  max_depth=30,
                                  bootstrap=True)

prediction_error_panels = [
    (model_ridge, ax1),
    (model_lasso, ax2),
    (model_dtr, ax3),
    (model_rfr, ax4),
]
prediction_error_visualizers = [
    PredictionError(estimator, line_color="black", ax=panel)
    for estimator, panel in prediction_error_panels
]
(ridge_prediction_error, lasso_prediction_error,
 dtr_prediction_error, rfr_prediction_error) = prediction_error_visualizers

# Keep the three phases separate and in order: fit every visualizer on the
# training split, score every one on the test split, then finalize them all.
for visualizer in prediction_error_visualizers:
    visualizer.fit(X_train, y_train)

for visualizer in prediction_error_visualizers:
    visualizer.score(X_test, y_test)

for visualizer in prediction_error_visualizers:
    visualizer.finalize()