In [334]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib.patches as patches
# Required imports
import pandas as pd
pd.set_option('display.max_columns', None)  # This will display all columns


In [294]:
dfc = pd.read_csv('calendar.csv')
dfl = pd.read_csv('listings.csv')
dfr = pd.read_csv('reviews.csv')

# Airbnb Listings Data in Seattle

The following datasets focus on Airbnb listings data in Seattle:

1. **`dfc`**: Calendars Data
   - Lists each listing's daily availability and price between January 2016 and January 2017.


2. **`dfl`**: Detailed Listing Information
   - Includes data on prices, fees, location, amenities, and reviews for each listing.


3. **`dfr`**: Reviews Data
   - Contains detailed information about the reviews given to the listings, including ratings and text reviews.


# Target Research Questions

The following are the three target research questions:

1. What months witnessed the peak of the average listing nightly price in Seattle?
2. What are the top 3 listing neighborhoods in terms of average nightly price in Seattle?
3. What factors are most correlated with predicting the listing nightly price?


# Analysis Plan

- To answer Q1, we'll use the `dfc` dataset.
- To answer Q2 and Q3, we'll use the `dfl` dataset.
- The `dfr` dataset will not be necessary for this analysis, as we have ratings score data per listing in `dfl`; therefore, it will not be needed.


# Step 1: Data Assessment

## 1. dfc

In [295]:
print(dfc.head())
print(dfc.shape)

   listing_id        date available   price
0      241032  2016-01-04         t  $85.00
1      241032  2016-01-05         t  $85.00
2      241032  2016-01-06         f     NaN
3      241032  2016-01-07         f     NaN
4      241032  2016-01-08         f     NaN
(1393570, 4)


In [296]:
#checking for data types
dfc.dtypes

listing_id     int64
date          object
available     object
price         object
dtype: object

In [297]:
dfc.describe()

Unnamed: 0,listing_id
count,1393570.0
mean,5550111.0
std,2962274.0
min,3335.0
25%,3258213.0
50%,6118244.0
75%,8035212.0
max,10340160.0


In [298]:
#checking for nulls
dfc.isna().sum()

listing_id         0
date               0
available          0
price         459028
dtype: int64

In [299]:
#comparing to see if the nulls in price is only when it's not available
dfc[dfc['available']=='f'].isna().sum()

listing_id         0
date               0
available          0
price         459028
dtype: int64

In [300]:
#checking for duplicates
dfc.duplicated().sum()

0

### dfc Quality Assessment

- The `dfc` dataset is very clean; however, the data types need to be fixed.
- The null values in the price column are logical, as they indicate that the listing is not available for booking on the given day.
- During data wrangling, these rows will be removed.
- No duplicate rows found.


## 2. dfl

In [301]:
print(dfl.head())
print(dfl.shape)

        id                           listing_url       scrape_id last_scraped  \
0   241032   https://www.airbnb.com/rooms/241032  20160104002432   2016-01-04   
1   953595   https://www.airbnb.com/rooms/953595  20160104002432   2016-01-04   
2  3308979  https://www.airbnb.com/rooms/3308979  20160104002432   2016-01-04   
3  7421966  https://www.airbnb.com/rooms/7421966  20160104002432   2016-01-04   
4   278830   https://www.airbnb.com/rooms/278830  20160104002432   2016-01-04   

                                  name  \
0         Stylish Queen Anne Apartment   
1   Bright & Airy Queen Anne Apartment   
2  New Modern House-Amazing water view   
3                   Queen Anne Chateau   
4       Charming craftsman 3 bdm house   

                                             summary  \
0                                                NaN   
1  Chemically sensitive? We've removed the irrita...   
2  New modern house built in 2013.  Spectacular s...   
3  A charming apartment that sits at

In [302]:
#checking for data types
dfl.dtypes

id                                    int64
listing_url                          object
scrape_id                             int64
last_scraped                         object
name                                 object
summary                              object
space                                object
description                          object
experiences_offered                  object
neighborhood_overview                object
notes                                object
transit                              object
thumbnail_url                        object
medium_url                           object
picture_url                          object
xl_picture_url                       object
host_id                               int64
host_url                             object
host_name                            object
host_since                           object
host_location                        object
host_about                           object
host_response_time              

In [303]:
dfl.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,square_feet,guests_included,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,calculated_host_listings_count,reviews_per_month
count,3818.0,3818.0,3818.0,3816.0,3816.0,3818.0,3818.0,3818.0,3802.0,3812.0,3817.0,97.0,3818.0,3818.0,3818.0,3818.0,3818.0,3818.0,3818.0,3818.0,3171.0,3160.0,3165.0,3160.0,3167.0,3163.0,3162.0,0.0,3818.0,3191.0
mean,5550111.0,20160100000000.0,15785560.0,7.157757,7.157757,47.628961,-122.333103,3.349398,1.259469,1.307712,1.735394,854.618557,1.672603,2.369303,780.447617,16.786276,36.814825,58.082504,244.772656,22.223415,94.539262,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,,2.946307,2.078919
std,2962660.0,0.0,14583820.0,28.628149,28.628149,0.043052,0.031745,1.977599,0.590369,0.883395,1.13948,671.404893,1.31104,16.305902,1683.589007,12.173637,23.337541,34.063845,126.772526,37.730892,6.606083,0.698031,0.797274,0.595499,0.568211,0.629053,0.750259,,5.893029,1.822348
min,3335.0,20160100000000.0,4193.0,1.0,1.0,47.505088,-122.417219,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,20.0,2.0,3.0,2.0,2.0,4.0,2.0,,1.0,0.02
25%,3258256.0,20160100000000.0,3275204.0,1.0,1.0,47.609418,-122.35432,2.0,1.0,1.0,1.0,420.0,1.0,1.0,60.0,2.0,13.0,28.0,124.0,2.0,93.0,9.0,9.0,10.0,10.0,9.0,9.0,,1.0,0.695
50%,6118244.0,20160100000000.0,10558140.0,1.0,1.0,47.623601,-122.328874,3.0,1.0,1.0,1.0,750.0,1.0,2.0,1125.0,20.0,46.0,73.0,308.0,9.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0,,1.0,1.54
75%,8035127.0,20160100000000.0,25903090.0,3.0,3.0,47.662694,-122.3108,4.0,1.0,2.0,2.0,1200.0,2.0,2.0,1125.0,30.0,59.0,89.0,360.0,26.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0,,2.0,3.0
max,10340160.0,20160100000000.0,53208610.0,502.0,502.0,47.733358,-122.240607,16.0,8.0,7.0,15.0,3000.0,15.0,1000.0,100000.0,30.0,60.0,90.0,365.0,474.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,,37.0,12.15


In [304]:
# Too many columns to display, viewing them alone
dfl.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', '

In [305]:
#checking for nulls
dfl.isna().sum()

id                                     0
listing_url                            0
scrape_id                              0
last_scraped                           0
name                                   0
summary                              177
space                                569
description                            0
experiences_offered                    0
neighborhood_overview               1032
notes                               1606
transit                              934
thumbnail_url                        320
medium_url                           320
picture_url                            0
xl_picture_url                       320
host_id                                0
host_url                               0
host_name                              2
host_since                             2
host_location                          8
host_about                           859
host_response_time                   523
host_response_rate                   523
host_acceptance_

In [306]:
#creating a separate df to view columns with nulls
na_summary = dfl.isna().sum().reset_index()
na_summary.columns = ['Column', 'Missing Values']
na_summary = na_summary[na_summary['Missing Values'] != 0]
na_summary

Unnamed: 0,Column,Missing Values
5,summary,177
6,space,569
9,neighborhood_overview,1032
10,notes,1606
11,transit,934
12,thumbnail_url,320
13,medium_url,320
15,xl_picture_url,320
18,host_name,2
19,host_since,2


In [307]:
#creating a separate df to view columns dtypes
pd.set_option('display.max_rows', None)

dtype_summary = dfl.dtypes.reset_index()
dtype_summary.columns = ['Column', 'dtype']
dtype_summary

Unnamed: 0,Column,dtype
0,id,int64
1,listing_url,object
2,scrape_id,int64
3,last_scraped,object
4,name,object
5,summary,object
6,space,object
7,description,object
8,experiences_offered,object
9,neighborhood_overview,object


In [308]:
dfl.duplicated().sum()

0

### dfl Quality Assessment

- The `dfl` dataset has 92 columns, a lot of the columns are duplicates of eachother, and many of them will not provide value to the analysis, therefore they'll be dropped.
- dtypes will need some augmenting, especially for numerical features that are being stores as objects, and some times as floats instead of integers.
- No duplicates.
- Null values are present in a lot of columns, they'll be dealt with after dropping the unnecessary features first.


# Step 2: Data Cleaning & Wrangling

## 1. dfc

In [309]:
dfc.dtypes

listing_id     int64
date          object
available     object
price         object
dtype: object

In [310]:
# formatting price as number and removing any syntax
dfc['price'] = dfc['price'].replace({r'\$': '', r',': ''}, regex=True).astype(float)
#confirming the dtype change
dfc.dtypes

listing_id      int64
date           object
available      object
price         float64
dtype: object

In [311]:
# Switching the date column to datetime
dfc['date'] = pd.to_datetime(dfc['date'])

#confirming the dtype change
dfc.dtypes


listing_id             int64
date          datetime64[ns]
available             object
price                float64
dtype: object

In [312]:
# To use dfc in answering Q1, we'll need to regroup the df to focus only on the timeseries data rather than the listing data.
# First, we filter for data rows where it was available, and therefore had a price point.
available = dfc[dfc['available']=='t']

In [313]:
# Create a new column that extracts the month and year from the 'date' column
available['month_year'] = available['date'].dt.to_period('M').dt.to_timestamp()
available.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  available['month_year'] = available['date'].dt.to_period('M').dt.to_timestamp()


Unnamed: 0,listing_id,date,available,price,month_year
0,241032,2016-01-04,t,85.0,2016-01-01
1,241032,2016-01-05,t,85.0,2016-01-01
9,241032,2016-01-13,t,85.0,2016-01-01
10,241032,2016-01-14,t,85.0,2016-01-01
14,241032,2016-01-18,t,85.0,2016-01-01


In [314]:
#creating a grouped df for the calendar month and the average price.
grouped = available.groupby('month_year')['price'].mean().round(2).reset_index()

grouped.head()

Unnamed: 0,month_year,price
0,2016-01-01,121.57
1,2016-02-01,124.29
2,2016-03-01,128.64
3,2016-04-01,135.1
4,2016-05-01,139.54


## 2. dfl

In [315]:
dtype_summary

Unnamed: 0,Column,dtype
0,id,int64
1,listing_url,object
2,scrape_id,int64
3,last_scraped,object
4,name,object
5,summary,object
6,space,object
7,description,object
8,experiences_offered,object
9,neighborhood_overview,object


In [316]:
# Issue 1: Too many features. Analyzed the columns and decided on the set of columns that are necessary for the anaylsis.
# and these will be stored in a new df called ndfl.

ndfl = dfl[['id','experiences_offered','host_is_superhost','neighbourhood_cleansed',
       'neighbourhood_group_cleansed','zipcode','property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet',
       'price','number_of_reviews','review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value']]
ndfl.sample(10)

Unnamed: 0,id,experiences_offered,host_is_superhost,neighbourhood_cleansed,neighbourhood_group_cleansed,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
1046,7664565,none,t,Atlantic,Central Area,98144,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",,$60.00,25,98.0,10.0,10.0,10.0,10.0,9.0,10.0
1181,4824699,none,f,Lawton Park,Magnolia,98199,House,Entire home/apt,4,1.0,0.0,2.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Free Pa...",,$147.00,0,,,,,,,
181,2339333,none,f,West Woodland,Ballard,98107,House,Entire home/apt,8,3.5,4.0,4.0,Real Bed,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",,$350.00,6,100.0,10.0,10.0,10.0,10.0,9.0,9.0
605,143704,none,f,Mann,Central Area,98122,House,Entire home/apt,3,1.0,1.0,2.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Free Pa...",,$82.00,181,95.0,9.0,10.0,10.0,10.0,9.0,9.0
1739,8703208,none,t,Alki,West Seattle,98116,House,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",,$119.00,4,100.0,10.0,10.0,10.0,10.0,10.0,10.0
3487,10060388,none,f,South Park,Other neighborhoods,98108,House,Entire home/apt,6,1.5,3.0,3.0,Real Bed,"{TV,Internet,""Wireless Internet"",""Air Conditio...",,$100.00,0,,,,,,,
2285,7534300,none,f,Mount Baker,Rainier Valley,98144,Bed & Breakfast,Private room,3,2.0,1.0,2.0,Real Bed,"{""Cable TV"",""Wireless Internet"",""Air Condition...",,$125.00,6,100.0,10.0,10.0,10.0,10.0,10.0,10.0
2848,9328692,none,f,Broadway,Capitol Hill,98102,Apartment,Entire home/apt,1,1.0,0.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",He...",,$49.00,0,,,,,,,
2182,6546748,none,f,Dunlap,Rainier Valley,98118,House,Entire home/apt,6,3.0,3.0,3.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",,$295.00,7,100.0,9.0,9.0,10.0,9.0,9.0,10.0
136,1364440,none,f,West Woodland,Ballard,98107,Apartment,Private room,1,1.0,1.0,1.0,Futon,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",,$55.00,20,100.0,10.0,10.0,10.0,10.0,9.0,10.0


In [317]:
ndfl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            3818 non-null   int64  
 1   experiences_offered           3818 non-null   object 
 2   host_is_superhost             3816 non-null   object 
 3   neighbourhood_cleansed        3818 non-null   object 
 4   neighbourhood_group_cleansed  3818 non-null   object 
 5   zipcode                       3811 non-null   object 
 6   property_type                 3817 non-null   object 
 7   room_type                     3818 non-null   object 
 8   accommodates                  3818 non-null   int64  
 9   bathrooms                     3802 non-null   float64
 10  bedrooms                      3812 non-null   float64
 11  beds                          3817 non-null   float64
 12  bed_type                      3818 non-null   object 
 13  ame

In [318]:
# Looking into experiences offered column since visual inspection showed a lot of 'None', values.
ndfl.experiences_offered.value_counts()

experiences_offered
none    3818
Name: count, dtype: int64

In [319]:
# The entire column has the value 'None', therefore should be dropped
ndfl.drop(columns='experiences_offered', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl.drop(columns='experiences_offered', inplace=True)


In [320]:
ndfl.host_is_superhost.value_counts()

host_is_superhost
f    3038
t     778
Name: count, dtype: int64

In [321]:
# Convert superhost values to boolean 0 and 1
ndfl['host_is_superhost'] = ndfl['host_is_superhost'].replace({'f': 0, 't': 1})

# Verify the change
ndfl['host_is_superhost'].value_counts()

  ndfl['host_is_superhost'] = ndfl['host_is_superhost'].replace({'f': 0, 't': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl['host_is_superhost'] = ndfl['host_is_superhost'].replace({'f': 0, 't': 1})


host_is_superhost
0.0    3038
1.0     778
Name: count, dtype: int64

In [322]:
# Removing "_cleansed" from the column name
ndfl.rename(columns={'neighbourhood_cleansed': 'neighbourhood', 
                    'neighbourhood_group_cleansed': 'neighbourhood_group'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl.rename(columns={'neighbourhood_cleansed': 'neighbourhood',


In [323]:
ndfl.amenities.value_counts()

amenities
{}                                                                                                                                                                                                                                                                                                                                                                                                                                                                           45
{Internet,"Wireless Internet",Kitchen,"Free Parking on Premises",Heating,Washer,Dryer}                                                                                                                                                                                                                                                                                                                                                                                       11
{TV,"Cable TV",Internet,"Wireless Internet",Pool,Kitchen,"Free

In [324]:
# Strip all special characters from the values in the amenities field
# Create a list with all the unique amenities found in the df
# Create a column for each amenity and if it was in the corresponding amenity column value, it becomes 1, else 0.

# Step 1: Clean and convert 'amenities' into sets
# This will split the amenities by commas properly
ndfl.loc[:, 'amenities'] = ndfl['amenities'].apply(lambda x: set(x.strip('{}').replace('"', '').split(',')))

# Step 2: Get the list of all unique amenities
all_amenities = set().union(*ndfl['amenities'])

# Step 3: Create a column for each unique amenity and set value 1 if present, otherwise 0
for amenity in all_amenities:
    ndfl.loc[:, amenity] = ndfl['amenities'].apply(lambda x: 1 if amenity.strip() in x else 0)

# Dropping the original 'amenities' column.
ndfl.drop('amenities', axis=1, inplace=True)

# Display the updated DataFrame (for checking results)
ndfl.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl.loc[:, amenity] = ndfl['amenities'].apply(lambda x: 1 if amenity.strip() in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl.loc[:, amenity] = ndfl['amenities'].apply(lambda x: 1 if amenity.strip() in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl.loc[:, amenity] = 

Unnamed: 0,id,host_is_superhost,neighbourhood,neighbourhood_group,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,square_feet,price,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,Unnamed: 23,Smoking Allowed,Hair Dryer,Family/Kid Friendly,Heating,Essentials,Kitchen,Carbon Monoxide Detector,Suitable for Events,Dryer,Other pet(s),Doorman,Washer / Dryer,Gym,Cat(s),Fire Extinguisher,Shampoo,Internet,Breakfast,Safety Card,Wheelchair Accessible,TV,Washer,Pets Allowed,Indoor Fireplace,Elevator in Building,Laptop Friendly Workspace,First Aid Kit,Free Parking on Premises,Hangers,Cable TV,Buzzer/Wireless Intercom,Hot Tub,Smoke Detector,Iron,Lock on Bedroom Door,Dog(s),24-Hour Check-in,Air Conditioning,Pool,Wireless Internet,Pets live on this property
0,241032,0.0,West Queen Anne,Queen Anne,98119,Apartment,Entire home/apt,4,1.0,1.0,1.0,Real Bed,,$85.00,207,95.0,10.0,10.0,10.0,10.0,9.0,10.0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,953595,1.0,West Queen Anne,Queen Anne,98119,Apartment,Entire home/apt,4,1.0,1.0,1.0,Real Bed,,$150.00,43,96.0,10.0,10.0,10.0,10.0,10.0,10.0,0,0,0,1,1,1,1,1,0,1,0,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0
2,3308979,0.0,West Queen Anne,Queen Anne,98119,House,Entire home/apt,11,4.5,5.0,7.0,Real Bed,,$975.00,20,97.0,10.0,10.0,10.0,10.0,10.0,10.0,0,0,0,1,1,1,1,1,0,1,0,0,0,0,1,0,1,1,0,0,0,1,1,1,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,1
3,7421966,0.0,West Queen Anne,Queen Anne,98119,Apartment,Entire home/apt,3,1.0,0.0,2.0,Real Bed,,$100.00,0,,,,,,,,0,0,0,1,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,278830,0.0,West Queen Anne,Queen Anne,98119,House,Entire home/apt,6,2.0,3.0,3.0,Real Bed,,$450.00,38,92.0,9.0,9.0,10.0,10.0,9.0,9.0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0


In [325]:
ndfl.drop(columns='', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl.drop(columns='', inplace=True)


In [326]:
ndfl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 63 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           3818 non-null   int64  
 1   host_is_superhost            3816 non-null   float64
 2   neighbourhood                3818 non-null   object 
 3   neighbourhood_group          3818 non-null   object 
 4   zipcode                      3811 non-null   object 
 5   property_type                3817 non-null   object 
 6   room_type                    3818 non-null   object 
 7   accommodates                 3818 non-null   int64  
 8   bathrooms                    3802 non-null   float64
 9   bedrooms                     3812 non-null   float64
 10  beds                         3817 non-null   float64
 11  bed_type                     3818 non-null   object 
 12  square_feet                  97 non-null     float64
 13  price             

In [327]:
# Remove dollar sign and commas in price column, then convert to float
ndfl['price'] = dfl['price'].replace({'\$': '', ',': ''}, regex=True).astype(float).round(2)

# Verify the conversion
print(ndfl['price'].head())

0     85.0
1    150.0
2    975.0
3    100.0
4    450.0
Name: price, dtype: float64


  ndfl['price'] = dfl['price'].replace({'\$': '', ',': ''}, regex=True).astype(float).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl['price'] = dfl['price'].replace({'\$': '', ',': ''}, regex=True).astype(float).round(2)


In [328]:
# reviewing Nulls in updated ndfl
print(ndfl.isna().sum().sort_values(ascending=False))
print(ndfl.shape)

square_feet                    3721
review_scores_accuracy          658
review_scores_checkin           658
review_scores_value             656
review_scores_location          655
review_scores_cleanliness       653
review_scores_communication     651
review_scores_rating            647
bathrooms                        16
zipcode                           7
bedrooms                          6
host_is_superhost                 2
beds                              1
property_type                     1
Pets Allowed                      0
TV                                0
Free Parking on Premises          0
First Aid Kit                     0
Wheelchair Accessible             0
Washer                            0
Laptop Friendly Workspace         0
Elevator in Building              0
Cable TV                          0
Indoor Fireplace                  0
Hangers                           0
id                                0
Buzzer/Wireless Intercom          0
Hot Tub                     

**Null analysis in ndfl**

Total Columns: 63, Total Rows: 3818
1. square_feet: 3721 Nulls - The majority of this column is Null. Should be dropped, escpecially that other columns like bedrooms, bathrooms and property type would still provide an equivalent contribution to the size of the listing.
2. bathrooms, zipcode, bedrooms, host_is_superhost, beds & property_type columns have null values between 1 and 16. Rows should be dropped.
3.  review_scores columns: Null values range between 647 and 658. Imputation using KNN transformation would be adequate in this scenario.


In [329]:
#Dropping the square_feet column
ndfl.drop(columns='square_feet',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndfl.drop(columns='square_feet',inplace=True)


In [330]:
ndfl.isna().sum()

id                               0
host_is_superhost                2
neighbourhood                    0
neighbourhood_group              0
zipcode                          7
property_type                    1
room_type                        0
accommodates                     0
bathrooms                       16
bedrooms                         6
beds                             1
bed_type                         0
price                            0
number_of_reviews                0
review_scores_rating           647
review_scores_accuracy         658
review_scores_cleanliness      653
review_scores_checkin          658
review_scores_communication    651
review_scores_location         655
review_scores_value            656
Smoking Allowed                  0
Hair Dryer                       0
Family/Kid Friendly              0
Heating                          0
Essentials                       0
Kitchen                          0
Carbon Monoxide Detector         0
Suitable for Events 

In [331]:
#Dropping rows with Nulls in bathrooms, zipcode, bedrooms, host_is_superhost, beds & property_type columns
ndfl = ndfl.dropna(subset=['bathrooms', 'zipcode', 'bedrooms', 'host_is_superhost', 'beds', 'property_type'], axis=0)

In [332]:
# Replacing Nulls in review score columns with the mean of each column.
# Specify the review score columns
review_score_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value']

# Perform mean imputation
for x in review_score_columns:
    mean_value = ndfl[x].mean()  # Calculate mean of the column
    ndfl[x].fillna(mean_value, inplace=True)  # Fill NaN with the mean value


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ndfl[x].fillna(mean_value, inplace=True)  # Fill NaN with the mean value


In [335]:
#switching bathrooms, bedrooms & beds dtype to integer

ndfl['bathrooms'] = ndfl['bathrooms'].astype(int)  
ndfl['bedrooms'] = ndfl['bedrooms'].astype(int)    
ndfl['beds'] = ndfl['beds'].astype(int)            
ndfl.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3786 entries, 0 to 3817
Data columns (total 62 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           3786 non-null   int64  
 1   host_is_superhost            3786 non-null   float64
 2   neighbourhood                3786 non-null   object 
 3   neighbourhood_group          3786 non-null   object 
 4   zipcode                      3786 non-null   object 
 5   property_type                3786 non-null   object 
 6   room_type                    3786 non-null   object 
 7   accommodates                 3786 non-null   int64  
 8   bathrooms                    3786 non-null   int64  
 9   bedrooms                     3786 non-null   int64  
 10  beds                         3786 non-null   int64  
 11  bed_type                     3786 non-null   object 
 12  price                        3786 non-null   float64
 13  number_of_reviews      

In [276]:
ndfl.head()

Unnamed: 0,id,host_is_superhost,neighbourhood,neighbourhood_group,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,square_feet,price,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,Smoking Allowed,Hair Dryer,Family/Kid Friendly,Heating,Essentials,Kitchen,Carbon Monoxide Detector,Suitable for Events,Dryer,Other pet(s),Doorman,Washer / Dryer,Gym,Cat(s),Fire Extinguisher,Shampoo,Internet,Breakfast,Safety Card,Wheelchair Accessible,TV,Washer,Pets Allowed,Indoor Fireplace,Elevator in Building,Laptop Friendly Workspace,First Aid Kit,Free Parking on Premises,Hangers,Cable TV,Buzzer/Wireless Intercom,Hot Tub,Smoke Detector,Iron,Lock on Bedroom Door,Dog(s),24-Hour Check-in,Air Conditioning,Pool,Wireless Internet,Pets live on this property
0,241032,0.0,West Queen Anne,Queen Anne,98119,Apartment,Entire home/apt,4,1.0,1.0,1.0,Real Bed,,85.0,207,95.0,10.0,10.0,10.0,10.0,9.0,10.0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,953595,1.0,West Queen Anne,Queen Anne,98119,Apartment,Entire home/apt,4,1.0,1.0,1.0,Real Bed,,150.0,43,96.0,10.0,10.0,10.0,10.0,10.0,10.0,0,0,1,1,1,1,1,0,1,0,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0
2,3308979,0.0,West Queen Anne,Queen Anne,98119,House,Entire home/apt,11,4.5,5.0,7.0,Real Bed,,975.0,20,97.0,10.0,10.0,10.0,10.0,10.0,10.0,0,0,1,1,1,1,1,0,1,0,0,0,0,1,0,1,1,0,0,0,1,1,1,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,1
3,7421966,0.0,West Queen Anne,Queen Anne,98119,Apartment,Entire home/apt,3,1.0,0.0,2.0,Real Bed,,100.0,0,,,,,,,,0,0,1,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,278830,0.0,West Queen Anne,Queen Anne,98119,House,Entire home/apt,6,2.0,3.0,3.0,Real Bed,,450.0,38,92.0,9.0,9.0,10.0,10.0,9.0,9.0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0
