In [1]:
# Importing important libraries for this dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Data Cleaning

In [2]:
# Read the raw listings CSV into the working DataFrame
csv_path = 'AB_NYC_2019.csv'
df = pd.read_csv(csv_path)

### Inspecting the dataset

In [3]:
# Preview the first five rows to get a feel for the columns and values
df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [4]:
# Report the dataset dimensions.
# df.shape is a (n_rows, n_columns) tuple; the values are interpolated into the
# f-strings (previously the f-strings had no placeholders and the values were
# passed as separate print() arguments, which produced a doubled space).
rows, columns = df.shape
print(f"Rows: {rows}")
print(f"columns: {columns}")

Rows:  48895
columns:  16


In [5]:
# Print a structural overview of the DataFrame: index range, column names,
# non-null counts per column and dtypes. Useful for spotting columns with
# missing values and columns stored as generic `object` dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. Compare min/max against the quartiles to spot implausible values
# and heavy tails before deciding on outlier rules.
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [7]:
# Count distinct values in every object-dtype column to gauge its cardinality
text_columns = df.select_dtypes(include='object').columns
for col_name in text_columns:
    n_unique = df[col_name].nunique()
    print(f"Unique values in {col_name}: {n_unique}")

Unique values in name: 47905
Unique values in host_name: 11452
Unique values in neighbourhood_group: 5
Unique values in neighbourhood: 221
Unique values in room_type: 3
Unique values in last_review: 1764


In [8]:
# Number of missing entries per column
df.isna().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

### Handling missing values

In [10]:
# Replace missing listing names with the placeholder "Unknown".
# The result is re-bound to `df` instead of using inplace=True, which keeps the
# step explicit and chainable; only the `name` column is filled.
df = df.fillna({'name': "Unknown"})

In [11]:
# Verify that no missing values remain in the name column
null_names = df['name'].isna().sum()
print(f"Nulls in name column: {null_names}")

Nulls in name column: 0


In [12]:
# Drop rows with a missing host_name.
# Every listing should be attributable to a host, so rows where the host's name
# is blank are treated as incomplete records and removed.
# The result is re-bound to `df` rather than mutated with inplace=True.
df = df.dropna(subset=['host_name'])

In [13]:
# Verify that no missing values remain in the host_name column
null_host_names = df['host_name'].isna().sum()
print(f"nulls in host_name column: {null_host_names}")

nulls in host_name column: 0


In [14]:
# last_review is read from the CSV as object dtype; parse it into datetime64.
# Values that cannot be parsed become NaT (errors='coerce').
parsed_last_review = pd.to_datetime(df['last_review'], errors='coerce')
df['last_review'] = parsed_last_review

In [15]:
# Re-count missing entries per column after the fixes above
df.isna().sum()

id                                    0
name                                  0
host_id                               0
host_name                             0
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10047
reviews_per_month                 10047
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [16]:
# Count fully duplicated rows (every occurrence after the first is flagged)
duplicates = df.duplicated(keep='first').sum()
print(f"Duplicate rows: {duplicates}")

Duplicate rows: 0


### Outlier Detection and Handling

In [None]:
# Here we consider logical ranges for price, minimum_nights, and availability_365

In [18]:
# Price: keep only listings priced at most $1000 per night
df = df.loc[df['price'] <= 1000]

In [19]:
# Minimum nights: keep only listings whose minimum stay is at most 365 nights
df = df.loc[df['minimum_nights'] <= 365]

In [20]:
# Availability: keep only listings with at least one available day in the year;
# listings with zero availability are assumed to be inactive
df = df.loc[df['availability_365'] > 0]

### Standardize Text Data

Because string comparisons in Python are case-sensitive, we standardize the text columns to lowercase so that values differing only in capitalization are treated as the same.

In [21]:
# Lowercase the neighbourhood and host_name columns so values compare uniformly
for text_col in ('neighbourhood', 'host_name'):
    df[text_col] = df[text_col].str.lower()

### Drop Irrelevant Columns

In [22]:
# Drop identifier columns ('id', 'host_id') that are not used in the analysis.
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
# The result is re-bound to `df` rather than mutated with inplace=True.
df = df.drop(columns=['id', 'host_id'], errors='ignore')

### Data consistency validation

In [23]:
# A price of 0 most likely indicates missing or incorrect data; keep only
# listings with a strictly positive price
df = df.loc[df['price'] > 0]

In [24]:
# A listing with zero reviews cannot have a meaningful last_review date, so
# clear last_review (set to NaT) wherever number_of_reviews is 0.
# The check uses number_of_reviews rather than reviews_per_month: for listings
# without reviews reviews_per_month is missing (NaN), never 0 (its observed
# minimum is 0.01), so a `reviews_per_month == 0` condition matches no rows.
df.loc[df['number_of_reviews'] == 0, 'last_review'] = pd.NaT

### Re-verify Data Cleanliness

In [25]:
# Print summaries to confirm that the dataset is clean

In [26]:
# Final sanity report on the cleaned dataset: structure, remaining missing
# values, and descriptive statistics.
# df.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would emit a stray "None" line.
print("\nFinal dataset summary:")
df.info()
print("\nCheck for any remaining missing values:")
print(df.isnull().sum())
print("\nCheck descriptive statistics for any new potential outliers:")
print(df.describe())


Final dataset summary:
<class 'pandas.core.frame.DataFrame'>
Index: 31160 entries, 0 to 48894
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   name                            31160 non-null  object        
 1   host_name                       31160 non-null  object        
 2   neighbourhood_group             31160 non-null  object        
 3   neighbourhood                   31160 non-null  object        
 4   latitude                        31160 non-null  float64       
 5   longitude                       31160 non-null  float64       
 6   room_type                       31160 non-null  object        
 7   price                           31160 non-null  int64         
 8   minimum_nights                  31160 non-null  int64         
 9   number_of_reviews               31160 non-null  int64         
 10  last_review                     26047 non-null  dat