## Libraries used for EDA

In [1]:
import pandas as pd #data manipulation and analysis

## Overview of the Dataset

In [2]:
# Location of the raw reviews export on the mounted drive.
raw_reviews_path = '/content/drive/MyDrive/Airbnb/Airbnb_original_data/reviews2024.csv'

# Load the CSV into a DataFrame using pandas' default parsing options.
reviews = pd.read_csv(raw_reviews_path)

# Preview the first five rows (head's default) as the cell output.
reviews.head(n=5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,31094,79346,2010-08-16,171607,Ben,"We had a great stay. Conveniently located, qui..."
1,31094,166275,2011-01-05,306860,Makita,It was a very good stay. The appartment was re...
2,31094,1452299,2012-06-10,1321058,Pierre,Really enjoyed my time at Ebbe's place. It is...
3,31094,6766430,2013-08-24,2182771,Sussie,"The apartment was very well located, 10-15 min..."
4,31094,6827217,2013-08-26,8025926,Wil,"This is a great flat, very clean with everythi..."


In [3]:
# Understand the structure of the dataset: its dimensions, columns and data types.
# DataFrame.info() writes its summary directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would append a stray "None" line.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366636 entries, 0 to 366635
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     366636 non-null  int64 
 1   id             366636 non-null  int64 
 2   date           366636 non-null  object
 3   reviewer_id    366636 non-null  int64 
 4   reviewer_name  366636 non-null  object
 5   comments       366619 non-null  object
dtypes: int64(3), object(3)
memory usage: 16.8+ MB
None


## Data Type Adjustment

In [4]:
# Convert the 'date' column (loaded as object dtype, per the info() summary)
# to datetime values, then store the parsed result back under the same name.
parsed_dates = pd.to_datetime(reviews['date'])
reviews['date'] = parsed_dates

## Handling Missing Values

In [5]:
# Count the missing entries in every column (isna is the same check as isnull).
missing_per_column = reviews.isna().sum()
print(missing_per_column)

listing_id        0
id                0
date              0
reviewer_id       0
reviewer_name     0
comments         17
dtype: int64


In [6]:
# Discard every row whose 'comments' value is missing; other columns are not
# considered when deciding which rows to drop.
required_columns = ['comments']
reviews = reviews.dropna(axis='index', subset=required_columns)

In [7]:
# Sanity check after the drop: recount the missing values column by column.
print(reviews.isna().sum(axis=0))

listing_id       0
id               0
date             0
reviewer_id      0
reviewer_name    0
comments         0
dtype: int64


## Exploring Variables

In [8]:
# Report the earliest and latest review dates present in the dataset.
review_dates = reviews['date']
min_date, max_date = review_dates.min(), review_dates.max()
print(
    f"Minimum Date: {min_date}",
    f"Maximum Date: {max_date}",
    sep="\n",
)

Minimum Date: 2010-07-25 00:00:00
Maximum Date: 2024-06-29 00:00:00


In [9]:
# Is the number of reviews equal to the number of distinct values of id?
# Compute both counts and print the comparison, so the question is answered in
# this cell instead of by comparing against an earlier cell's output by eye.
unique_count = reviews['id'].nunique()
total_reviews = len(reviews)
print(
    f"Distinct ids: {unique_count} of {total_reviews} rows -> "
    f"all unique: {unique_count == total_reviews}"
)
unique_count

366619

In [10]:
# Corrections to strings
def _ensure_double_quoted(text):
    """Return text unchanged if it starts and ends with '"'; otherwise wrap it in '"'."""
    if text.startswith('"') and text.endswith('"'):
        return text
    return f'"{text}"'


# The steps run in this exact order, each on the result of the previous one:
#   1. replace HTML line breaks (<br>, <br/>, <br />) with a space
#   2. wrap every comment in double quotes unless it already has them
#   3. collapse triple double-quotes to a single one
#   4. collapse doubled double-quotes to a single one
#   5. remove single quotes / apostrophes
#   6. squeeze any run of whitespace into one space
# NOTE(review): to_csv applies its own quoting when writing, so confirm that
# the manual wrapping in step 2 is intended.
reviews['comments'] = (
    reviews['comments']
    .str.replace(r'<br\s*/?>', ' ', regex=True)
    .apply(_ensure_double_quoted)
    .str.replace('"""', '"', regex=False)
    .str.replace('""', '"', regex=False)
    .str.replace("'", "", regex=False)
    .str.replace(r'\s+', ' ', regex=True)
)

In [11]:
# Write the cleaned reviews to the cleaned-data folder; index=False keeps the
# DataFrame's row index out of the file.
cleaned_reviews_path = '/content/drive/MyDrive/Airbnb/Airbnb_cleaned_csvs/reviews_cleaned.csv'
reviews.to_csv(cleaned_reviews_path, index=False)