# **Data Collection and Cleaning (Reviews)**
*Data pulled from [Inside Airbnb](http://insideairbnb.com/)*

----
### Imports

In [203]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup

------
## Read in the Data

In [204]:
# Load the raw Boston reviews export, then report its dimensions
# and preview the first few rows.
reviews = pd.read_csv(filepath_or_buffer='../data/boston_reviews.csv')
print(reviews.shape)
reviews.head()

(228388, 6)


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,3781,37776825,2015-07-10,36059247,Greg,The apartment was as advertised and Frank was ...
1,3781,41842494,2015-08-09,10459388,Tai,It was a pleasure to stay at Frank's place. Th...
2,3781,45282151,2015-09-01,12264652,Damien,The apartment description is entirely faithful...
3,3781,49022647,2015-09-30,41426327,Mike,Thoroughly enjoyed my time at Frank's home. Ha...
4,3781,52503327,2015-10-30,15151513,Ivan,Great value for the money! This location has e...


-------
## Data Cleaning - reviews data

#### *Handling null values*

In [205]:
# Count missing values per column
reviews.isna().sum()

listing_id        0
id                0
date              0
reviewer_id       0
reviewer_name     1
comments         44
dtype: int64

In [206]:
# Inspect the rows whose review text (comments) is missing
reviews.loc[reviews['comments'].isna()]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
5797,1472520,307036753,2018-08-13,181686521,Conor,
10249,1810172,84265295,2016-07-06,40584930,Luis,
24788,2676488,444187277048685433,2021-09-04,414697087,Aaron,
29337,3504918,134766534,2017-02-28,85115405,David,
36418,3969526,277569006,2018-06-16,171513185,Nan,
40569,4863108,261461592,2018-05-06,164531748,Shanna,
45172,5371051,552387535,2019-10-21,261173588,S,
63583,7513605,226610933,2018-01-11,37821446,Lillian (Aka Ann),
63922,7728499,253482151,2018-04-14,5523316,Amelia,
70115,9545482,617359086610114857,2022-05-01,75678589,Juan Pablo,


In [207]:
# Drop the rows with null comments - we wouldn't be able to perform NLP on these.
# `subset` is passed as a list so the call also works on pandas versions that
# do not accept a single label, and the result is reassigned rather than
# mutated in place so re-running the cell is safe and predictable.
reviews = reviews.dropna(subset=['comments'])

# Re-check the per-column null counts: comments should now be 0
reviews.isnull().sum()

listing_id       0
id               0
date             0
reviewer_id      0
reviewer_name    1
comments         0
dtype: int64

In [208]:
# The row with a missing reviewer_name is kept for now: the name likely
# won't be used as a modeling feature, and the rest of the row is still
# wanted for model training. Display the affected row for reference.
reviews.loc[reviews['reviewer_name'].isna()]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
77993,11357756,424952679,2019-03-17,67667870,,Overall a good location for short-term stay.


------
#### *Checking data types*

In [209]:
# Show the dtype pandas assigned to each column
reviews.dtypes

listing_id        int64
id                int64
date             object
reviewer_id       int64
reviewer_name    object
comments         object
dtype: object

In [210]:
# The date column is still stored as object dtype; it will be updated in the next notebook, prior to merging.

---- 
## *Save the cleaned reviews CSV*

In [211]:
# Save the cleaned reviews data (rows with null comments removed) as a CSV,
# leaving the DataFrame index out of the file.
reviews.to_csv(
    '../data/reviews_cleaned.csv',
    index=False,
    date_format='%Y-%m-%d',
)