In [53]:
import pandas as pd

# Load Dataset

In [54]:
# Read the initial review dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer="final_data/initial_dataset.csv")
# Preview the first five rows to confirm the file loaded as expected.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,latitude,longitude,category,avg_rating,num_of_reviews,url,policy,fake
0,37198.0,1.1e+20,Michelle Banks,1520000000000.0,5.0,It's a beautiful place to read books and have ...,,,0x80c8bf81f68a634f:0xe605b4c3043783c9,Barnes & Noble,36.157754,-115.289418,"['Book store', 'Cafe', 'Childrens book store',...",4.6,1719,https://www.google.com/maps/place//data=!4m2!3...,relevant,no
1,39373.0,1.06e+20,Steven DeRyck [Staff],1540000000000.0,4.0,"As previous reviews have stated, two small pie...",,,0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,"['Deli', 'Takeout Restaurant', 'Sandwich shop']",4.1,706,https://www.google.com/maps/place//data=!4m2!3...,relevant,no
2,43382.0,1.1e+20,Stevey Markovich,1600000000000.0,5.0,Absolutely love this office! Afton is truly am...,,"{'time': 1595600023968, 'text': 'Thank you! Ha...",0x80c8ce0f7732ee7b:0xea13348742f64327,Center for Cosmetic and Family Dentistry,36.001929,-115.107484,['Dentist'],4.9,318,https://www.google.com/maps/place//data=!4m2!3...,relevant,no
3,9672.0,1.02e+20,William Campbell,1540000000000.0,3.0,The food is as good as it usually is,,,0x80c8dc9da25847c7:0x27b862b824ac757c,Asian Garden,36.168901,-115.060601,"['Restaurant', 'Asian restaurant', 'Chinese re...",3.8,128,https://www.google.com/maps/place//data=!4m2!3...,relevant,no
4,92564.0,1.12e+20,Beverly Thorman,1520000000000.0,5.0,We came in without an appointment on a Saturda...,,"{'time': 1523207982441, 'text': 'Thank you for...",0x80c8c03de37488fd:0xdc3302fd9f8f44a,Great Clips,36.191055,-115.258969,"['Hair salon', 'Beauty salon']",4.3,168,https://www.google.com/maps/place//data=!4m2!3...,relevant,no


# Check Data Types

In [55]:
# Print the dtype pandas assigned to each column of the loaded frame.
print(df.dtypes)

Unnamed: 0        float64
user_id           float64
name_x             object
time              float64
rating            float64
text               object
pics               object
resp               object
gmap_id            object
name_y             object
latitude          float64
longitude         float64
category           object
avg_rating        float64
num_of_reviews      int64
url                object
policy             object
fake               object
dtype: object


# Remove Irrelevant Columns

Columns `pics`, `resp`, `url`, `Unnamed: 0`, and `fake` are no longer relevant to the dataset, as we will not be using these fields in the downstream analysis.

As such, we will remove these columns.

In [56]:
# Remove the columns that are not used in the downstream analysis.
df = df.drop(
    ["pics", "resp", "url", "Unnamed: 0", "fake"],
    axis="columns",
)

# Rename Columns

We will rename several columns to more descriptive names so that they are more intuitive.

`name_x` will be renamed to `user_name`

`name_y` will be renamed to `business_name`

`text` will be renamed to `review`

`category` will be renamed to `business_desc`

`policy` will be renamed to `label`

In [57]:
# Give the ambiguous columns descriptive names, modifying `df` in place.
df.rename(
    {
        "name_x": "user_name",          # reviewer's display name
        "name_y": "business_name",      # name of the reviewed business
        "text": "review",               # review body
        "category": "business_desc",    # business category list
        "policy": "label",              # target label
    },
    axis="columns",
    inplace=True,
)

In [58]:
# Preview the first five rows to verify the dropped and renamed columns.
df.head(n=5)

Unnamed: 0,user_id,user_name,time,rating,review,gmap_id,business_name,latitude,longitude,business_desc,avg_rating,num_of_reviews,label
0,1.1e+20,Michelle Banks,1520000000000.0,5.0,It's a beautiful place to read books and have ...,0x80c8bf81f68a634f:0xe605b4c3043783c9,Barnes & Noble,36.157754,-115.289418,"['Book store', 'Cafe', 'Childrens book store',...",4.6,1719,relevant
1,1.06e+20,Steven DeRyck [Staff],1540000000000.0,4.0,"As previous reviews have stated, two small pie...",0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,"['Deli', 'Takeout Restaurant', 'Sandwich shop']",4.1,706,relevant
2,1.1e+20,Stevey Markovich,1600000000000.0,5.0,Absolutely love this office! Afton is truly am...,0x80c8ce0f7732ee7b:0xea13348742f64327,Center for Cosmetic and Family Dentistry,36.001929,-115.107484,['Dentist'],4.9,318,relevant
3,1.02e+20,William Campbell,1540000000000.0,3.0,The food is as good as it usually is,0x80c8dc9da25847c7:0x27b862b824ac757c,Asian Garden,36.168901,-115.060601,"['Restaurant', 'Asian restaurant', 'Chinese re...",3.8,128,relevant
4,1.12e+20,Beverly Thorman,1520000000000.0,5.0,We came in without an appointment on a Saturda...,0x80c8c03de37488fd:0xdc3302fd9f8f44a,Great Clips,36.191055,-115.258969,"['Hair salon', 'Beauty salon']",4.3,168,relevant


# Convert Data Type

The `time` column is provided in Unix time (milliseconds). 

Since this format is not easily readable or intuitive, we will convert it to a standard datetime format for easier interpretation and analysis.

In [59]:
# Interpret `time` as Unix epoch milliseconds and replace it with datetimes.
df = df.assign(time=pd.to_datetime(df["time"], unit="ms"))

# Check Null Values in each column

In [60]:
# Count the missing values in each column.
print(df.isna().sum())

user_id           74
user_name         29
time               0
rating            74
review             0
gmap_id            0
business_name      1
latitude           0
longitude          0
business_desc     10
avg_rating         0
num_of_reviews     0
label              0
dtype: int64


# Handling Missing Values

Although columns like `user_id`, `user_name`, and `business_name` contain missing values, they will not serve as key features in our model development. 

Therefore, we will not perform any imputation or removal for these columns.

### `rating`

The `rating` column is a numerical variable. There are 74 rows with missing values, which we will impute using the median of the available ratings.

Since the number of missing rows is small, median imputation is unlikely to significantly alter the overall distribution of ratings.

In [61]:
# Median of the available ratings (missing values are skipped by default).
median_rating = df["rating"].median()
# Keep every existing rating; substitute the median only where it is missing.
df["rating"] = df["rating"].where(df["rating"].notna(), median_rating)

### `business_desc`

There are 10 rows with missing `business_desc` values. Since `business_desc` is a textual categorical column and will be important for downstream analysis, we will drop these 10 rows.

They represent a very small proportion of the dataset, so their removal will not significantly affect our analysis.

In [62]:
# Keep only the rows that have a `business_desc` value.
df = df[df["business_desc"].notna()]

# After Handling of Missing Values

In [63]:
# Re-count the missing values per column after imputation and row removal.
print(df.isna().sum())

user_id           74
user_name         29
time               0
rating             0
review             0
gmap_id            0
business_name      1
latitude           0
longitude          0
business_desc      0
avg_rating         0
num_of_reviews     0
label              0
dtype: int64


All missing values in the columns relevant to our model development have been handled.

# Save Dataset

In [64]:
# Write the cleaned frame to disk without the index column; the BOM-prefixed
# UTF-8 encoding is kept as in the original export.
df.to_csv(
    path_or_buf="final_data/cleaned_dataset.csv",
    encoding="utf-8-sig",
    index=False,
)