## Prepare Review Data

* Used the google-play-scraper package to get recent reviews of the Cookpad app for Android devices from the Google Play store.
* See get_reviews.py for the code
* Now prepare the collected data for analysis.

In [19]:
# Import the required libraries
import pandas as pd

In [20]:
# Load the raw scraped reviews from disk, then summarise the columns,
# non-null counts and dtypes before any cleaning is applied.
raw_reviews_path = '../data/cookpad_reviews_raw.csv'
reviews_df = pd.read_csv(filepath_or_buffer=raw_reviews_path)
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39820 entries, 0 to 39819
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              39820 non-null  object
 1   userName              39820 non-null  object
 2   userImage             39820 non-null  object
 3   content               39632 non-null  object
 4   score                 39820 non-null  int64 
 5   thumbsUpCount         39820 non-null  int64 
 6   reviewCreatedVersion  34777 non-null  object
 7   at                    39820 non-null  object
 8   replyContent          1920 non-null   object
 9   repliedAt             1920 non-null   object
dtypes: int64(2), object(8)
memory usage: 3.0+ MB


## Use a subset of the columns/attributes that were scraped. 
### Keep:
* reviewId
* content
* score
* reviewCreatedVersion (there are some nulls here so handle them)
* at

In [21]:
# Keep only the attributes needed for the analysis by discarding the user,
# reply and thumbs-up columns, then confirm the remaining schema.
unused_columns = ['userName', 'replyContent', 'repliedAt', 'userImage', 'thumbsUpCount']
reviews_df = reviews_df.drop(columns=unused_columns)
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39820 entries, 0 to 39819
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              39820 non-null  object
 1   content               39632 non-null  object
 2   score                 39820 non-null  int64 
 3   reviewCreatedVersion  34777 non-null  object
 4   at                    39820 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


In [22]:
# Count the distinct review ids; a count below the row total means the
# scrape returned some reviews more than once.
reviews_df['reviewId'].nunique(dropna=False)

18210

In [23]:
# There were duplicate review ids, so drop the repeated rows
# (drop_duplicates keeps the first occurrence of each reviewId by default).
reviews_df.drop_duplicates(subset=['reviewId'], inplace=True)
# info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18210 entries, 0 to 18209
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              18210 non-null  object
 1   content               18116 non-null  object
 2   score                 18210 non-null  int64 
 3   reviewCreatedVersion  15874 non-null  object
 4   at                    18210 non-null  object
dtypes: int64(1), object(4)
memory usage: 853.6+ KB
None


In [24]:
# Rename the at column for clarity
reviews_df.rename(columns={'at': 'reviewTimestamp'}, inplace=True)
# Parse the reviewTimestamp strings into datetime values
reviews_df['reviewTimestamp'] = pd.to_datetime(reviews_df['reviewTimestamp'])

# Derive calendar columns with the vectorised .dt accessor rather than a
# row-by-row apply: reviewYear and reviewMonth are integers, and
# reviewDateString is text in YYYY-MM form (a string, not a datetime).
review_timestamps = reviews_df['reviewTimestamp']
reviews_df['reviewYear'] = review_timestamps.dt.year
reviews_df['reviewMonth'] = review_timestamps.dt.month
reviews_df['reviewDateString'] = review_timestamps.dt.strftime('%Y-%m')

In [25]:
# Preview the first three rows to check the renamed and derived date columns
reviews_df.head(3)

Unnamed: 0,reviewId,content,score,reviewCreatedVersion,reviewTimestamp,reviewYear,reviewMonth,reviewDateString
0,gp:AOqpTOGCYCaovfg8OvqtDo95GDv3OWH5G3sNZkasoR-...,Awesome App .,5,2.211.0.0-android,2021-09-08 08:01:48,2021,9,2021-09
1,gp:AOqpTOF3lyN0oeSKoZC0LBywwlaDdd0Zvtnjdn1mPaE...,I can save my experimental recipe on Cookpad.....,5,2.211.0.0-android,2021-09-07 10:38:00,2021,9,2021-09
2,gp:AOqpTOGGD3zqw1hNQJLYYc3JGAsbqNGGzPdXCyWXnpr...,Superb! I love how it sources recipes from oth...,5,2.214.0.0-android,2021-09-07 09:11:37,2021,9,2021-09


In [26]:
# Report how many distinct values each categorical column holds
# (a missing value is counted as one of the distinct values).
col_names = ['reviewCreatedVersion', 'score', 'reviewYear']

for col_name in col_names:
    distinct_count = reviews_df[col_name].nunique(dropna=False)
    print(col_name, distinct_count, sep='\n')

reviewCreatedVersion
528
score
6
reviewYear
12


In [27]:
# Reviews without a recorded app version get the placeholder 'Unknown'
reviews_df['reviewCreatedVersion'] = reviews_df['reviewCreatedVersion'].fillna('Unknown')
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18210 entries, 0 to 18209
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              18210 non-null  object        
 1   content               18116 non-null  object        
 2   score                 18210 non-null  int64         
 3   reviewCreatedVersion  18210 non-null  object        
 4   reviewTimestamp       18210 non-null  datetime64[ns]
 5   reviewYear            18210 non-null  int64         
 6   reviewMonth           18210 non-null  int64         
 7   reviewDateString      18210 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 1.3+ MB


In [28]:
# List the distinct review years present in the data
reviews_df['reviewYear'].unique()

array([2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011,
       2010])

In [30]:
# Save the prepared reviews with a header row and without the row index.
# index=False is used rather than index=None because the parameter is
# documented as a bool; None only works by being falsy.
reviews_df.to_csv('../data/cookpad_prepped_reviews.csv', index=False, header=True)