# Collect and Prepare Review Data

### Objectives
* Use the google-play-scraper package to collect a reasonable sample of reviews of the Cookpad app from Google Play.
* Prepare the collected data for analysis.

In [127]:
# Import the required libraries
import json
import pandas as pd

from google_play_scraper import Sort, reviews, app

In [128]:
# Look up the Play Store listing for the Cookpad app

# Printing the listing lets us check by eye that this package name really is Cookpad :)
app_info = app(
    'com.mufumbo.android.recipe.search',
    lang='en',
    country='uk',
)
print(app_info)

{'title': 'Cookpad - Create your own Recipes', 'description': 'Looking for recipe inspiration? Look no further. Welcome to Cookpad, the world’s largest community of home cooks where everyday, people just like you share thousands of recipes, ideas and cooking experiences.\r\n\r\nEach Cookpad recipe has been created, tried and tested by home cooks, for home cooks so whether you love to create, or just want to cook, Cookpad is the place for you.\r\n\r\n- Browse a diverse collection of recipes, or search by specifics - ingredients, seasonal dishes, holiday favourites and more\r\n- Create and share your own recipes and help others to become better cooks\r\n- Share photos (or cooksnaps) of the recipes you cook and inspire others to try them too\r\n- Connect with other home cooks, ask questions and share ideas\r\n- Build a limitless collection of recipes you find, share and cook - it’s your own personal cookbook\r\n\r\nPlease contact us if you need any further information or help at help@cook

In [129]:
# When doing sentiment analysis on the review text, we'll want a balanced sample: roughly the same number of reviews for each score (1-5). For general EDA purposes, let's focus on reviews from 2019 onwards (reviews from 2018 are dropped later in this notebook). When we do sentiment/text analysis, we'll pull out a subset of this data in order to create the balanced sample.

In [151]:
# Request the first page of reviews (newest first, 200 per page) and
# start an empty list that will accumulate every page we download.
review_result, continuation_token = reviews(
    'com.mufumbo.android.recipe.search',
    lang='en',
    country='uk',
    sort=Sort.NEWEST,
    count=200,
)

collected_reviews = []

In [152]:
# Number of reviews returned by the first request
print(len(review_result))

200


In [153]:
# Inspect the last review on the first page. Indexing from the end (instead of the
# hard-coded position 199) avoids an IndexError when fewer than 200 reviews come back.
review_result[-1]

{'reviewId': 'gp:AOqpTOFyiMt8YqAzHS8aOTZpM7NVvt4SxtA8-fLItgXKBTIGXDdxobDpbsTKDWsEU33vAdi-O6Tuq1i5bkhhRF4',
 'userName': 'D Wijaya',
 'userImage': 'https://play-lh.googleusercontent.com/a-/AOh14GhX4QjUeLwZ-R4FR0vdXTqD1lXnh-4yH7gtHgrlAw',
 'content': 'Bug done ...thx a lot developer',
 'score': 5,
 'thumbsUpCount': 0,
 'reviewCreatedVersion': '2.162.3.0-android',
 'at': datetime.datetime(2021, 2, 2, 11, 52, 53),
 'replyContent': None,
 'repliedAt': None}

In [154]:
# Add the first page to the running list and report the total collected so far
collected_reviews += review_result
print(len(collected_reviews))

200


In [155]:
# Page through further reviews, feeding the continuation token from the previous
# call into the next one (up to 21 additional pages).
# NOTE(review): only the app id and token are passed here; this presumably relies on
# the token carrying lang/country/sort/count from the first request — confirm against
# the google-play-scraper documentation.
for pagination_count in range(21):
    review_result, continuation_token = reviews('com.mufumbo.android.recipe.search',
                                                continuation_token=continuation_token)
    if not review_result:
        # Nothing more to fetch: stop instead of continuing to call the API and
        # re-printing the same last review on every remaining iteration.
        break
    collected_reviews.extend(review_result)
    # Progress: running total followed by the most recently appended review
    print(len(collected_reviews))
    print(collected_reviews[-1])

400
{'reviewId': 'gp:AOqpTOF14mbgisKpCVctQ_75X7QDXtK6GBgH4cVhBdSBJf7zI7t5-yhI6DNOplsjFtLPDpWjscv8fneuutNR3rg', 'userName': 'Whb Bsb', 'userImage': 'https://play-lh.googleusercontent.com/a-/AOh14GgcER2WoC9ctgjisBIg7NiIMWoQ8eOo_NO9-O7S', 'content': "I love this app...it is very easy to use and it's halp me to cook better....good luck and we are waiting apps like this.", 'score': 5, 'thumbsUpCount': 4, 'reviewCreatedVersion': '2.178.1.0-android', 'at': datetime.datetime(2020, 12, 10, 23, 49, 29), 'replyContent': None, 'repliedAt': None}
600
{'reviewId': 'gp:AOqpTOF877yxMKCilfnDxKcYcOXRfp4BYB_2jxPV18WgkDAwpH9rSS29zT-pMJpCC3cSFoRviDoLqcXmy3sGRuM', 'userName': 'Winnie kulumba', 'userImage': 'https://play-lh.googleusercontent.com/a-/AOh14GjQ3M3L1C_caH9MSg3QH_ajSPuxxgypM2I5-vxMOA', 'content': 'awesome app, loved the experience', 'score': 5, 'thumbsUpCount': 0, 'reviewCreatedVersion': '2.168.0.0-android', 'at': datetime.datetime(2020, 10, 27, 13, 8, 42), 'replyContent': None, 'repliedAt': None}

In [186]:
# Build a DataFrame from the list of scraped review dicts and preview the first five rows
app_reviews_df = pd.DataFrame(data=collected_reviews)
app_reviews_df.head(n=5)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
0,gp:AOqpTOGBV4KS2fN18xajDXPV38LK3nJ09sBbfcuLClS...,Lori Dumm,https://play-lh.googleusercontent.com/-bpz_pGY...,All So Awesome,5,0,2.194.1.0-android,2021-04-02 09:25:12,,NaT
1,gp:AOqpTOEAibGyA69S9nq9HcWFihQnZ-yCeUSCYRmMcs2...,Aisha Mahmoud,https://play-lh.googleusercontent.com/--QqEgPR...,"Really, useful app❤",5,0,2.194.1.0-android,2021-04-02 08:06:59,,NaT
2,gp:AOqpTOHWPSxiQWVLOWula5YBNht1-bBp4BYa-kx2F1z...,Mia Mae,https://play-lh.googleusercontent.com/a-/AOh14...,Bermanfaat sekali untuk yg baru belajar masak👍👍,5,0,2.182.1.0-android,2021-04-02 05:18:30,,NaT
3,gp:AOqpTOH0ihmhzPKVhqYJPMz0JzFdWC-B6VHVzOMmJMF...,Mnjz Co,https://play-lh.googleusercontent.com/-swHHj9n...,تطبيق ممتاز و سهل و تلاقي اي طبخة بتدور عليها ...,5,0,,2021-04-02 01:55:42,,NaT
4,gp:AOqpTOEaMDo2b7B0taNo0jCO7_X98FvlmrUhYP1Yg-Y...,ปรายงค์ บวรนาถสกุล,https://play-lh.googleusercontent.com/-HHB-1fi...,Loving it. Coolest thing is the search by ingr...,5,0,2.194.1.0-android,2021-04-02 01:37:20,,NaT


In [187]:
# Column dtypes and non-null counts for the scraped reviews
app_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4400 entries, 0 to 4399
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              4400 non-null   object        
 1   userName              4400 non-null   object        
 2   userImage             4400 non-null   object        
 3   content               4400 non-null   object        
 4   score                 4400 non-null   int64         
 5   thumbsUpCount         4400 non-null   int64         
 6   reviewCreatedVersion  3961 non-null   object        
 7   at                    4400 non-null   datetime64[ns]
 8   replyContent          20 non-null     object        
 9   repliedAt             20 non-null     datetime64[ns]
dtypes: datetime64[ns](2), int64(2), object(6)
memory usage: 343.9+ KB


In [188]:
# How many distinct reviewer names appear in the sample
len(app_reviews_df['userName'].unique())

4235

### Use a subset of the columns/attributes for this analysis, as we are mainly interested in the reviews themselves.
Keep:
* reviewId
* userName
* content
* score
* reviewCreatedVersion (there are some nulls here so handle them)
* at
* replyContent
* repliedAt

In [189]:
# Drop the columns we are not keeping for this analysis.
# Reassigning with errors='ignore' (instead of an inplace drop) keeps this cell safe to
# re-run: a second execution no longer raises KeyError because the columns are already gone.
app_reviews_df = app_reviews_df.drop(columns=['userImage', 'thumbsUpCount'], errors='ignore')
app_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4400 entries, 0 to 4399
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              4400 non-null   object        
 1   userName              4400 non-null   object        
 2   content               4400 non-null   object        
 3   score                 4400 non-null   int64         
 4   reviewCreatedVersion  3961 non-null   object        
 5   at                    4400 non-null   datetime64[ns]
 6   replyContent          20 non-null     object        
 7   repliedAt             20 non-null     datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(5)
memory usage: 275.1+ KB


In [190]:
# Count distinct values in the columns where duplicates might be troublesome or interesting
col_names = ['reviewId', 'userName', 'content', 'at', 'replyContent', 'repliedAt']

for col_name in col_names:
    distinct_count = len(app_reviews_df[col_name].unique())
    # Column name on one line, its distinct-value count on the next
    print(col_name, distinct_count, sep='\n')

reviewId
4400
userName
4235
content
3655
at
4400
replyContent
21
repliedAt
21


In [191]:
# Rename the at column for clarity
app_reviews_df.rename(columns={'at':'reviewTimestamp'}, inplace=True)
# Make sure reviewTimestamp is a datetime column
app_reviews_df['reviewTimestamp'] = pd.to_datetime(app_reviews_df['reviewTimestamp'])

# Create new columns from the timestamp:
#   reviewYear       - calendar year as an integer
#   reviewMonth      - calendar month (1-12) as an integer
#   reviewDateString - year and month as a 'YYYY-MM' string (not a datetime)
# The vectorised .dt accessor replaces a row-wise DataFrame.apply, which is much slower
# and does not produce a usable column when the frame is empty.
# The int64 cast keeps the integer dtype stable across pandas versions.
review_ts = app_reviews_df['reviewTimestamp']
app_reviews_df['reviewYear'] = review_ts.dt.year.astype('int64')
app_reviews_df['reviewMonth'] = review_ts.dt.month.astype('int64')
app_reviews_df['reviewDateString'] = review_ts.dt.strftime('%Y-%m')

In [192]:


# Preview the first rows of the reviews frame
app_reviews_df.head()


Unnamed: 0,reviewId,userName,content,score,reviewCreatedVersion,reviewTimestamp,replyContent,repliedAt,reviewYear,reviewDateString
0,gp:AOqpTOGBV4KS2fN18xajDXPV38LK3nJ09sBbfcuLClS...,Lori Dumm,All So Awesome,5,2.194.1.0-android,2021-04-02 09:25:12,,NaT,2021,2021-04
1,gp:AOqpTOEAibGyA69S9nq9HcWFihQnZ-yCeUSCYRmMcs2...,Aisha Mahmoud,"Really, useful app❤",5,2.194.1.0-android,2021-04-02 08:06:59,,NaT,2021,2021-04
2,gp:AOqpTOHWPSxiQWVLOWula5YBNht1-bBp4BYa-kx2F1z...,Mia Mae,Bermanfaat sekali untuk yg baru belajar masak👍👍,5,2.182.1.0-android,2021-04-02 05:18:30,,NaT,2021,2021-04
3,gp:AOqpTOH0ihmhzPKVhqYJPMz0JzFdWC-B6VHVzOMmJMF...,Mnjz Co,تطبيق ممتاز و سهل و تلاقي اي طبخة بتدور عليها ...,5,,2021-04-02 01:55:42,,NaT,2021,2021-04
4,gp:AOqpTOEaMDo2b7B0taNo0jCO7_X98FvlmrUhYP1Yg-Y...,ปรายงค์ บวรนาถสกุล,Loving it. Coolest thing is the search by ingr...,5,2.194.1.0-android,2021-04-02 01:37:20,,NaT,2021,2021-04


In [194]:
# Count the distinct values in each of the categorical columns
col_names = ['reviewCreatedVersion', 'score', 'reviewYear']

for col_name in col_names:
    distinct_count = len(app_reviews_df[col_name].unique())
    # Column name on one line, its distinct-value count on the next
    print(col_name, distinct_count, sep='\n')

reviewCreatedVersion
226
score
5
reviewYear
4


In [195]:
# Label reviews that have no reviewCreatedVersion as 'Unknown', then re-check the null counts
version_fill = {'reviewCreatedVersion': 'Unknown'}
app_reviews_df = app_reviews_df.fillna(value=version_fill)
app_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4400 entries, 0 to 4399
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              4400 non-null   object        
 1   userName              4400 non-null   object        
 2   content               4400 non-null   object        
 3   score                 4400 non-null   int64         
 4   reviewCreatedVersion  4400 non-null   object        
 5   reviewTimestamp       4400 non-null   datetime64[ns]
 6   replyContent          20 non-null     object        
 7   repliedAt             20 non-null     datetime64[ns]
 8   reviewYear            4400 non-null   int64         
 9   reviewDateString      4400 non-null   object        
dtypes: datetime64[ns](2), int64(2), object(6)
memory usage: 343.9+ KB


In [196]:
# List the distinct review years present in the data
app_reviews_df['reviewYear'].unique()

array([2021, 2020, 2019, 2018])

In [197]:
# Remove every review written in 2018, then show the earliest timestamp that remains
is_2018 = app_reviews_df['reviewYear'] == 2018
rows_from_2018 = app_reviews_df.loc[is_2018].index
app_reviews_df.drop(index=rows_from_2018, inplace=True)
app_reviews_df['reviewTimestamp'].min()

Timestamp('2019-01-01 04:27:45')

In [198]:
# List the distinct developer-reply timestamps.
# NOTE(review): this looks odd — several repliedAt values (2015-2018) are earlier than the
# earliest remaining review timestamp (2019-01-01). Possibly a review's timestamp moves
# when the review is edited after the developer has replied — TODO confirm.
app_reviews_df['repliedAt'].unique()

array([                          'NaT', '2021-01-28T03:20:10.000000000',
       '2020-10-30T11:09:42.000000000', '2020-09-17T03:19:41.000000000',
       '2020-08-12T05:08:49.000000000', '2018-01-29T02:50:11.000000000',
       '2016-08-04T15:38:35.000000000', '2020-06-18T09:17:21.000000000',
       '2017-07-03T04:55:21.000000000', '2019-11-14T23:47:54.000000000',
       '2017-08-10T04:48:36.000000000', '2016-05-13T11:06:33.000000000',
       '2015-12-13T14:41:11.000000000', '2018-09-17T05:05:23.000000000',
       '2019-02-28T00:56:45.000000000', '2019-07-01T02:40:11.000000000',
       '2019-06-11T04:47:44.000000000', '2019-04-23T08:43:52.000000000',
       '2019-03-21T06:23:54.000000000', '2017-02-01T08:18:28.000000000',
       '2019-01-21T21:19:38.000000000'], dtype='datetime64[ns]')

In [199]:
# Save the prepared reviews to CSV with a header row and without the row index.
# index=False is the documented boolean form; index=None only worked because None is falsy.
app_reviews_df.to_csv('../data/cookpad_recent_reviews.csv', index=False, header=True)