## Prepare Review Data
* Prepare the collected data for analysis.

In [59]:
# Import the required libraries
import pandas as pd
import numpy as np

In [60]:
# Helper functions for duplicate checks
def get_exact_dups(df):
    '''
    Return the rows of df that repeat an earlier row in every column.

    The first occurrence of each repeated row is not part of the result
    (DataFrame.duplicated defaults to keep='first'); only the later copies
    are returned, with their original index labels.
    '''
    is_repeat = df.duplicated()
    return df.loc[is_repeat]

def get_review_text_dups(df, col_names):
    '''
    Return the rows of df whose values in col_names repeat an earlier row.

    col_names is passed straight to DataFrame.duplicated as its subset, so
    only those columns are compared. The first occurrence of each repeated
    combination is not included in the result.
    '''
    repeat_mask = df.duplicated(subset=col_names)
    return df.loc[repeat_mask]

In [61]:
# Load the collected reviews; the first CSV column is used as the index
reviews_df = pd.read_csv(
    '../data/elvie_trustpilot_reviews.csv',
    index_col=0,
)
reviews_df.head()

Unnamed: 0_level_0,star_rating,location,datetime,content
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60cfb8f9f9f48706707b8f10,5 stars: Excellent,US,2021-06-20T21:54:01+00:00,I had a problem ...
60cfab60f9f48706707b88d8,5 stars: Excellent,US,2021-06-20T20:56:00+00:00,Amazing Ama...
60cf6daaf9f48703b0f3ad15,5 stars: Excellent,US,2021-06-20T16:32:42+00:00,Amazing customer service ...
60cf6d5ff9f48703b0f3ace6,5 stars: Excellent,GB,2021-06-20T16:31:27+00:00,Very quick responses ...
60cf359bf9f48703b0f388c3,5 stars: Excellent,AU,2021-06-20T12:33:31+00:00,Quick and helpful responses ...


In [66]:
# Summarise the frame: index range, per-column non-null counts and dtypes
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, 60cfb8f9f9f48706707b8f10 to 5e087630c845450914ada0d0
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   star_rating  1500 non-null   object             
 1   location     1500 non-null   object             
 2   datetime     1500 non-null   datetime64[ns, UTC]
 3   content      1500 non-null   object             
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 58.6+ KB


In [67]:
# Compare the total number of reviews with the number of distinct values in
# each column where repeated values might be troublesome or interesting.
print(f"total review count {len(reviews_df)}")

col_names = ['content']
for col_name in col_names:
    # dropna=False counts a missing value as one distinct value
    distinct_count = reviews_df[col_name].nunique(dropna=False)
    print(f"total unique {col_name}: {distinct_count}")


total review count 1500
total unique content: 1493


In [68]:
# DATA TYPE CONVERSIONS
#
# Parse the datetime column into timezone-aware UTC timestamps.
# utc=True guarantees a single datetime64[ns, UTC] column even if the source
# strings carry differing UTC offsets; without it pandas cannot build one
# datetime column from mixed offsets. Re-running this cell is harmless.
reviews_df['datetime'] = pd.to_datetime(reviews_df['datetime'], utc=True)


In [69]:
# NEW COLUMN CREATION
#
# The derived columns use pandas' vectorised .dt / .str accessors instead of a
# row-wise apply(axis=1), which runs a Python lambda once per row.
# The .dt accessor needs 'datetime' to already hold a datetime dtype, so the
# data type conversion has to run before this cell.
#
# Create new date columns - reviewYear as integer (int64) and reviewYearMonth
# as string with format YYYY-MM
reviews_df['reviewYear'] = reviews_df['datetime'].dt.year.astype('int64')
reviews_df['reviewYearMonth'] = reviews_df['datetime'].dt.strftime('%Y-%m')

# Create simplified rating column - the first character of star_rating
# (e.g. '5' from '5 stars: Excellent'). It is stored as a string here; cast it
# with .astype(int) wherever a numeric 1-5 rating is needed.
# .str[0] yields NaN for a missing or empty rating instead of raising.
reviews_df['rating'] = reviews_df['star_rating'].str[0]

In [70]:
# Re-check column dtypes and non-null counts for the frame
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, 60cfb8f9f9f48706707b8f10 to 5e087630c845450914ada0d0
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   star_rating      1500 non-null   object             
 1   location         1500 non-null   object             
 2   datetime         1500 non-null   datetime64[ns, UTC]
 3   content          1500 non-null   object             
 4   reviewYear       1500 non-null   int64              
 5   reviewYearMonth  1500 non-null   object             
 6   rating           1500 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 93.8+ KB


In [71]:
# Preview the first rows of the frame
reviews_df.head()

Unnamed: 0_level_0,star_rating,location,datetime,content,reviewYear,reviewYearMonth,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
60cfb8f9f9f48706707b8f10,5 stars: Excellent,US,2021-06-20 21:54:01+00:00,I had a problem ...,2021,2021-06,5
60cfab60f9f48706707b88d8,5 stars: Excellent,US,2021-06-20 20:56:00+00:00,Amazing Ama...,2021,2021-06,5
60cf6daaf9f48703b0f3ad15,5 stars: Excellent,US,2021-06-20 16:32:42+00:00,Amazing customer service ...,2021,2021-06,5
60cf6d5ff9f48703b0f3ace6,5 stars: Excellent,GB,2021-06-20 16:31:27+00:00,Very quick responses ...,2021,2021-06,5
60cf359bf9f48703b0f388c3,5 stars: Excellent,AU,2021-06-20 12:33:31+00:00,Quick and helpful responses ...,2021,2021-06,5


In [72]:
# Write the prepared reviews to CSV, including the index and the header row
reviews_df.to_csv(
    '../data/elvie_trustpilot_prepped_reviews.csv',
    header=True,
    index=True,
)