# Analysis of Amazon App Reviews from the Google Play Store
<br>

In [254]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [255]:
import warnings
# Install a global "ignore" filter so warnings emitted by later cells are not
# shown in the notebook output.
# NOTE(review): this hides every warning category, including deprecation and
# runtime warnings from pandas/plotly — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [256]:
# Read the raw review export from the working directory; `data` keeps the
# frame exactly as loaded, `df_pre` is the handle used by the cleaning cells.
data = pd.read_csv(filepath_or_buffer='amazon_reviews.csv')
df_pre = pd.DataFrame(data=data)

### Data Cleaning

In [257]:
# Peek at the first record to see the raw column names and value formats.
print(df_pre.head(1))

                               reviewId          userName  \
0  d6416c16-c1eb-4b56-920f-77ef07b2d8cb  Sergio Hernandez   

                   content  score  thumbsUpCount reviewCreatedVersion  \
0  Bueneishon apliqueishon      5              0          26.17.2.100   

                    at   appVersion  
0  2024-05-09 15:51:12  26.17.2.100  


In [258]:
# Normalise the raw column names, drop the redundant `appVersion` column and
# derive the review length, in one chained expression.
df_pre = (
    df_pre
    # One mapping keeps every old -> new column name in a single place.
    .rename(columns={
        'content': 'review',
        'score': 'rating',
        'reviewCreatedVersion': 'AppVersion',
        'at': 'DateOfReview',
    })
    # errors='ignore' keeps the cell re-runnable: on a second execution the
    # column is already gone and a plain drop would raise KeyError.
    .drop(columns=['appVersion'], errors='ignore')
    # Number of characters in each review text.
    .assign(reviewLength=lambda frame: frame['review'].str.len())
)

In [259]:
# Show the column names after renaming, one per line.
print(*df_pre.columns, sep='\n')

reviewId
userName
review
rating
thumbsUpCount
AppVersion
DateOfReview
reviewLength


What each column shows:
- `reviewId`: ID of the review
- `userName`: Username of the reviewer
- `review`: The text the reviewer left as the review
- `rating`: The reviewer's rating from 1 (bad) to 5 (good)
- `thumbsUpCount`: Number of likes given to a review by other users
- `AppVersion`: The version of the app the reviewer had installed 
- `DateOfReview`: The date and time when the review was posted
- `reviewLength`: The number of characters in the review

In [260]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns. This runs on df_pre, i.e. before rows with missing values are
# dropped further down in the notebook.
df_pre.describe()

Unnamed: 0,rating,thumbsUpCount,reviewLength
count,50213.0,50213.0,50213.0
mean,2.41079,13.748173,222.850895
std,1.637335,85.970094,135.558826
min,1.0,0.0,4.0
25%,1.0,0.0,118.0
50%,2.0,1.0,179.0
75%,4.0,3.0,311.0
max,5.0,5660.0,1594.0


In [261]:
# Count the missing values in each column before deciding how to handle them.
print(df_pre.isna().sum())

reviewId            0
userName            5
review              0
rating              0
thumbsUpCount       0
AppVersion       4805
DateOfReview        0
reviewLength        0
dtype: int64


In [262]:
# Keep only the rows that have a value in every column; `df` is the cleaned
# frame used by the analysis cells.
df = df_pre.dropna(axis=0, how='any')

# Sanity check: every per-column missing-value count should now be zero.
print(df.isna().sum())

reviewId         0
userName         0
review           0
rating           0
thumbsUpCount    0
AppVersion       0
DateOfReview     0
reviewLength     0
dtype: int64


In [263]:
# Inspect the dtype of each column in the cleaned frame.
# NOTE(review): the recorded output lists DateOfReview as `object`, so it is
# not a datetime column here — any time-based analysis would presumably need
# it parsed first; confirm.
df.dtypes

reviewId         object
userName         object
review           object
rating            int64
thumbsUpCount     int64
AppVersion       object
DateOfReview     object
reviewLength      int64
dtype: object

### Exploratory Data Analysis

In [264]:
# Histogram of the star ratings in the cleaned data
fig_ratings = px.histogram(
    df,
    x='rating',
    nbins=5,
    title='Distribution of Ratings',
    color_discrete_sequence=['darkgreen'],
)
fig_ratings.show()

As we can see from the plot above, most reviews are left either by users who found their experience with the app very bad (1-star) or by those who found their experience with the app great (5-star).
<br>
This makes sense, as it can be expected that those with good or bad experiences are more likely to post a review.

In [267]:
# Work on a copy so the plotting cells cannot alter the cleaned frame.
df_copy = df.copy()

# Legend order (best rating first) and the colour assigned to each rating.
color_order = [5, 4, 3, 2, 1]
color_map = dict(zip(color_order, ['green', 'blue', 'yellow', 'orange', 'red']))

# Histogram of review length in characters, split by rating.
fig_length = px.histogram(
    df_copy,
    x='reviewLength',
    color='rating',
    color_discrete_map=color_map,
    category_orders={'rating': color_order},
    labels={'reviewLength': 'Review Length', 'rating': 'Rating'},
    title='Distribution of Review Length (# of chars.) by Rating',
)
fig_length.show()

As we can see from the plot above, the longest reviews are the ones left by reviewers whose experience with the app was the worst.

In [268]:
# Scatter plot of Review Length vs. Thumbs Up Count
# `labels` is keyed by DataFrame column name, so the x-axis entry must be
# 'reviewLength' (the plotted column); a key that matches no column is
# silently ignored and the axis would fall back to the raw column name.
fig_length_thumbs = px.scatter(
    df_copy,
    x='reviewLength',
    y='thumbsUpCount',
    color='rating',
    title='Review Length vs. Thumbs Up Count',
    labels={
        'reviewLength': 'Review Length',
        'thumbsUpCount': 'Thumbs Up Count',
        'rating': 'Rating',
    },
)

fig_length_thumbs.show()

As we can see from the plot above, the reviews with more likes tend to be those which give the app either the lowest rating or the highest rating. Once again, this can be expected, as users are likely to have shared concerns in the case of the 1 or 2 star reviews, and will tend to agree on what the app has done well in the case of the 5-star reviews.