In [130]:
import awswrangler as wr
import pandas as pd
import plotly.express as px

In [131]:
# Render every float in pandas output with exactly two decimal places.
pd.set_option("display.float_format", "{:.2f}".format)

In [132]:
# Read the Parquet dataset stored under the eda/counts/ prefix into a DataFrame.
df_counts = wr.s3.read_parquet("s3://amazon-reviews-eafit/eda/counts/")

In [133]:
# Display the counts frame (columns: metric, value) as this cell's output.
df_counts

Unnamed: 0,metric,value
0,Total Records,109745480
1,Filtered Records (No Nulls),109524497
2,Verified Records,86384963
3,Customers with three or more reviews count,67893558


In [134]:
# Report how many records each cleaning stage removed, i.e. the difference
# between consecutive counts in df_counts. The printed numbers are differences
# (records dropped), not the totals that remain, so the labels say
# "eliminados" (removed).
#
# Counts are looked up by metric name rather than by row position: the frame is
# read from a Parquet dataset on S3 and its row order is not something this
# cell should depend on.
counts_by_metric = df_counts.set_index("metric")["value"]

removed_by_null_filter = (
    counts_by_metric["Total Records"]
    - counts_by_metric["Filtered Records (No Nulls)"]
)
removed_by_verified_filter = (
    counts_by_metric["Filtered Records (No Nulls)"]
    - counts_by_metric["Verified Records"]
)
removed_by_min_reviews_filter = (
    counts_by_metric["Verified Records"]
    - counts_by_metric["Customers with three or more reviews count"]
)

print(f"Registros eliminados al filtrar nulos: {removed_by_null_filter}")
print(
    f"Registros eliminados al filtrar la gente que no compró: {removed_by_verified_filter}"
)
print(
    f"Registros eliminados al conservar solo gente con al menos 3 reviews: {removed_by_min_reviews_filter}"
)

Total de registros después de filtrar nulos: 220983
Total de registros después de filtrar la gente que no compró: 23139534
Total de registros después de filtrar gente que ha hecho al menos 3 reviews: 18491405


In [135]:
# Read the Parquet dataset stored under the eda/summary_statistics/ prefix.
df_summary_statistics = wr.s3.read_parquet(
    "s3://amazon-reviews-eafit/eda/summary_statistics/"
)

In [136]:
# Display the summary-statistics frame (columns: metric, value) as this cell's output.
df_summary_statistics

Unnamed: 0,metric,value
0,Distinct Products Count,11009564.0
1,Distinct Customers Count,7612393.0
2,Average Rating,4.26
3,Average Reviews per User,8.92
4,Average Reviews per Product,6.17


In [137]:
# Read the Parquet dataset stored under the eda/avg_rating_per_category/ prefix.
df_avg_ratings_per_category = wr.s3.read_parquet(
    "s3://amazon-reviews-eafit/eda/avg_rating_per_category/"
)

In [138]:
# Display the per-category average ratings (columns: category, avg_rating).
df_avg_ratings_per_category

Unnamed: 0,category,avg_rating
0,digital_video_games,3.97
1,software,3.94
2,music,4.58
3,office_products,4.25
4,furniture,4.2
5,electronics,4.18
6,pc,4.22
7,digital_software,3.73
8,automotive,4.34
9,shoes,4.29


In [151]:
# Bar chart of avg_rating for each category. The plotly update_* methods
# return the figure, so the styling steps are chained onto its construction.
fig = (
    px.bar(
        df_avg_ratings_per_category,
        x="category",
        y="avg_rating",
        title="Average Ratings by Category",
    )
    .update_traces(marker_color="blue")
    .update_layout(showlegend=False)
    .update_xaxes(title_text="Category", tickangle=-45)
    .update_yaxes(title_text="Average Rating")
)

# Render the bar chart
fig.show()

In [152]:
# Read the Parquet dataset stored under the eda/records_per_category/ prefix.
df_rows_per_category_after_cleaning = wr.s3.read_parquet(
    "s3://amazon-reviews-eafit/eda/records_per_category/"
)

In [153]:
# Show the per-category counts ordered by category name. sort_values returns a
# sorted copy, so the stored frame keeps its original row order.
df_rows_per_category_after_cleaning.sort_values("category")

Unnamed: 0,category,count
32,apparel,4212567
8,automotive,2537436
16,baby,1123022
10,beauty,3356403
14,books,209179
36,camera,1195962
30,digital_ebook_purchase,3122403
21,digital_music_purchase,1057557
7,digital_software,50963
19,digital_video_download,2088923


In [155]:
# Bar chart of the record count for each category.
fig = px.bar(
    data_frame=df_rows_per_category_after_cleaning,
    x="category",
    y="count",
    title="Counts by Category",
)

# All bars share one colour, so no legend is needed.
fig.update_traces(marker_color="blue")

# Axis titles and tick rotation are set in a single layout update.
fig.update_layout(
    showlegend=False,
    xaxis_title_text="Category",
    xaxis_tickangle=-45,
    yaxis_title_text="Counts",
)

# Render the bar chart
fig.show()

In [142]:
# Read the Parquet dataset stored under the eda/total_rows_per_category/ prefix.
df_total_rows_per_category_before_cleaning = wr.s3.read_parquet(
    "s3://amazon-reviews-eafit/eda/total_rows_per_category/"
)

In [143]:
# Show the per-category row counts ordered by category name. sort_values
# returns a sorted copy, so the stored frame keeps its original row order.
df_total_rows_per_category_before_cleaning.sort_values("category")

Unnamed: 0,category,count
1,apparel,5881740
2,automotive,3510834
3,baby,1749122
4,beauty,5094202
5,books,3105251
6,camera,1800802
7,digital_ebook_purchase,5100061
8,digital_music_purchase,1681414
36,digital_software,101832
9,digital_video_download,3998206


In [144]:
# Read the eda/count_of_customers_by_reviews/ dataset and keep it ordered by
# review_count (ascending). The original index labels are preserved.
df_counts_customers_by_reviews = wr.s3.read_parquet(
    "s3://amazon-reviews-eafit/eda/count_of_customers_by_reviews/"
).sort_values("review_count")

In [145]:
# First five rows of the frame in its current row order.
df_counts_customers_by_reviews.head(n=5)

Unnamed: 0,review_count,user_count
534,1,10609061
535,2,3941172
536,3,1996516
537,4,1247269
538,5,845690


In [146]:
# Last five rows of the frame in its current row order.
df_counts_customers_by_reviews.tail(n=5)

Unnamed: 0,review_count,user_count
529,1363,1
530,1381,1
531,1559,1
532,1598,1
533,1636,1


In [147]:
# Bar chart of how many customers (user_count) wrote a given number of
# reviews (review_count).
fig = px.bar(
    df_counts_customers_by_reviews,
    x="review_count",
    y="user_count",
    title="User Count by Review Count",
    labels={"review_count": "Review Count", "user_count": "User Count"},
)

# Update the bar color (optional)
fig.update_traces(marker_color="blue")

# Limit the initial view to review counts 1 through 50. Bars are centred on
# their x value, so the window is padded by half a unit on each side; a range
# of exactly [1, 50] would cut the first and last bars in half.
fig.update_xaxes(title_text="Conteo de reviews", range=[0.5, 50.5])

# Update the y-axis (optional)
fig.update_yaxes(
    title_text="Frecuencia",
)

# Display the bar plot
fig.show()

In [150]:
# Running total of user_count, accumulated in the frame's current row order and
# stored as a new column on the frame.
# NOTE(review): this is only a cumulative distribution over review_count if the
# frame is already sorted by review_count - confirm the load cell ran first.
cumulative_users = df_counts_customers_by_reviews["user_count"].cumsum()
df_counts_customers_by_reviews["cumulative_user_count"] = cumulative_users

# Human-readable names for the plotted columns.
axis_labels = {
    "review_count": "Review Count",
    "cumulative_user_count": "Cumulative User Count",
}

# Line plot of the running total. The initial x window is [2, 50], so the
# point at review_count == 1 falls outside the initial view.
fig = px.line(
    data_frame=df_counts_customers_by_reviews,
    x="review_count",
    y="cumulative_user_count",
    title="Cumulative Frequency Distribution of User Count by Review Count",
    labels=axis_labels,
    range_x=[2, 50],
)

# Render the cumulative frequency distribution plot
fig.show()