<a href="https://colab.research.google.com/github/kr5red/automated-customer-reviews/blob/main/model3_new_ki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Data & Modules

In [None]:
import pandas as pd
import numpy as np

import pickle
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path

import data_prep
# Show complete frames in notebook output: no column, cell-width, or row truncation.
for display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)

# Base review table produced by the project's data_prep module.
df_resampled = data_prep.make_dataframe()

# ADDING SENTIMENT AND CATEGORY COLUMNS
# Both pickles are merged on "new_id" below, so each is expected to carry that column.
with Path("data/sentiment_columns.pkl").open("rb") as sentiment_file:
    sentiment_columns = pickle.load(sentiment_file)

with Path("data/category_columns.pkl").open("rb") as category_file:
    category_columns = pickle.load(category_file)

# Embedding columns are currently disabled:
# with open("data/embedding_columns.pkl", "rb") as f:
#     embedding_columns = pickle.load(f)

# Left joins on "new_id" keep every row of the base table.
df_resampled = (
    df_resampled
    .merge(sentiment_columns, on="new_id", how="left")
    .merge(category_columns, on="new_id", how="left")
    # .merge(embedding_columns, on="new_id", how="left")
)

# Notebook display: the distinct predicted categories after the merge.
df_resampled['predicted_product_category'].unique()
# 4 categories: [' E-readers', 'Tablets', ' Batteries', ' Smart Home Devices']
# 6 categories: [' E-readers', 'Tablets', ' Batteries', ' Kids Electronics', ' Smart Speakers', ' Streaming Devices']

array([' E-readers', 'Tablets', ' Batteries', ' Kids Electronics',
       ' Smart Speakers', ' Streaming Devices'], dtype=object)

In [None]:
#df_view = df_resampled.drop(columns=["sourceURLs", "imageURLs", "keys"])
#df_view.head(1)

#df_view['reviews.didPurchase'].value_counts()
#df_view['reviews.doRecommend'].value_counts()

#df_view.reviews.numHelpful.sum()
#df_view.reviews.rating.sum()

#df_view.reviews.rating.value_counts()
#df_view['id'].value_counts()
#df_view['name'].value_counts()

#df_resampled[df_resampled['id']=='AV1YE_muvKc47QAVgpwE'].head(100)

#### Pivots of Best and Worst Products

In [None]:


def _longest_name(names):
    """Return the longest non-empty name in *names*, or None when there is none."""
    candidates = names.dropna().astype(str)
    if candidates.empty:
        return None
    # max() keeps the first of several equally long names.
    longest = max(candidates, key=len)
    return longest or None


# One row per (predicted category, product id) with review counts and raw aggregates.
out = (
    df_resampled
    .groupby(["predicted_product_category", "id"])
    .agg(
        # review volume and sentiment counts
        count_new_id=("new_id", "count"),
        count_positive=("sentiment", lambda x: (x == "positive").sum()),
        count_negative=("sentiment", lambda x: (x == "negative").sum()),
        count_neutral=("sentiment", lambda x: (x == "neutral").sum()),

        # purchase / recommendation / helpfulness aggregates
        count_didPurchase=("reviews.didPurchase", "sum"),
        count_doRecommend=("reviews.doRecommend", "sum"),
        sum_numHelpful=("reviews.numHelpful", "sum"),
        # NOTE: despite its name this column holds the MEAN rating; the name is
        # kept so existing references to "sum_rating" keep working.
        sum_rating=("reviews.rating", "mean"),
        brand_value=("brand", "first"),

        # longest product name; None when the group has no usable name
        longest_name=("name", _longest_name),
    )
    .reset_index()
)

# Only products with at least 10 reviews are ranked.
out = out[out['count_new_id'] >= 10].reset_index(drop=True)

# Apply the "fallback to brand" that the aggregation only left a placeholder for.
out['longest_name'] = out['longest_name'].fillna(out['brand_value'])

# Per-category totals, used to normalise the frequency and recommendation scores.
out["count_new_id_by_category"] = (
    out.groupby("predicted_product_category")["count_new_id"]
       .transform("sum")
)

out["count_do_recommend_by_category"] = (
    out.groupby("predicted_product_category")["count_doRecommend"]
       .transform("sum")
)

# Share of each sentiment among the product's reviews.
out['positive_sentiment_ratio'] = out['count_positive'] / out['count_new_id']
out['negative_sentiment_ratio'] = out['count_negative'] / out['count_new_id']
out['neutral_sentiment_ratio'] = out['count_neutral'] / out['count_new_id']

# Component scores.
out['sentiment_score'] = out['positive_sentiment_ratio'] - out['negative_sentiment_ratio']
# NOTE(review): dividing by 5 assumes ratings are on a 5-point scale — confirm.
out['rating_score'] = out['sum_rating'] / 5

out['frequency_score'] = out['count_new_id'] / out['count_new_id_by_category']

# A category without any recommendation yields 0/0 = NaN, which would turn the
# total score into NaN and make the integer rank conversion below fail; such
# products get a recommendation score of 0 instead.
out['recommendation_score'] = (
    out['count_doRecommend'] / out['count_do_recommend_by_category']
).fillna(0.0)

# Two weightings of the same components; only total_score_1 drives the ranking.
out['total_score_1'] = (
    0.35 * out['sentiment_score']
    + 0.35 * out['rating_score']
    + 0.15 * out['frequency_score']
    + 0.15 * out['recommendation_score']
)
out['total_score_2'] = (
    0.4 * out['sentiment_score']
    + 0.4 * out['rating_score']
    + 0 * out['frequency_score']
    + 0.2 * out['recommendation_score']
)

# Dense ranks within each category: rank 1 is the best (resp. worst) score and
# tied products share a rank.
out["best_rank_in_category"] = (
    out.groupby("predicted_product_category")["total_score_1"]
      .rank(method="dense", ascending=False).astype(int)
)

out["worst_rank_in_category"] = (
    out.groupby("predicted_product_category")["total_score_1"]
      .rank(method="dense", ascending=True).astype(int)
)

# Move the rank and score columns to the front, then sort by category and score (both descending).
front_cols = ['best_rank_in_category', 'worst_rank_in_category', 'total_score_1', 'total_score_2']
cols = front_cols + [col for col in out.columns if col not in front_cols]
out = (
    out[cols]
    .sort_values(by=['predicted_product_category', 'total_score_1'], ascending=False)
    .reset_index(drop=True)
)
# Key identifying a product within its predicted category.
out['category_id'] = out['predicted_product_category'] + '_' + out['id']

# Products ranked 1-3 from the top / from the bottom of each category.
best_products = (
    out[out['best_rank_in_category'] <= 3]
    .sort_values(by=['predicted_product_category', 'best_rank_in_category'], ascending=True)
    .reset_index(drop=True)
)
worst_products = (
    out[out['worst_rank_in_category'] <= 3]
    .sort_values(by=['predicted_product_category', 'worst_rank_in_category'], ascending=True)
)
# In small categories a product can be in both lists; keep it only as a "best" product.
worst_products = (
    worst_products[~worst_products['category_id'].isin(best_products['category_id'])]
    .reset_index(drop=True)
)

# Independent copy of the review table for the later summary steps.
df_summary = df_resampled.copy()

### Adding the Reviews to Best and Worst Products

In [None]:
# Tag every review with the same "<category>_<id>" key used by the product tables.
df_summary['category_id'] = df_summary['predicted_product_category'] + '_' + df_summary['id']

# Keep only the reviews that belong to a best (resp. worst) product.
best_keys = best_products['category_id'].unique()
worst_keys = worst_products['category_id'].unique()

df_summary_best = df_summary.loc[df_summary['category_id'].isin(best_keys)]
df_summary_worst = df_summary.loc[df_summary['category_id'].isin(worst_keys)]


def _top_helpful_texts(reviews, label, limit):
    """Return the text of up to *limit* reviews with sentiment *label*, most helpful first."""
    top_reviews = (
        reviews
        .loc[reviews['sentiment'] == label]
        .sort_values(by='reviews.numHelpful', ascending=False)
        .head(limit)
    )
    # Missing texts are skipped so the caller can always join the result.
    return top_reviews['name_title_text'].dropna().astype(str).tolist()


def make_summary_strings(products_df,
                         summary_df,
                         positive_label="positive",
                         negative_label="negative",
                         max_reviews=20):
    """Attach one concatenated string of the most helpful reviews to each product.

    For every row of ``products_df`` a budget of ``max_reviews`` reviews is
    split between positive and negative reviews in proportion to the
    product's positive/negative sentiment ratios (neutral reviews are not
    used). Within each sentiment the reviews with the highest
    ``reviews.numHelpful`` are taken and their ``name_title_text`` values are
    joined with single spaces, positive reviews first.

    Args:
        products_df: One row per product. Must contain ``category_id``,
            ``predicted_product_category``, ``id``, ``count_new_id``,
            ``brand_value``, ``longest_name``, ``positive_sentiment_ratio``
            and ``negative_sentiment_ratio``.
        summary_df: One row per review. Must contain ``category_id``,
            ``sentiment``, ``reviews.numHelpful`` and ``name_title_text``.
        positive_label: Value of ``sentiment`` that marks a positive review.
        negative_label: Value of ``sentiment`` that marks a negative review.
        max_reviews: Total review budget per product before the
            positive/negative split; each share is truncated to an integer.

    Returns:
        A new DataFrame with the columns ``category_id``,
        ``predicted_product_category``, ``id``, ``count_new_id``,
        ``brand_value``, ``longest_name`` and ``summary_reviews_string``.
        ``products_df`` itself is not modified.
    """
    summary_strings = []

    for row in products_df.itertuples(index=False):
        # Share of positive vs. negative among the opinionated reviews.
        opinionated = row.positive_sentiment_ratio + row.negative_sentiment_ratio
        if opinionated > 0:
            positive_share = row.positive_sentiment_ratio / opinionated
            negative_share = row.negative_sentiment_ratio / opinionated
        else:
            # Only neutral reviews (or missing ratios): nothing to quote, and
            # dividing would raise ZeroDivisionError.
            positive_share = negative_share = 0.0

        # Filter this product's reviews once and reuse them for both sentiments.
        product_reviews = summary_df.loc[summary_df['category_id'] == row.category_id]

        review_texts = (
            _top_helpful_texts(product_reviews, positive_label,
                               int(max_reviews * positive_share))
            + _top_helpful_texts(product_reviews, negative_label,
                                 int(max_reviews * negative_share))
        )
        summary_strings.append(" ".join(review_texts))

    base_cols = ['category_id', 'predicted_product_category', 'id',
                 'count_new_id', 'brand_value', 'longest_name']
    # assign() builds a new frame, so the caller's products_df is left untouched.
    return products_df[base_cols].assign(summary_reviews_string=summary_strings)

# Replace each product table with its slimmed-down version that carries the
# concatenated review text in "summary_reviews_string".
best_products = make_summary_strings(products_df=best_products, summary_df=df_summary_best)
worst_products = make_summary_strings(products_df=worst_products, summary_df=df_summary_worst)

# Next step: generate each product's summary from its "summary_reviews_string" column.