# Estimation of Top/Bottom most similar items by damped mean

In [None]:
import pandas as pd
import numpy as np

In [None]:

# Load the review data: a tab-separated file that must provide the
# 'item_id:token' and 'rating:float' columns used below.
file_path = 'health_and_household_reviews'
df = pd.read_csv(file_path, sep='\t')

# Parameters
k = 25  # prior strength: weight given to the global mean in the damped mean
selection_fraction = 0.25  # share of items taken from each end of the ranking

# Mean rating over all reviews; acts as the prior for every item
global_mean = df['rating:float'].mean()

# Per-item sum and count of ratings, both derived from one grouping
item_ratings = df.groupby('item_id:token')['rating:float']
items_rating_sum = item_ratings.sum()
items_rating_count = item_ratings.count()

# Damped mean: (sum + k * global_mean) / (count + k).
# Items with few ratings are pulled toward the global mean, so a handful of
# extreme ratings cannot push an item to either end of the ranking.
items_damped_mean = (items_rating_sum + k * global_mean) / (items_rating_count + k)

# One row per item with its damped mean rating
df_items = pd.DataFrame({
    'item_id': items_damped_mean.index,
    'damped_mean_rating': items_damped_mean.values
})

# Rank items from best to worst damped mean rating
df_items_sorted = df_items.sort_values(by='damped_mean_rating', ascending=False)

# Select the best- and worst-rated items.
# NOTE: the "30_percent" variable names are historical and are kept because
# other cells refer to them; the share actually selected is
# selection_fraction (25%).
n_selected = int(len(df_items_sorted) * selection_fraction)
top_30_percent = df_items_sorted.head(n_selected)
bottom_30_percent = df_items_sorted.tail(n_selected)

# Item ids only, used to filter the review-level DataFrame
top_30_percent_items = top_30_percent['item_id']
bottom_30_percent_items = bottom_30_percent['item_id']

In [None]:
# Average damped mean rating within each selected group of items.
# The variable names say "30_percent", but the printed labels (25%) describe
# the share of items reported here.
mean_top_30_percent = top_30_percent.loc[:, 'damped_mean_rating'].mean()
mean_bottom_30_percent = bottom_30_percent.loc[:, 'damped_mean_rating'].mean()

print("\nMean rating of the top 25% best-rated items:", mean_top_30_percent)
print("\nMean rating of the bottom 25% worst-rated items:", mean_bottom_30_percent)

In [None]:
# Keep only the reviews whose item belongs to the selected best-rated /
# worst-rated item sets; all original review columns are retained.
is_top_item = df['item_id:token'].isin(top_30_percent_items)
is_bottom_item = df['item_id:token'].isin(bottom_30_percent_items)

top_30_percent_df = df.loc[is_top_item]
bottom_30_percent_df = df.loc[is_bottom_item]

In [None]:
# Number of reviews that belong to the selected best-rated and worst-rated
# items. The labels say 25% to agree with the other printed messages and the
# output file names; the "30_percent" variable names are historical.
num_reviews_top_30_percent = len(top_30_percent_df)
print("\nNumber of reviews for the top 25% best-rated items:", num_reviews_top_30_percent)

num_reviews_bottom_30_percent = len(bottom_30_percent_df)
print("\nNumber of reviews for the bottom 25% worst-rated items:", num_reviews_bottom_30_percent)

In [None]:
# Draw a reproducible random sample of up to 5 million reviews from each group.
# DataFrame.sample (without replacement) raises ValueError when asked for more
# rows than exist, so the request is capped at the number of available reviews;
# a group smaller than the target is kept in full (in shuffled order).
sample_size = 5000000
top_30_percent_sample = top_30_percent_df.sample(
    n=min(sample_size, len(top_30_percent_df)), random_state=42
)
bottom_30_percent_sample = bottom_30_percent_df.sample(
    n=min(sample_size, len(bottom_30_percent_df)), random_state=42
)

In [None]:
# Write each sampled set of reviews to its own tab-separated file,
# leaving out the DataFrame index.
for sampled_reviews, output_path in (
    (top_30_percent_sample, 'pattern_top_25_percent_reviews'),
    (bottom_30_percent_sample, 'pattern_bottom_25_percent_reviews'),
):
    sampled_reviews.to_csv(output_path, sep='\t', index=False)