In [1]:
import os
import re
import json
import gzip
import typing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import sys
sys.path.append('../')

from shared.utils.common import get_list_from_text_tuple, rotate_labels
from shared.model.data.features.engineering.recommendation import Recommendation

## Loading Data

In [None]:
# Read the gzip-compressed, pre-processed products CSV into `df`.
products_csv_path = '../shared/data/amz_products_small_pre_processed.csv.gz'
df = pd.read_csv(products_csv_path, compression='gzip')

## Graph Analysis

#### Which categories do the recommended products belong to?

We could look at the degree to which products of each category are recommended alongside products of the other categories.

In [None]:
# Number of rows whose `also_buy` value is missing.
df['also_buy'].isna().sum()

In [None]:
# Index the frame by ASIN. The guard keeps the cell idempotent: once 'asin'
# has been moved into the index, a second `set_index('asin')` would raise a
# KeyError because the column no longer exists.
if df.index.name != 'asin':
    df = df.set_index('asin')

In [None]:
# Preview the first five rows of the re-indexed frame.
df.head(n=5)

In [None]:
# Keep only the products that carry an `also_view` value.
has_also_view = df['also_view'].notna()
also_view_not_null = df[has_also_view]
also_view_not_null.head()

In [None]:
# Raw `also_view` cell of the first product that has one.
also_view_value = also_view_not_null['also_view'].iloc[0]
also_view_value

In [None]:
# Parse the raw `also_view` text with the shared helper. The result is indexed,
# measured with len() and passed to isin() below, so it is presumably a list of
# ASIN strings (TODO confirm against `get_list_from_text_tuple`).
also_view_value_processed = get_list_from_text_tuple(also_view_value)

In [None]:
# Look up the first recommended product in `df` by its index label; an empty
# result means we have no features for that recommended product.
first_recommended = also_view_value_processed[0]
df[df.index == first_recommended]

In [None]:
# How many rows of `df` (indexed by asin) are among the recommended products.
df.index.isin(also_view_value_processed).sum()

Maybe that was bad luck; let's test the recommendations of other products.

In [None]:
# Estimate which share of the `also_view` recommendations point at products
# that exist in `df`. Rows 1..39 are checked; row 0 was inspected by hand above.
#
# `df` is indexed by asin, so membership is tested on the index directly. The
# previous version called `df.reset_index()` inside the loop, which copied the
# whole frame on every iteration.
known_asins = df.index
also_view_values = also_view_not_null.also_view.values
found, total = 0, 0

# Bound the range so a frame with fewer than 40 non-null rows does not raise IndexError.
for idx in range(1, min(40, len(also_view_values))):
    also_view_value = also_view_values[idx]
    also_view_value_processed = get_list_from_text_tuple(also_view_value)
    summed = known_asins.isin(also_view_value_processed).sum()
    if summed > 0:
        # Per-product share (in %) of recommendations that were found.
        print(idx, ': ', round((summed / len(also_view_value_processed))*100, 2))

    found += summed
    total += len(also_view_value_processed)

# Overall share (in %) of recommended products present in `df`.
(found / total)*100

We can see that the vast majority of those recommended products are not in our database (only **~5%** can be found).

Let's see how it looks for the `also_buy` column.

In [None]:
# Keep only the products that carry an `also_buy` value.
also_buy_not_null = df[df['also_buy'].notna()]

In [None]:
# Estimate which share of the `also_buy` recommendations point at products
# that exist in `df` (rows 1..39 of the non-null subset).
#
# Fix: the per-row ratio and the running `total` previously used
# `len(also_view_value_processed)`, a stale list left over from the `also_view`
# analysis, so every denominator was wrong. They now use the `also_buy` list
# parsed in this iteration; any percentage quoted from the old version should
# be re-checked.
#
# `df` is indexed by asin, so membership is tested on the index directly
# instead of rebuilding `df.reset_index()` on every iteration.
known_asins = df.index
also_buy_values = also_buy_not_null.also_buy.values
found, total = 0, 0

# Bound the range so a frame with fewer than 40 non-null rows does not raise IndexError.
for idx in range(1, min(40, len(also_buy_values))):
    also_buy_value = also_buy_values[idx]
    also_buy_value_processed = get_list_from_text_tuple(also_buy_value)
    summed = known_asins.isin(also_buy_value_processed).sum()
    if summed > 0:
        # Per-product share (in %) of recommendations that were found.
        print(idx, ': ', round((summed / len(also_buy_value_processed))*100, 2))

    found += summed
    total += len(also_buy_value_processed)

# Overall share (in %) of recommended products present in `df`.
(found / total)*100

The same happens in this case (we can only identify **~4%** of the recommended products).

So I believe we do not have enough data to be able to answer the initial question. 

#### Is there a relationship between the number of recommended products and the category?

In [None]:
# Work on a 10% sample: there is no need to use every row, it saves computing
# power and should be a good enough approximation.
# `random_state` pins the sample so the plots, the ANOVA tables and the
# positional lookups further down are reproducible across kernel restarts.
# Note that `fillna('')` also replaces a missing `main_cat` with ''.
df_small = (
    df[['also_buy', 'also_view', 'main_cat']]
    .fillna('')
    .sample(frac=0.1, random_state=42)
)
df_small.head()

In [None]:
from tqdm import tqdm


def _count_recommended(raw_value):
    """Return how many items one raw recommendation cell holds.

    Empty cells (the '' placeholder produced by `fillna('')`) count as 0 and
    are never passed to `get_list_from_text_tuple`.
    """
    return len(get_list_from_text_tuple(raw_value)) if raw_value else 0


# Build each count column in one positional pass. The previous row-by-row
# `df_small.loc[idx, ...] = ...` inside `iterrows()` was slow and, if an index
# label is repeated, wrote one row's count to every row sharing that label.
for source_col in ('also_buy', 'also_view'):
    counts = [
        _count_recommended(raw_value)
        for raw_value in tqdm(df_small[source_col], desc=source_col)
    ]
    df_small[f'{source_col}_count'] = counts
    # Keep the explicit integer dtype the original cell ended with.
    df_small[f'{source_col}_count'] = df_small[f'{source_col}_count'].astype('int')

In [None]:
# Distribution of `also_buy_count` per main category; the y-axis is clipped to
# 0-20 so the boxes stay readable.
buy_plot_kwargs = dict(data=df_small, x='main_cat', y='also_buy_count')
chart = sns.boxplot(**buy_plot_kwargs)
rotate_labels(chart)
plt.ylim(bottom=0, top=20)

In [None]:
# Distribution of `also_view_count` per main category; the y-axis is clipped to
# 0-20 so the boxes stay readable.
view_plot_kwargs = dict(data=df_small, x='main_cat', y='also_view_count')
chart = sns.boxplot(**view_plot_kwargs)
rotate_labels(chart)
plt.ylim(bottom=0, top=20)

So we can see that for some categories there is a tendency to have more recommended products than in other categories.

We can validate this with an ANOVA test.

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Fit an OLS model of `also_view_count` on the categorical `main_cat` and
# derive its type-2 ANOVA table.
# NOTE(review): `df_small` was built with fillna(''), so rows with a missing
# `main_cat` form their own '' group here - confirm that is intended.
view_formula = 'also_view_count ~ main_cat'
model = ols(formula=view_formula, data=df_small).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Fit an OLS model of `also_buy_count` on the categorical `main_cat` and
# derive its type-2 ANOVA table.
# NOTE(review): `df_small` was built with fillna(''), so rows with a missing
# `main_cat` form their own '' group here - confirm that is intended.
buy_formula = 'also_buy_count ~ main_cat'
model = ols(formula=buy_formula, data=df_small).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

A feature for the final model could be the number of items recommended to view and to buy (integers).

In [None]:
# Summary statistics of the sampled frame, used to judge the range of the
# count columns before choosing scaling bounds.
df_small.describe()

In [None]:
# Bounds for the recommendation counts: 0 at the bottom and the 95th
# percentile of each count column at the top.
upper_quantile = 0.95
min_also = 0.0
max_also_buy_count = df_small['also_buy_count'].quantile(upper_quantile)
max_also_view_count = df_small['also_view_count'].quantile(upper_quantile)
print(max_also_buy_count, max_also_view_count)

We will cap the min-max scaler at those values, which also takes care of the outliers.

In [None]:
# Feature object for recommendation counts, capped at 59.
# NOTE(review): 59 is hard-coded; it presumably mirrors the 95th-percentile
# value printed by the previous cell (`max_also_buy_count`). Confirm, and
# consider passing that variable so the cap follows the data.
also_buy_recommendation = Recommendation(max_value=59)

In [None]:
# Show the first 20 sampled rows to pick an example value for the feature below.
df_small.head(20)

In [None]:
# Raw `also_view` cell of the row at position 16 of the sample, used as an example input.
# NOTE(review): which product sits at position 16 depends on how `df_small`
# was sampled, and the cell may be '' if that product had no `also_view`.
example_position = 16
also_view_example_value = df_small['also_view'].iloc[example_position]
also_view_example_value

In [None]:
# Compute the feature for the example value.
# NOTE(review): the object is named for `also_buy` but is fed an `also_view`
# value here - confirm this is only a smoke test and not a mix-up.
also_buy_recommendation.get_feature(also_view_example_value)