In [13]:
from Utils.dataset import Dataset, ContentWiseImpressions, read_dataset

In [14]:
from Utils.dataset import Dataset, ContentWiseImpressions, read_dataset
from SerialContentAnalysisFunctions import *
DATASET_VARIANT = ContentWiseImpressions.Variant.CW10M
dataset: ContentWiseImpressions = read_dataset(DATASET_VARIANT, use_items=False)

In [None]:
dataset_path = "data/ContentWiseImpressions/CW10M-CSV/interactions.csv"
d = pd.read_csv(dataset_path)

In [None]:
d.series_id.nunique()

In [20]:
import gc
import os
from typing import Dict, Tuple
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sp
import seaborn as sns

# Seaborn style applied to every figure created in this notebook.
__STYLE = "whitegrid"
# Seaborn plotting context. "paper" selects the 16x9 size below; any other
# value selects the 20x20 size.
__CONTEXT = "paper"

# Figure dimensions handed to matplotlib's `figsize`, chosen from the context.
if __CONTEXT == "paper":
    __FIG_SIZE_WIDTH, __FIG_SIZE_HEIGHT = 16, 9
else:
    __FIG_SIZE_WIDTH, __FIG_SIZE_HEIGHT = 20, 20

# Resolution (dots per inch) of the generated figures.
__FIG_DPI = 150

sns.set_context(__CONTEXT)
sns.set_style(__STYLE)



def _get_urm_plots_filenames(
        plot_filepath: str,
        norm_plot_filepath: str
) -> Tuple[str, str]:
    urm_heatmap_with_means_filename = os.path.join(
        plot_filepath,
        f"urm_with_means.png"
    )
    norm_urm_heatmap_with_means_filename = os.path.join(
        norm_plot_filepath,
        f"urm_with_means.png"
    )

    return urm_heatmap_with_means_filename, norm_urm_heatmap_with_means_filename


def urm_plots_exists(
        plot_path: str,
        norm_plot_path: str,
) -> bool:
    """
    Tell whether both URM heatmap images are already present on disk.

    :param plot_path: directory of the plain URM plot.
    :param norm_plot_path: directory of the normalized URM plot.
    :return: ``True`` only when the normalized and the plain image both exist.
    """
    plain_file, normalized_file = _get_urm_plots_filenames(
        plot_filepath=plot_path,
        norm_plot_filepath=norm_plot_path,
    )

    # Same evaluation order as a short-circuit `and`: normalized first, then plain.
    if not os.path.exists(normalized_file):
        return False
    return os.path.exists(plain_file)


def generate_urm_heatmap_plot(
        urm: sp.csr_matrix,
        user_popularity: np.ndarray,
        item_popularity: np.ndarray,
        plot_path: str,
        norm_plot_path: str,
) -> None:
    """
    Draw the URM as a heatmap flanked by user/item popularity plots and save it as PNG.

    Rows and columns of the URM are reordered by descending popularity before
    plotting. Two images are produced, one titled "Normalized URM Visualization"
    and one titled "URM Visualization"; an image whose file already exists is
    not regenerated. Missing output directories are created.

    NOTE(review): the "normalized" variant currently differs from the plain one
    only by title and output path -- no normalization is applied to the data.
    NOTE(review): both variants share the base name ``urm_with_means.png``, so
    passing the same directory for ``plot_path`` and ``norm_plot_path`` produces
    only the first (normalized-titled) image.

    The figure is divided in a 4x3 grid where
      * The URM heatmap color-bar goes in 0,2

      * The URM heatmap goes in 1,2

      * The URM User-Popularity Boxplot goes in 1,0
      * The URM User-Popularity Scatter plot goes in 1,1
      * The URM Item-Popularity Scatter plot goes in 2,2
      * The URM Item-Popularity Boxplot goes in 3,2

      * E represent empty cells of the map.

           0           1           2
       ------------------------------------
     0 |   E      |   E      |  URM       |
       |   E      |   E      |  color-bar |
       |__________|__________|____________|
       | User-Pop | User-Pop |  URM       |
       | Boxplot  | Scatter  |  Heatmap   |
       |          |          |            |
     1 |          |          |            |
       |          |          |            |
       |          |          |            |
       |__________|__________|____________|
     2 |   E      |   E      |  Item-Pop  |
       |   E      |   E      |   Scatter  |
       |__________|__________|____________|
     3 |   E      |   E      |  Item-Pop  |
       |   E      |   E      |   Boxplot  |
       |__________|__________|____________|

    :param urm: sparse user-rating matrix; it is converted to a dense array for
        the heatmap, so memory grows with ``num_users * num_items``.
    :param user_popularity: one value per URM row, used to sort the rows.
    :param item_popularity: one value per URM column, used to sort the columns.
    :param plot_path: directory of the plain plot.
    :param norm_plot_path: directory of the normalized plot.
    :return: nothing; the images are written to disk.
    """
    (
        urm_heatmap_with_means_filename,
        norm_urm_heatmap_with_means_filename
    ) = _get_urm_plots_filenames(
        plot_filepath=plot_path,
        norm_plot_filepath=norm_plot_path,
    )

    # (output file, figure title) of each variant, normalized first.
    variants = [
        (norm_urm_heatmap_with_means_filename, "Normalized URM Visualization"),
        (urm_heatmap_with_means_filename, "URM Visualization"),
    ]

    # Nothing to do: skip the expensive sort and dense conversion altogether.
    if all(os.path.exists(filename) for filename, _ in variants):
        return

    # Sort users and items by descending popularity once, without rebinding the
    # arguments, so that both variants are drawn from exactly the same data.
    popular_user_indices_desc = np.flip(np.argsort(user_popularity))
    popular_item_indices_desc = np.flip(np.argsort(item_popularity))

    sorted_urm = urm[popular_user_indices_desc, :][:, popular_item_indices_desc]
    sorted_user_popularity = user_popularity[popular_user_indices_desc]
    sorted_item_popularity = item_popularity[popular_item_indices_desc]

    num_users, num_items = sorted_urm.shape
    heatmap_data = sorted_urm.toarray()

    num_rows = 4
    num_cols = 3
    height_rows_ratios = [5, 75, 10, 10]
    width_cols_ratios = [10, 10, 80]

    for output_filename, plot_title in variants:
        # Re-checked on every iteration: when both variants resolve to the same
        # file, the first save makes the second one redundant.
        if os.path.exists(output_filename):
            continue

        # NOTE(review): the canvas is square (width used twice) and
        # __FIG_SIZE_HEIGHT is not used -- confirm this is intended.
        fig: plt.Figure = plt.figure(
            figsize=(__FIG_SIZE_WIDTH, __FIG_SIZE_WIDTH),
            dpi=__FIG_DPI
        )
        try:
            gs = plt.GridSpec(
                nrows=num_rows,
                ncols=num_cols,
                figure=fig,
                height_ratios=height_rows_ratios,
                width_ratios=width_cols_ratios,
            )

            ax_heatmap_color_bar: plt.Axes = fig.add_subplot(gs[0, 2])
            ax_heatmap: plt.Axes = fig.add_subplot(gs[1, 2])

            ax_user_boxplot: plt.Axes = fig.add_subplot(gs[1, 0])
            # The user scatter shares the heatmap's y axis (one position per user).
            ax_user_scatter: plt.Axes = fig.add_subplot(
                gs[1, 1],
                sharey=ax_heatmap
            )

            # The item scatter shares the heatmap's x axis (one position per item).
            ax_item_scatter: plt.Axes = fig.add_subplot(
                gs[2, 2],
                sharex=ax_heatmap
            )
            ax_item_boxplot: plt.Axes = fig.add_subplot(gs[3, 2])

            sns.heatmap(
                data=heatmap_data,
                ax=ax_heatmap,
                cmap="YlGnBu",
                cbar_ax=ax_heatmap_color_bar,
                cbar_kws={"orientation": "horizontal"},
            )

            sns.boxplot(
                x=sorted_user_popularity,
                color="orange",
                ax=ax_user_boxplot,
            )
            sns.scatterplot(
                y=np.arange(num_users),
                x=sorted_user_popularity,
                color="orange",
                ax=ax_user_scatter,
            )

            sns.boxplot(
                y=sorted_item_popularity,
                color="red",
                ax=ax_item_boxplot,
            )
            sns.scatterplot(
                x=np.arange(num_items),
                y=sorted_item_popularity,
                color="red",
                ax=ax_item_scatter,
            )

            ax_heatmap.set_xlabel("Item Ids")
            ax_heatmap.set_ylabel("User Ids")

            ax_user_boxplot.tick_params(labelleft=False, labelright=False)
            ax_user_scatter.tick_params(labelleft=False, labelright=False)

            ax_item_boxplot.tick_params(labeltop=False, labelbottom=False)
            ax_item_scatter.tick_params(labeltop=False, labelbottom=False)

            ax_heatmap.set_title("User-Rating Matrix")

            ax_user_boxplot.set_title("User Popularity")
            ax_user_scatter.set_title("User Popularity")

            ax_item_boxplot.set_title("Item Popularity")
            ax_item_scatter.set_title("Item Popularity")

            fig.suptitle(
                t=plot_title
            )
            fig.tight_layout()

            # savefig does not create missing folders by itself.
            output_dir = os.path.dirname(output_filename)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            fig.savefig(output_filename)
        finally:
            # Always release the figure, even when plotting or saving fails.
            fig.clear()
            plt.close(fig=fig)

            gc.collect()

In [22]:
user_popularity = np.ediff1d(dataset.URM["train"].indptr)
item_popularity = np.ediff1d(dataset.URM["train"].tocsc().indptr)


In [None]:
# The plain and the normalized image share the same base name
# ("urm_with_means.png"), so each variant needs its own directory: with a single
# shared directory the second variant is skipped because its file "already exists".
# Paths are relative to the working directory instead of the filesystem root.
URM_PLOTS_DIR = os.path.join("plots", "urm")
URM_NORM_PLOTS_DIR = os.path.join("plots", "urm_normalized")
os.makedirs(URM_PLOTS_DIR, exist_ok=True)
os.makedirs(URM_NORM_PLOTS_DIR, exist_ok=True)

generate_urm_heatmap_plot(
    urm=dataset.URM["train"],
    user_popularity=user_popularity,
    item_popularity=item_popularity,
    plot_path=URM_PLOTS_DIR,
    norm_plot_path=URM_NORM_PLOTS_DIR,
)

In [18]:
dataset.URM["train"]

<38875x18279 sparse matrix of type '<class 'numpy.int32'>'
	with 532985 stored elements in Compressed Sparse Row format>

In [3]:
urm_extended = elasticnet_URM_train_with_bingewatching(dataset.URM["train"])

/home/matteo/Desktop/serial-content-analysis/parquets/ts_differential_df_50_4hours.parquet
Loading existing parquet file in parquets/ts_differential_df_50_4hours_train.parquet
Df_bw loaded


In [4]:
urm = urm_extended.todense()
urm.shape

(38876, 18280)

In [12]:
binge_worthy = urm[-1, :]
binge_worthy[binge_worthy > 0]

matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       dtype=int64)

In [11]:
binge_watchers = urm[:, -1]
binge_watchers[binge_watchers > 0]

matrix([[1, 1, 1, ..., 1, 1, 1]], dtype=int64)

In [7]:
from scipy import stats
stats.describe(binge_watchers)

DescribeResult(nobs=38876, minmax=(array([0], dtype=int64), array([1], dtype=int64)), mean=array([0.02711184]), variance=array([0.02637747]), skewness=array([5.82341119]), kurtosis=array([31.91211785]))

In [8]:
stats.describe(binge_watchers[binge_watchers > 0].T)

DescribeResult(nobs=1054, minmax=(array([1], dtype=int64), array([1], dtype=int64)), mean=array([1.]), variance=array([0.]), skewness=array([0.]), kurtosis=array([-3.]))

In [9]:
stats.describe(binge_worthy.T)

DescribeResult(nobs=18280, minmax=(array([0], dtype=int64), array([1], dtype=int64)), mean=array([0.01247265]), variance=array([0.01231775]), skewness=array([8.78567406]), kurtosis=array([75.18806878]))

In [10]:
stats.describe(binge_worthy[binge_worthy > 0].T)

DescribeResult(nobs=228, minmax=(array([1], dtype=int64), array([1], dtype=int64)), mean=array([1.]), variance=array([0.]), skewness=array([0.]), kurtosis=array([-3.]))

In [None]:
stats.describe(dataset.URM["train"].todense())

In [None]:
stats.describe(dataset.URM["validation"].todense())

In [None]:
stats.describe(dataset.URM["test"].todense())

In [None]:
urm.sum(axis=0)
URM_train = dataset.URM["train"]
URM_train

In [None]:
dataset.URM["test"]

In [None]:
dataset.URM["validation"]

In [None]:
df_bw = get_timestamp_differential_dataframe(df=None, percentage=50, session_threshold_hours=4, train=True)


In [None]:
def _read_dictionary(local_file_path: str) -> dict:
    with open(local_file_path, "r") as f:
        return json.load(f)

# Folder holding the CW10M translation tables (presumably raw id -> URM index;
# verify against the code that writes them).
translation_tables_dir = os.path.join(
    os.getcwd(), "data", "ContentWiseImpressions", "CW10M"
)

translation_user_id_index_urm = _read_dictionary(
    os.path.join(translation_tables_dir, "translation_user_id_index_urm.json")
)
translation_series_id_index_urm = _read_dictionary(
    os.path.join(translation_tables_dir, "translation_series_id_index_urm.json")
)

# Positions in df_bw of the first occurrence of every distinct user id.
_, user_indices = np.unique(df_bw.user_id.values, return_index=True)
# bingewatchers = {translation_user_id_index_urm[user]: bw for user, bw in zip([df_bw.user_id.values[index] for index in sorted(user_indices)],
#                                               np.array(df_bw.groupby("user_id").sum().n_bingewatching_sessions_3_to_7.values,
#                                                        dtype=int))}



In [None]:
translation_user_id_index_urm["42151"]

In [None]:
df_bw

In [None]:
# Total number of 3-to-7 binge-watching sessions of every user. Iterating the
# grouped Series keeps each user id paired with its own total; zipping ids in
# first-appearance order with group sums (sorted by id) can mismatch them.
bw_sessions_per_user = df_bw.groupby("user_id")["n_bingewatching_sessions_3_to_7"].sum()

# The translation table is loaded from JSON, so its keys are strings: the user id
# is converted before the lookup.
# NOTE(review): assumes str(user_id) matches the key format of the JSON file.
bingewatchers = {
    translation_user_id_index_urm[str(user)]: int(bw)
    for user, bw in bw_sessions_per_user.items()
}

# Column vector with one entry per URM row plus one extra trailing entry.
URM_bingewatchers_column_array = np.zeros((URM_train.shape[0] + 1, 1), dtype=int)

for urm_row, n_sessions in bingewatchers.items():
    URM_bingewatchers_column_array[urm_row] = n_sessions



In [None]:
# Generate bingeworthy row to be added at the bottom of the URM.
# Total number of 3-to-7 binge-watching sessions of every series. Iterating the
# grouped Series keeps each series id paired with its own total; zipping ids in
# first-appearance order with group sums (sorted by id) can mismatch them.
bw_sessions_per_series = df_bw.groupby("series_id")["n_bingewatching_sessions_3_to_7"].sum()

# The translation table is loaded from JSON, so its keys are strings: the series
# id is converted before the lookup. Series without any such session are left out.
# NOTE(review): assumes str(series_id) matches the key format of the JSON file.
bingeworthy_series = {
    translation_series_id_index_urm[str(series)]: int(bw)
    for series, bw in bw_sessions_per_series.items()
    if int(bw) != 0
}

URM_bingeworthy_row_array = np.zeros((1, URM_train.shape[1]), dtype=int)

# The array has a single row: the URM index selects the column, not the row.
for urm_column, n_sessions in bingeworthy_series.items():
    URM_bingeworthy_row_array[0, urm_column] = n_sessions

In [None]:
d = dataset.interactions
d

In [None]:
# print(dataset.metadata)
# dataset.metadata["num_items"] = dataset.interactions.item_id.nunique()
# dataset.metadata["num_recommendations"] = dataset.interactions.recommendation_id.nunique()
# dataset.metadata["num_series"] = dataset.interactions.series_id.nunique()
# dataset.metadata["num_users"] = dataset.interactions.user_id.nunique()
# print(dataset.metadata)
# dataset.save_metadata()

In [None]:
dataset.interactions = clean_dataset_CW_class(dataset.interactions)
dataset.interactions

In [None]:
d = dataset.interactions
d

In [None]:
d[(d["interaction_type"] == 0) & (d["vision_factor"] >= 0.9)]

In [None]:
difference_after_filtering(d, d[(d["interaction_type"] == 0) & (d["vision_factor"] >= 0.9)])

In [None]:
d = d[(d["interaction_type"] == 0) & (d["vision_factor"] >= 0.9)]
d

In [None]:
train, validation, test = user_temporal_split(d)

In [None]:
get_dataset_statistics(train)

In [None]:
get_dataset_statistics(test)

In [None]:
get_dataset_statistics(validation)

In [None]:
get_dataset_statistics(d)

In [None]:
len(set(test.user_id) - set(train.user_id))

In [None]:
len(test[(test["user_id"].isin(train.user_id)) & (test["series_id"].isin(train.series_id))])

In [None]:
len(validation[(validation["user_id"].isin(train.user_id) & (validation["series_id"].isin(train.series_id)))])

In [None]:
len(test[test["user_id"].isin(train.user_id)])

In [None]:
d.user_id.nunique(), d.series_id.nunique()

In [None]:
train.user_id.nunique(), train.series_id.nunique()

In [None]:
validation.user_id.nunique(), validation.series_id.nunique()

In [None]:
test.user_id.nunique(), test.series_id.nunique()

In [None]:
import dask

In [None]:
num_user, num_items = train.user_id.nunique(), train.series_id.nunique()

In [None]:
URM_train = np.zeros(shape=(num_user, num_items))
URM_train.shape

In [None]:
train_chunks = separate_dataset_into_chunks(train)


In [None]:
train_chunks[1000]

In [None]:
max(train.user_id.values)

In [None]:
len(np.unique(train.user_id.values))

### Resort table

In [None]:
u_ids, s_ids = np.unique(train.user_id.values), np.unique(train.series_id.values)
URM_index_user_id_correspondence_dict = {u_ids[i]: i for i in range(len(u_ids))}
URM_index_series_id_correspondence_dict = {s_ids[i]: i for i in range(len(s_ids))}


In [None]:

for chunk in tqdm(train_chunks):
    URM_train[URM_index_user_id_correspondence_dict[chunk.user_id.values[0]], URM_index_series_id_correspondence_dict[chunk.series_id.values[0]]] = len(chunk)

In [None]:
for k, v in URM_index_series_id_correspondence_dict.items():
    if v == 18089:
        s = k
print(s)
for k, v in URM_index_user_id_correspondence_dict.items():
    if v == 38874:
        u = k
train[(train["user_id"] == u) & (train["series_id"] == s)]


In [None]:
URM_train[38874, 18089]

In [None]:
np.argwhere(URM_train > 0 )

In [None]:
from SerialContentAnalysisFunctions import *

In [None]:
dataset_path = "data/ContentWiseImpressions/CW10M-CSV/interactions.csv"
df = pd.read_csv(dataset_path)

In [None]:
df = clean_dataset_CW_class(df)

In [None]:
# NOTE(review): the variable is named "..._4h" but the call passes
# session_threshold_hours=2, while the "..._2h" frame of the following cell is
# built with session_threshold_hours=4 -- names and thresholds look swapped.
# Confirm which one is intended before relying on either variable.
df_50_4h = get_timestamp_differential_dataframe(df=df, percentage=50, session_threshold_hours=2, train=True, store=True)


In [None]:
# NOTE(review): the variable is named "..._2h" but the call passes
# session_threshold_hours=4, while the "..._4h" frame of the previous cell is
# built with session_threshold_hours=2 -- names and thresholds look swapped.
# Confirm which one is intended before relying on either variable.
df_50_2h = get_timestamp_differential_dataframe(df=df, percentage=50, session_threshold_hours=4, train=True, store=True)


In [None]:
# Build the two vectors meant to extend the URM train of SLIM ElasticNet:
# - a row vector with the bingeworthiness of every series j
# - a column vector with the binge-watching ratio of every user i
#
# NOTE(review): despite its name, df_50_2h is built with
# session_threshold_hours=4 in the cell that defines it.
df_bw = df_50_2h
print("Df_bw loaded")

session_columns = ["n_bingewatching_sessions_3_to_7", "n_sessions"]

# Generate bingewatchers column to be added at the right of the URM.
# Ratios are computed on the grouped frame itself so that every id stays paired
# with its own value (zipping ids in first-appearance order with group sums,
# which are sorted by id, can mismatch them).
user_totals = df_bw.groupby("user_id")[session_columns].sum()
user_bw_ratio = (
    user_totals["n_bingewatching_sessions_3_to_7"] / user_totals["n_sessions"]
).astype(np.float16)

# Zero ratios are left out; so are undefined ones (0 sessions -> 0/0).
bingewatchers = {
    user: bw
    for user, bw in user_bw_ratio.items()
    if bw != 0 and np.isfinite(bw)
}

# Float storage: with an integer array every ratio below 1 is truncated to 0.
URM_bingewatchers_column_array = np.zeros((URM_train.shape[0] + 1, 1), dtype=np.float32)

# NOTE(review): the raw user id is used directly as URM row position; confirm the
# ids coincide with URM indices, otherwise map them through the translation table.
for k, i in bingewatchers.items():
    URM_bingewatchers_column_array[k] = i

print(URM_bingewatchers_column_array)

# Generate bingeworthy row to be added at the bottom of the URM.
series_totals = df_bw.groupby("series_id")[session_columns].sum()
series_bw_ratio = (
    series_totals["n_bingewatching_sessions_3_to_7"] / series_totals["n_sessions"]
).astype(np.float16)

bingeworthy_series = {
    series: bw
    for series, bw in series_bw_ratio.items()
    if bw != 0 and np.isfinite(bw)
}

URM_bingeworthy_row_array = np.zeros((1, URM_train.shape[1]), dtype=np.float32)

# The array has a single row: the key selects the column, not the row.
# NOTE(review): same id-vs-index caveat as for the users above.
for k, i in bingeworthy_series.items():
    URM_bingeworthy_row_array[0, k] = i
print(URM_bingeworthy_row_array)

# URM_train = URM_train.todense()
# URM_train = np.concatenate((URM_train, URM_bingeworthy_row_array), axis=0)
# URM_train = np.concatenate((URM_train, URM_bingewatchers_column_array), axis=1)

In [None]:
df_50_1h = get_timestamp_differential_dataframe(df=df, percentage=50, session_threshold_hours=1, train=True, store=True)
df_50_3h = get_timestamp_differential_dataframe(df=df, percentage=50, session_threshold_hours=3, train=True, store=True)
df_50_6h = get_timestamp_differential_dataframe(df=df, percentage=50, session_threshold_hours=6, train=True, store=True)

In [None]:
# True when the sum of all entries of the train URM is zero.
# The sparse matrix is summed directly: converting it to a dense
# users x items array first only costs memory and gives the same total.
dataset.URM["train"].sum() == 0

In [None]:
dataset.interactions[:int(len(dataset.interactions)*0.7)]

In [None]:
import numpy as np

# np.split cannot be called without split points (it raises TypeError), so the
# three consecutive parts are sliced explicitly.
# NOTE(review): the 70% boundary comes from the previous cell; splitting the
# remaining rows evenly (15% / 15%) is an assumption -- adjust if needed.
interactions = dataset.interactions
first_cut = int(len(interactions) * 0.7)
second_cut = int(len(interactions) * 0.85)

a, b, c = (
    interactions[:first_cut],
    interactions[first_cut:second_cut],
    interactions[second_cut:],
)


In [None]:
a

In [None]:
b

In [None]:
c

In [None]:
import numpy as np

from lightfm.datasets import fetch_stackexchange

data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

train = data['train']
test = data['test']


In [None]:
item_features = data['item_features']
tag_labels = data['item_feature_labels']


In [None]:
# Smallest column sum of the item-feature matrix.
# Summing over axis 0 yields a single-row matrix; wrapping it in list() gives a
# one-element list, so the builtin min() returned the whole row instead of its
# minimum. The matrix' own .min() reduces it to a scalar, and summing the sparse
# matrix directly avoids the dense conversion.
item_features.sum(axis=0).min()

In [None]:
a = np.zeros((1, 28881))
a

In [None]:
d = {0: 1, 10230: 1}
a[0][[k for k in d.keys()]] = 1
a