In [None]:
import os
import sys

# Capture the notebook's directory and change the working directory exactly once.
# `os.chdir("../..")` is relative to the *current* working directory, so without
# this guard every re-run of the cell would climb two more levels and break all
# relative paths used further down.
if "dir2" not in globals():
    dir2 = os.path.abspath("")
    os.chdir("../..")

# Make the notebook's parent directory importable.
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

In [None]:
import logging
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from hydra import compose, initialize

plt.style.use("ggplot")

from scipy.stats import kurtosis, skew
from src.preprocessing import ClassicDataset
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from adjustText import adjust_text
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import umap

In [None]:
# Paths and file names
# `config_path` (with the leading "../..") is what gets passed to hydra's
# `initialize` below, while the directory listing is resolved against the current
# working directory, i.e. without that prefix. Building both from `config_dir`
# replaces the former magic slice `config_path[6:]`.
config_dir = os.path.join("config", "dataset")
config_path = os.path.join("..", "..", config_dir)

# `os.listdir` returns entries in arbitrary order; sorting keeps the dataset order
# (and therefore every table and plot below) reproducible across machines.
config_files = sorted(
    os.path.splitext(f)[0]
    for f in os.listdir(os.path.join(os.path.abspath("."), config_dir))
    if f.endswith(".yaml")
)

In [None]:
def get_dataset_features(
    ratings: pd.DataFrame,
    user_id: str = "userId",
    item_id: str = "itemId",
    rating_col: str = "rating",
) -> dict:
    """
    Calculate descriptive features of a user-item interaction log.

    Every row of `ratings` counts as one interaction; the rating values
    themselves are not used.

    Parameters:
        ratings (pd.DataFrame): DataFrame containing user-item ratings.
        user_id (str): Name of the column representing user IDs.
        item_id (str): Name of the column representing item IDs.
        rating_col (str): Name of the column representing ratings. Currently
            unused; kept so existing callers keep working.

    Returns:
        dict: A dictionary with the keys
            - "Nu", "Ni", "Nr": number of unique users, unique items, rows.
            - "SpaceSize" (Nu * Ni), "Shape" (Nu / Ni), "Density" (Nr / SpaceSize).
            - "Rpu", "Rpi": average number of rows per user / per item.
            - "Ginii", "Giniu": Gini coefficient of the per-item / per-user
              interaction counts.
            - "APB", "StPB", "SkPB", "KuPB": mean, std, skewness and kurtosis of
              the per-user average item popularity.
            - "LTavg", "LTstd", "LTsk", "LTku": mean, std, skewness and kurtosis
              of the per-user share of long-tail items.

    Raises:
        ValueError: If `ratings` contains no users or no items, because every
            ratio below would be undefined.
    """
    # 1-3 Basic features
    U = ratings[user_id].nunique()
    I = ratings[item_id].nunique()
    R = ratings.shape[0]
    if U == 0 or I == 0:
        raise ValueError("ratings must contain at least one user and one item")

    # 4 SpaceSize
    SpaceSize = U * I

    # 5 Shape
    Shape = U / I

    # 6 Density
    Density = R / SpaceSize

    # 7, 8 Ratings per user and ratings per item
    Rpu = R / U
    Rpi = R / I

    # 9, 10 Ginii and Giniu
    def _gini(counts_ascending: np.ndarray, n: int) -> float:
        # The k-th smallest count (k = 1..n) is weighted by (n + 1 - k).
        ranks = np.arange(1, n + 1)
        return 1 - 2 * np.sum((n + 1 - ranks) * counts_ascending) / ((n + 1) * R)

    # value_counts() sorts descending, so reversing yields ascending counts.
    Ginii = _gini(ratings[item_id].value_counts().values[::-1], I)
    Giniu = _gini(ratings[user_id].value_counts().values[::-1], U)

    # 11-14 Popularity biases
    # Item popularity = share of all users that interacted with the item;
    # a user's popularity bias = mean popularity over that user's rows.
    item_user_share = ratings.groupby(item_id)[user_id].nunique() / U
    ratings_with_popularity = ratings.join(
        item_user_share.rename("item_popularity"), on=item_id
    )
    popularity_bias = ratings_with_popularity.groupby(user_id)["item_popularity"].mean()
    APB = popularity_bias.mean()
    StPB = popularity_bias.std()
    SkPB = skew(popularity_bias)
    KuPB = kurtosis(popularity_bias)

    # 15-18 Long tail items
    # Items are ordered from most to least popular; the long tail starts at the
    # first item whose cumulative share of interactions exceeds 80%.
    item_counts = ratings[item_id].value_counts()
    cumulative_share = item_counts.cumsum() / R
    LT_index = int(np.argmax((cumulative_share > 0.8).to_numpy()))
    LT_items = item_counts.index[LT_index:]

    # Per-user share of distinct items that belong to the long tail. Working on
    # de-duplicated (user, item) pairs gives the same result as building a set
    # per user, without a Python-level callback for every user.
    user_item_pairs = ratings[[user_id, item_id]].drop_duplicates()
    LT_items_percentage = (
        user_item_pairs[item_id].isin(LT_items).groupby(user_item_pairs[user_id]).mean()
    )
    LTavg = LT_items_percentage.mean()
    LTstd = LT_items_percentage.std()
    LTsk = skew(LT_items_percentage)
    LTku = kurtosis(LT_items_percentage)

    return {
        "Nu": U,
        "Ni": I,
        "Nr": R,
        "SpaceSize": SpaceSize,
        "Shape": Shape,
        "Density": Density,
        "Rpu": Rpu,
        "Rpi": Rpi,
        "Ginii": Ginii,
        "Giniu": Giniu,
        "APB": APB,
        "StPB": StPB,
        "SkPB": SkPB,
        "KuPB": KuPB,
        "LTavg": LTavg,
        "LTstd": LTstd,
        "LTsk": LTsk,
        "LTku": LTku,
    }

In [None]:
def get_cold_items(
    train_set: pd.DataFrame, val_set: pd.DataFrame, test_set: pd.DataFrame, column: str
):
    """
    Calculate the fraction of "cold" entities in the validation and test sets,
    i.e. distinct values of `column` that never occur in the training set.

    Despite its name the function works for any ID column; the notebook calls
    it for both the user column and the item column.

    Parameters:
        train_set (pandas.DataFrame): The training dataset.
        val_set (pandas.DataFrame): The validation dataset.
        test_set (pandas.DataFrame): The test dataset.
        column (str): The name of the column containing the IDs to compare.

    Returns:
        tuple: A tuple containing two fractions:
            - fraction_items_only_in_val_set (float): Share of the distinct values
              in the validation set that are absent from the training set.
            - fraction_items_only_in_test_set (float): Same for the test set.
            A fraction is NaN when the corresponding set is empty, since the
            share is undefined (previously this raised ZeroDivisionError).
    """
    items_train_set = set(train_set[column])

    def _cold_fraction(eval_set: pd.DataFrame) -> float:
        # Share of distinct IDs in `eval_set` that the training set never saw.
        eval_items = set(eval_set[column])
        if not eval_items:
            return float("nan")
        return len(eval_items - items_train_set) / len(eval_items)

    return _cold_fraction(val_set), _cold_fraction(test_set)

In [None]:
# Per-dataset statistics, keyed by the dataset name from its config:
#   statistics_raw        - features of the raw ratings file
#   statistics_filtered   - features of the prepared data + cold user/item shares
#   statistics_wo_cold_iu - features after the time split with cold users/items
#                           removed from val/test, plus split and date statistics
statistics_raw = dict()
statistics_filtered = dict()
statistics_wo_cold_iu = dict()
for config_file in tqdm(config_files, desc="Processing config files"):
    with initialize(config_path=config_path, version_base=None):
        cfg_data = compose(config_name=config_file)
    # init some constants
    dataset_name = cfg_data["name"]
    user_id = cfg_data["user_column"]
    item_id = cfg_data["item_column"]
    rating_col = cfg_data["rating_column"]
    date_col = cfg_data["date_column"]
    splitting_conf = cfg_data["splitting"]
    # NOTE: the per-dataset fractions in `splitting_conf` are currently not used;
    # the same fixed 80/10/10 split is applied to every dataset.
    # train_size = splitting_conf["train_size"]
    # val_size = splitting_conf["val_size"]
    # test_size = splitting_conf["test_size"]
    train_size = 0.8
    val_size = 0.1
    test_size = 0.1

    # Compare with a tolerance: exact float equality rejects valid splits such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    if not np.isclose(train_size + val_size + test_size, 1.0):
        raise ValueError("Expected total fraction equal to 1")

    # get raw data
    ratings = pd.read_parquet(
        os.path.join(cfg_data["data_src"], cfg_data["ratings_file"])
    )

    # get raw dataset features
    statistics_raw[dataset_name] = get_dataset_features(
        ratings, user_id, item_id, rating_col
    )

    # build the prepared dataset; its statistics are stored as the "filtered" ones
    ds = ClassicDataset()
    ds.prepare(cfg_data)
    interactions_processed = ds.prepared_data

    # get filtered dataset features
    filtered_stats = get_dataset_features(
        interactions_processed, user_id, item_id, rating_col
    )
    statistics_filtered[dataset_name] = filtered_stats

    # Make train/val/test split by time, without filtering cold users/items yet.
    # Sorting returns a new frame so the dataset object's own data is not mutated.
    interactions_processed = interactions_processed.sort_values(date_col)
    date_at_val_percentile = interactions_processed[date_col].quantile(
        1 - val_size - test_size
    )
    date_at_test_percentile = interactions_processed[date_col].quantile(1 - test_size)

    train_set = interactions_processed[
        interactions_processed[date_col] <= date_at_val_percentile
    ]
    val_set = interactions_processed[
        (interactions_processed[date_col] > date_at_val_percentile)
        & (interactions_processed[date_col] <= date_at_test_percentile)
    ]
    test_set = interactions_processed[
        interactions_processed[date_col] > date_at_test_percentile
    ]

    # Get cold users
    cold_users_val, cold_users_test = get_cold_items(
        train_set, val_set, test_set, user_id
    )
    filtered_stats["ColdUsersVal"] = cold_users_val
    filtered_stats["ColdUsersTest"] = cold_users_test
    # Get cold items
    cold_items_val, cold_items_test = get_cold_items(
        train_set, val_set, test_set, item_id
    )
    filtered_stats["ColdItemsVal"] = cold_items_val
    filtered_stats["ColdItemsTest"] = cold_items_test

    # filtering Cold users and Items
    # `.copy()` detaches the filtered frames from `interactions_processed`, so the
    # date-column assignments below do not write into a slice
    # (SettingWithCopyWarning).
    train_users = set(train_set[user_id])
    train_items = set(train_set[item_id])
    val_set = val_set[
        val_set[user_id].isin(train_users) & val_set[item_id].isin(train_items)
    ].copy()
    test_set = test_set[
        test_set[user_id].isin(train_users) & test_set[item_id].isin(train_items)
    ].copy()

    # get splitted and filtered dataset features
    data_splitted = pd.concat([train_set, val_set, test_set]).reset_index(drop=True)

    split_stats = get_dataset_features(data_splitted, user_id, item_id, rating_col)
    statistics_wo_cold_iu[dataset_name] = split_stats

    # some additional features: share of interactions / users / items that fall
    # into the validation and test parts of the cold-filtered data
    n_interactions = len(data_splitted)
    n_users = data_splitted[user_id].nunique()
    n_items = data_splitted[item_id].nunique()

    split_stats["FracInteractionsVal"] = len(val_set) / n_interactions
    split_stats["FracInteractionsTest"] = len(test_set) / n_interactions

    split_stats["FracPassiveUsersVal"] = val_set[user_id].nunique() / n_users
    split_stats["FracPassiveUsersTest"] = test_set[user_id].nunique() / n_users

    split_stats["FracPassiveItemsVal"] = val_set[item_id].nunique() / n_items
    split_stats["FracPassiveItemsTest"] = test_set[item_id].nunique() / n_items

    # timestamps
    # Infer the epoch unit from the number of digits of the first raw timestamp.
    # `.iloc[0]` is positional, so it also works when the index has no label 0.
    # NOTE(review): every value with more than 11 digits is treated as nanoseconds;
    # millisecond/microsecond epochs would be misread - confirm no dataset uses them.
    n_digits = len(str(int(ratings[date_col].iloc[0])))
    if n_digits > 11:
        unit = 'ns'
    elif n_digits > 6:
        unit = 's'
    else:
        unit = 'W'

    # full data
    data_splitted[date_col] = pd.to_datetime(data_splitted[date_col], unit=unit)
    split_stats["MinDate"] = data_splitted[date_col].min().to_pydatetime()
    split_stats["MaxDate"] = data_splitted[date_col].max().to_pydatetime()

    # validation Set
    val_set[date_col] = pd.to_datetime(val_set[date_col], unit=unit)
    split_stats["ValDate"] = val_set[date_col].min().to_pydatetime()
    split_stats["DeltaVal"] = (val_set[date_col].max() - val_set[date_col].min()).days

    # test Set
    test_set[date_col] = pd.to_datetime(test_set[date_col], unit=unit)
    split_stats["TestDate"] = test_set[date_col].min().to_pydatetime()
    split_stats["DeltaTest"] = (test_set[date_col].max() - test_set[date_col].min()).days

# Persist the three statistics dictionaries so the analysis cells below can be
# re-run without repeating the expensive loop above.
stats_dir = os.path.join("results", "datasets_statistics")
# Create the output directory if needed; otherwise `open` raises
# FileNotFoundError only after all datasets have been processed.
os.makedirs(stats_dir, exist_ok=True)
for stats_file, stats_to_dump in (
    ("statistics_raw.pcl", statistics_raw),
    ("statistics_filtered.pcl", statistics_filtered),
    ("statistics_wo_cold_iu.pcl", statistics_wo_cold_iu),
):
    with open(os.path.join(stats_dir, stats_file), "wb") as f:
        pickle.dump(stats_to_dump, f)

In [None]:
# Reload the cached statistics dictionaries written by the previous cell.
# These pickles are produced by this notebook; do not point this at untrusted
# files, since unpickling can execute arbitrary code.
stats_dir = os.path.join("results", "datasets_statistics")
loaded_stats = []
for stats_file in (
    "statistics_raw.pcl",
    "statistics_filtered.pcl",
    "statistics_wo_cold_iu.pcl",
):
    with open(os.path.join(stats_dir, stats_file), "rb") as f:
        loaded_stats.append(pickle.load(f))
statistics_raw, statistics_filtered, statistics_wo_cold_iu = loaded_stats

In [None]:
# Convert the dictionary to a pandas DataFrame
statistics_raw_df = pd.DataFrame(statistics_raw).T
statistics_filtered_df = pd.DataFrame(statistics_filtered).T
statistics_wo_cold_iu_df = pd.DataFrame(statistics_wo_cold_iu).T

In [None]:
# Show the full table of statistics for the split data without cold users/items
# (one row per dataset).
statistics_wo_cold_iu_df

In [None]:
# Datasets ordered by user-item space size, smallest first.
statistics_wo_cold_iu_df["SpaceSize"].sort_values()

In [None]:
# Names of all available features (the DataFrame's column index).
statistics_wo_cold_iu_df.columns

In [None]:
# Datasets ordered by density, densest first (ascending sort, then reversed).
statistics_wo_cold_iu_df["Density"].sort_values().iloc[::-1]

In [None]:
# Distribution, across datasets, of the share of users/items that appear in the
# validation and test parts.
columns = ['FracPassiveUsersVal', 'FracPassiveItemsVal', 'FracPassiveUsersTest', 'FracPassiveItemsTest']

for column in columns:
    # Use a dedicated name here: `data` is the feature DataFrame used by the
    # cells further down and should not be rebound to a plain list.
    column_values = [stats[column] for stats in statistics_wo_cold_iu.values()]
    fig, ax = plt.subplots()
    sns.histplot(column_values, kde=True, color='skyblue', bins=10, ax=ax)
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
    ax.set_title(column)
    plt.show()

In [None]:
# Histograms for splitting: share of cold users/items in val and test
columns = ['ColdUsersVal', 'ColdUsersTest', 'ColdItemsVal', 'ColdItemsTest']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

# `axes.flat` walks the 2x2 grid row by row, matching the order of `columns`.
for ax, column in zip(axes.flat, columns):
    statistics_filtered_df[column].hist(ax=ax, color='skyblue', alpha=0.7, bins=30)
    ax.set_title(column)
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Datasets whose test part holds < 2% or whose validation part holds < 1% of
# the interactions after removing cold users/items.
statistics_wo_cold_iu_df.loc[
    (statistics_wo_cold_iu_df["FracInteractionsTest"] < 0.02)
    | (statistics_wo_cold_iu_df["FracInteractionsVal"] < 0.01)
]

In [None]:
# Datasets where more than 90% of the test users or test items are cold.
statistics_filtered_df.loc[
    (statistics_filtered_df["ColdUsersTest"] > 0.9)
    | (statistics_filtered_df["ColdItemsTest"] > 0.9)
]

In [None]:
# Drop datasets that have a missing value in any statistic.
statistics_wo_cold_iu_df = statistics_wo_cold_iu_df.dropna()

In [None]:
# Numeric features used by the comparison plots below (date columns excluded).
feature_columns = [
    'Nu', 'Ni', 'Nr', 'SpaceSize', 'Shape', 'Density', 'Rpu', 'Rpi',
    'Ginii', 'Giniu', 'APB', 'StPB', 'SkPB', 'KuPB', 'LTavg', 'LTstd',
    'LTsk', 'LTku', 'FracInteractionsVal', 'FracInteractionsTest',
    'FracPassiveUsersVal', 'FracPassiveUsersTest', 'FracPassiveItemsVal',
    'FracPassiveItemsTest',
]
# The statistics frame is built by transposing a frame that mixes numbers and
# datetimes, so its columns have object dtype. Cast to float so the numeric
# code below receives a proper numeric matrix.
data = statistics_wo_cold_iu_df[feature_columns].astype(float)

In [None]:
# PCA is driven by variance, so the features are first rescaled to [0, 1].
# Without this, the components are dominated by the few features with huge
# magnitudes (e.g. SpaceSize, Nr) and the remaining ones are effectively ignored.
data_scaled = MinMaxScaler().fit_transform(data)

# Perform PCA with 2 components
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(data_scaled)

# Create a new DataFrame for the PCA results
pca_df = pd.DataFrame(data=pca_result, columns=["PC1", "PC2"], index=data.index)

# Plot the scatter plot of the first two principal components
plt.figure(figsize=(18, 12))
plt.scatter(pca_df["PC1"], pca_df["PC2"], s=100)
plt.title("PCA - Datasets Comparison")
plt.xlabel("PC1")
plt.ylabel("PC2")
# A bare `plt.grid()` toggles the grid, which switches it OFF under the "ggplot"
# style (grid enabled by default); request it explicitly instead.
plt.grid(True)

# Add text labels with adjust_text to avoid overlapping
texts = [plt.text(x, y, dataset, fontsize=12) for dataset, (x, y) in pca_df.iterrows()]
adjust_text(texts, arrowprops=dict(arrowstyle="-", color="black", lw=0.5))

plt.show()

In [None]:
# Plot t-SNE
# t-SNE works on Euclidean distances, so rescale the features to [0, 1] first;
# otherwise the largest-magnitude features determine all neighbourhoods.
data_scaled = MinMaxScaler().fit_transform(data)

# scikit-learn requires perplexity < n_samples. Cap the default (30) so the cell
# also runs when there are 30 or fewer datasets; for larger collections the
# value is unchanged.
perplexity = min(30.0, len(data) - 1)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
tsne_results = tsne.fit_transform(data_scaled)

plt.figure(figsize=(18, 12))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], alpha=0.8)

# Add labels using dataset names
texts = []
for i, dataset in enumerate(data.index):
    texts.append(plt.text(tsne_results[i, 0], tsne_results[i, 1], dataset))

adjust_text(
    texts,
    arrowprops=dict(arrowstyle="-", color="gray"),
    autoalign="xy",
    force_points=0.1,
    force_text=0.1,
)

plt.title("t-SNE Visualization of Datasets")
plt.show()

In [None]:
# Dimension reduction using UMAP
# UMAP is run with the Euclidean metric, so rescale the features to [0, 1] first;
# otherwise the largest-magnitude features determine all neighbourhoods.
data_scaled = MinMaxScaler().fit_transform(data)
umap_reducer = umap.UMAP(n_neighbors=5, min_dist=0.1, metric="euclidean", random_state=42)
umap_results = umap_reducer.fit_transform(data_scaled)

# Clustering using KMeans
# Seeded like the other estimators so cluster assignments (and colours) are
# reproducible; n_init is pinned so the result does not depend on the
# scikit-learn version's default.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(umap_results)

# Get the size of each cluster; minlength keeps one entry per cluster
cluster_sizes = np.bincount(kmeans_labels, minlength=n_clusters)

# Calculate the scaling factor for circle sizes (largest cluster -> max size)
max_circle_size = 200
scaling_factor = max_circle_size / np.max(cluster_sizes)

# Plot UMAP with cluster colors and circle sizes
plt.figure(figsize=(18, 12))
for cluster_num in range(n_clusters):
    cluster_points = umap_results[kmeans_labels == cluster_num]
    cluster_size = cluster_sizes[cluster_num]
    circle_size = cluster_size * scaling_factor
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], s=circle_size, label=f"Cluster {cluster_num + 1}")

# Add labels using dataset names
texts = []
for i, dataset in enumerate(data.index):
    texts.append(plt.text(umap_results[i, 0], umap_results[i, 1], dataset))

adjust_text(
    texts,
    arrowprops=dict(arrowstyle="-", color="gray"),
    autoalign="xy",
    force_points=0.1,
    force_text=0.1,
)

plt.title("UMAP Visualization of Datasets with Clusters")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.legend()
plt.show()

In [None]:
# Rescale every feature to [0, 1] so all columns share one colour scale.
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(data)
df_normalized = pd.DataFrame(
    normalized_values, index=data.index, columns=data.columns
)

# Heatmap: one row per dataset, one column per (normalized) feature.
fig, ax = plt.subplots(figsize=(24, 18))
sns.heatmap(
    df_normalized,
    annot=True,
    cmap="YlGnBu",
    vmin=0,
    vmax=1,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Comparison of Datasets Features")
ax.set_xlabel("Features")
ax.set_ylabel("Datasets")
plt.show()

In [None]:
# Box plot of each normalized feature across all datasets.
fig, ax = plt.subplots(figsize=(18, 6))
sns.boxplot(data=df_normalized, palette="Set3", ax=ax)
ax.set_title("Distribution of Feature Values across Datasets")
ax.set_xlabel("Features")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel("Feature Values")
plt.show()

In [None]:
# Violin plot of each normalized feature across all datasets (quartiles marked).
fig, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=df_normalized, inner="quart", ax=ax)
ax.set_title("Distribution of Feature Values across Datasets")
ax.set_xlabel("Features")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel("Feature Values")
plt.show()

In [None]:
# Create separate figures for each feature's histogram
for feature in data.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(data[feature], bins=30)
    ax.set_xlabel("Feature Values")
    ax.set_ylabel("Frequency")
    ax.set_title(f"Distribution of {feature} in Datasets")
    # A bare `grid()` call toggles the grid, which switches it OFF under the
    # "ggplot" style (grid enabled by default); request it explicitly instead.
    ax.grid(True)
    plt.show()