In [52]:
import functools as ft
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rbo
import seaborn as sns

In [53]:
# Identifiers of the application variants that are compared throughout the notebook.
app_ids = ["simple", "sampling", "range-key", "segmented"]
# One recommendations export per variant, listed in the same order as app_ids.
recommendations = [f"../out/{app_id}-recommendations.json" for app_id in app_ids]

In [55]:
# Import data
# Read every export with the "table" orientation; dfs keeps the order of `recommendations`.
dfs = [pd.read_json(path, orient="table") for path in recommendations]

In [None]:
# Merge Datasets
# The first DataFrame seeds the merge. Its non-key columns are remembered so
# the unsuffixed copies can be removed once every app has been joined in.
join_columns = ["user", "walks", "walk_length", "limit"]
df = dfs[0]
duplicated_columns = [name for name in df.columns if name not in join_columns]

# Join every DataFrame (the first one included); clashing columns coming from
# the right-hand side receive an "_<app_id>" suffix.
for app_id, right_df in zip(app_ids, dfs):
    df = df.merge(right_df, how="inner", on=join_columns, suffixes=("", f"_{app_id}"))

# Remove the unsuffixed columns of the seed DataFrame (they duplicate the
# suffixed copy created by the first merge).
df = df.drop(columns=duplicated_columns)

# The inner joins must neither drop nor multiply rows.
for original_df in dfs:
    assert len(df) == len(original_df)

df.head()

# Data Enrichment

In [None]:
# Calculate top 3 recommendations
# Lists shorter than three items are kept as they are; longer ones are cut to their first three entries.
for app_id in app_ids:
    full_lists = df[f"recommendations_{app_id}"]
    df[f"top_3_recommendations_{app_id}"] = full_lists.apply(
        lambda recs: recs if len(recs) < 3 else recs[:3]
    )

# Utils

In [None]:
def triangular_mask(dim):
    """Return a ``dim`` x ``dim`` float array with ones strictly above the diagonal.

    Entries on and below the main diagonal are zero. Used to mask heatmaps.
    """
    return np.triu(np.ones((dim, dim)), k=1)

# Mask sized for the app_ids list. Note: heatmap() builds its own local mask
# from the frame it receives and does not read this module-level variable.
mask = triangular_mask(len(app_ids))

def heatmap(frame, title="", vmin=0, vmax=1):
    """Draw an annotated seaborn heatmap of a pivoted DataFrame.

    The cells strictly above the diagonal are masked out, the colour scale is
    fixed to ``[vmin, vmax]`` and both axis labels are blanked.

    Args:
        frame: Pivoted DataFrame to plot; the mask is sized from its unique index values.
        title: Title set on the current matplotlib axes.
        vmin: Lower bound of the colour scale.
        vmax: Upper bound of the colour scale.

    Returns:
        The matplotlib Axes returned by ``sns.heatmap``.
    """
    n_rows = len(frame.index.unique())
    hidden_cells = triangular_mask(n_rows)
    plt.title(title)
    ax = sns.heatmap(
        frame,
        mask=hidden_cells,
        vmin=vmin,
        vmax=vmax,
        annot=True,
        cmap="YlGnBu",
        square=True,
    )
    ax.set(xlabel="", ylabel="")
    return ax

# Request duration

In [None]:
# Reshape the merged frame into long form: the rows of df are repeated once
# per app, each copy carrying that app's duration and its app id.
durations = [value for app_id in app_ids for value in df[f"duration_{app_id}"].to_list()]
walks = df["walks"].to_list() * len(app_ids)
walk_lengths = df["walk_length"].to_list() * len(app_ids)
styles = [app_id for app_id in app_ids for _ in range(len(df))]

duration_df = pd.DataFrame(
    {
        "walks": walks,
        "walk_length": walk_lengths,
        "duration": durations,
        "app_id": styles,
    }
)

In [None]:
# Request duration against the number of random walks, one dodged strip per app.
f, ax = plt.subplots(figsize=(8, 8))
ax = sns.stripplot(
    data=duration_df,
    x="walks",
    y="duration",
    hue="app_id",
    dodge=True,
    jitter=True,
    palette="Set1",
    alpha=.5,
    edgecolor="gray",
    ax=ax,
)
ax.set(xlabel="Number of random walks", ylabel="Request Duration (seconds)")
plt.show()

In [None]:
# Request duration against the length of the random walks, one dodged strip per app.
f, ax = plt.subplots(figsize=(8, 8))
ax = sns.stripplot(
    data=duration_df,
    x="walk_length",
    y="duration",
    hue="app_id",
    dodge=True,
    jitter=True,
    palette="Set1",
    alpha=.5,
    edgecolor="gray",
    ax=ax,
)
ax.set(xlabel="Length of random walks", ylabel="Request Duration (seconds)")
plt.show()

In [None]:
# Correlate the numeric columns only: `app_id` holds strings, which
# DataFrame.corr() rejects on pandas >= 2.0 (older versions dropped such
# columns silently, so the result is unchanged there). Correlation
# coefficients span [-1, 1], so the colour scale is widened from the heatmap
# default of [0, 1] to avoid clipping negative values.
heatmap(duration_df.select_dtypes(include="number").corr(), vmin=-1)

In [None]:
# Box plots of the request duration per app, faceted by number of walks (rows)
# and walk length (columns).
ax = sns.catplot(
    data=duration_df,
    kind="box",
    x="app_id",
    y="duration",
    row="walks",
    col="walk_length",
    palette="Set1",
)
# catplot returns a FacetGrid, so set() is applied through the grid rather than a single Axes.
ax.set(xlabel="", ylabel="Request Duration (s)")

# Number of common recommendations / Set overlap

In [None]:
# Share of commonly recommended tweets, ignoring rank order.
def set_overlap_per_row(row, app_base, app_variant):
    """Return the fraction of distinct base items that also appear in the variant.

    Args:
        row: Mapping-like row; ``row[app_base]`` and ``row[app_variant]`` are iterables of items.
        app_base: Key of the reference list.
        app_variant: Key of the list compared against the reference.

    Returns:
        ``|base & variant| / |base|`` computed on sets, or 0 when the base is empty.
    """
    base_items = set(row[app_base])
    variant_items = set(row[app_variant])
    if not base_items:
        return 0
    shared = base_items & variant_items
    return len(shared) / len(base_items)

def set_overlap(frame, column_prefix=""):
    """Average pairwise set overlap between the apps' recommendation lists.

    For every ordered (base, variant) pair drawn from the module-level
    ``app_ids``, ``set_overlap_per_row`` is evaluated on each row of ``frame``
    and the per-row values are averaged.

    Args:
        frame: DataFrame holding one ``{column_prefix}recommendations_{app_id}``
            column per entry of ``app_ids``.
        column_prefix: Prefix selecting the column family, e.g. ``"top_3_"``.

    Returns:
        DataFrame indexed by ``base`` with one column per ``variant`` holding
        the mean overlap. Values are NaN when ``frame`` has no rows.
    """
    # Iterate the rows of the `frame` argument (not the notebook-global `df`)
    # so that filtered subsets passed by the caller are honoured.
    rows = [row for _, row in frame.iterrows()]
    records = []

    for app_base in app_ids:
        base = f"{column_prefix}recommendations_{app_base}"

        for app_variant in app_ids:
            variant = f"{column_prefix}recommendations_{app_variant}"
            per_row = [set_overlap_per_row(row, base, variant) for row in rows]
            records.append(
                {
                    "base": app_base,
                    "variant": app_variant,
                    # A float Series averages to NaN (without raising) when there are no rows.
                    "set_overlap": pd.Series(per_row, dtype=float).mean(),
                }
            )

    set_overlap_df = pd.DataFrame(records, columns=["base", "variant", "set_overlap"])
    return set_overlap_df.pivot(index="base", columns="variant", values="set_overlap")

In [None]:
# Mean pairwise set overlap over the full recommendation lists, shown as a heatmap.
set_overlap_df = set_overlap(frame=df)
heatmap(frame=set_overlap_df, title="Set overlap (top 10)")

In [None]:
# Same comparison restricted to the "top_3_" recommendation columns.
set_overlap_df = set_overlap(frame=df, column_prefix="top_3_")
heatmap(frame=set_overlap_df, title="Set overlap (top 3)")

# Compare rankings

In [None]:
# Rank-biased overlap measures the similarity between two ranked lists.
def rank_biased_overlap_row(row, base_app, variant_app):
    """Return the rank-biased overlap of two ranked lists stored in one row.

    Args:
        row: Mapping-like row holding both lists.
        base_app: Key of the first ranked list.
        variant_app: Key of the second ranked list.

    Returns:
        The value of ``rbo.RankingSimilarity(...).rbo()`` for the two lists.
    """
    similarity = rbo.RankingSimilarity(row[base_app], row[variant_app])
    return similarity.rbo()


def rank_biased_overlap(frame, column_prefix=""):
    """Average pairwise rank-biased overlap between the apps' recommendation lists.

    For every ordered (base, variant) pair drawn from the module-level
    ``app_ids``, ``rank_biased_overlap_row`` is evaluated on each row of
    ``frame`` and the per-row values are averaged.

    Args:
        frame: DataFrame holding one ``{column_prefix}recommendations_{app_id}``
            column per entry of ``app_ids``.
        column_prefix: Prefix selecting the column family, e.g. ``"top_3_"``.

    Returns:
        DataFrame indexed by ``base`` with one column per ``variant`` holding
        the mean rank-biased overlap. Values are NaN when ``frame`` has no rows.
    """
    # Iterate the rows of the `frame` argument (not the notebook-global `df`)
    # so that filtered subsets passed by the caller are honoured.
    rows = [row for _, row in frame.iterrows()]
    records = []

    for app_base in app_ids:
        base = f"{column_prefix}recommendations_{app_base}"

        for app_variant in app_ids:
            variant = f"{column_prefix}recommendations_{app_variant}"
            per_row = [rank_biased_overlap_row(row, base, variant) for row in rows]
            records.append(
                {
                    "base": app_base,
                    "variant": app_variant,
                    # A float Series averages to NaN (without raising) when there are no rows.
                    "rank_biased_overlap": pd.Series(per_row, dtype=float).mean(),
                }
            )

    rank_biased_overlap_df = pd.DataFrame(
        records, columns=["base", "variant", "rank_biased_overlap"]
    )
    return rank_biased_overlap_df.pivot(
        index="base", columns="variant", values="rank_biased_overlap"
    )

In [None]:
# Mean pairwise rank-biased overlap over the full recommendation lists, shown as a heatmap.
rank_biased_overlap_df = rank_biased_overlap(frame=df)
heatmap(frame=rank_biased_overlap_df, title="Rank-biased overlap (top 10)")

In [None]:
# Same comparison restricted to the "top_3_" recommendation columns.
rank_biased_overlap_df = rank_biased_overlap(frame=df, column_prefix="top_3_")
heatmap(frame=rank_biased_overlap_df, title="Rank-biased overlap (top 3)")

In [None]:
# Walk over the (number of walks, walk length) grid and plot one heatmap per
# non-empty subset of df. The loop variables are deliberately not called
# `walks` / `walk_length`, so the `walks` list built for duration_df is not
# overwritten by an integer.
for n_walks in [100, 1000, 10000]:
    for length in [100, 1000, 10000]:
        print(f"Walks: {n_walks}, Length: {length}")
        frame = df[(df["walk_length"] == length) & (df["walks"] == n_walks)]
        if frame.empty:
            # No requests were recorded with this parameter combination.
            print("No rows for this combination, skipping.")
            continue
        rank_biased_overlap_df = rank_biased_overlap(frame)
        ax = heatmap(rank_biased_overlap_df, "Rank-biased overlap (top 10)")
        plt.show()

# Missing Recommendations

In [None]:
# Count the base recommendations that the variant does not contain (rank order is ignored).
def missing_recommendations_per_row(row, app_base, app_variant):
    """Return how many distinct base items are absent from the variant.

    Args:
        row: Mapping-like row; ``row[app_base]`` and ``row[app_variant]`` are iterables of items.
        app_base: Key of the reference list.
        app_variant: Key of the list compared against the reference.

    Returns:
        Size of the set difference ``base - variant``.
    """
    base_items = set(row[app_base])
    variant_items = set(row[app_variant])
    return len(base_items - variant_items)

def missing_recommendations(frame, column_prefix=""):
    """Average number of base recommendations missing from each variant.

    For every ordered (base, variant) pair drawn from the module-level
    ``app_ids``, ``missing_recommendations_per_row`` is evaluated on each row
    of ``frame`` and the per-row counts are averaged.

    Args:
        frame: DataFrame holding one ``{column_prefix}recommendations_{app_id}``
            column per entry of ``app_ids``.
        column_prefix: Prefix selecting the column family, e.g. ``"top_3_"``.

    Returns:
        DataFrame indexed by ``base`` with one column per ``variant`` holding
        the mean number of missing items. Values are NaN when ``frame`` has no rows.
    """
    # Iterate the rows of the `frame` argument (not the notebook-global `df`)
    # so that filtered subsets passed by the caller are honoured.
    rows = [row for _, row in frame.iterrows()]
    records = []

    for app_base in app_ids:
        base = f"{column_prefix}recommendations_{app_base}"

        for app_variant in app_ids:
            variant = f"{column_prefix}recommendations_{app_variant}"
            per_row = [missing_recommendations_per_row(row, base, variant) for row in rows]
            records.append(
                {
                    "base": app_base,
                    "variant": app_variant,
                    # A float Series averages to NaN (without raising) when there are no rows.
                    "missing_recommendations": pd.Series(per_row, dtype=float).mean(),
                }
            )

    missing_recommendations_df = pd.DataFrame(
        records, columns=["base", "variant", "missing_recommendations"]
    )
    return missing_recommendations_df.pivot(
        index="base", columns="variant", values="missing_recommendations"
    )

In [None]:
# Mean number of missing recommendations over the full lists; the colour scale is capped at 4.
missing_recommendations_df = missing_recommendations(frame=df)
heatmap(
    frame=missing_recommendations_df,
    title="Average missing recommendations (top 10)",
    vmax=4,
)

In [None]:
# Same comparison restricted to the "top_3_" recommendation columns; the colour scale is capped at 4.
missing_recommendations_df = missing_recommendations(frame=df, column_prefix="top_3_")
heatmap(
    frame=missing_recommendations_df,
    title="Average missing recommendations (top 3)",
    vmax=4,
)