In [52]:
import pandas as pd
import rbo
import seaborn as sns

In [53]:
# Parameters
# Sources of the baseline and the sampled recommendation data; both values are
# handed to pd.read_json in the data-import cell.
# NOTE(review): the None defaults look like placeholders that are overridden at run
# time (e.g. papermill-style parameter injection) — confirm how this notebook is run.
baseline_recommendations = None
sampled_recommendations = None

In [55]:
# Import data
# Load both recommendation dumps into DataFrames. Later cells expect each frame to
# provide a "user" column (merge key) plus "recommendations" and "duration" columns,
# which receive the "_base" / "_sampled" suffixes after the merge.
baseline_df = pd.read_json(baseline_recommendations)
sampled_df = pd.read_json(sampled_recommendations)

In [None]:
# Merge datasets: inner join on "user"; columns present in both frames are
# disambiguated with the "_base" / "_sampled" suffixes.
df = baseline_df.merge(
    sampled_df,
    how="inner",
    on="user",
    suffixes=["_base", "_sampled"],
)

# Sanity check: the merged frame must have exactly as many rows as the baseline,
# i.e. no baseline user was dropped by the inner join.
assert len(baseline_df) == len(df)

df.head()

In [None]:
# Data Enrichment
# Keep only the first three recommendations of every list. A plain slice already
# returns the whole list when it holds fewer than three items, so no length guard is
# needed; it also always yields a new list, so the top-3 columns never alias the
# list objects stored in the full recommendation columns.
df["top_3_base"] = df["recommendations_base"].apply(lambda recs: recs[:3])
df["top_3_sampled"] = df["recommendations_sampled"].apply(lambda recs: recs[:3])

# Request duration

In [None]:
# Baseline ("simple") request duration: print summary statistics including the
# 10th/25th/50th/75th/90th/99th percentiles, then draw a histogram (no KDE curve)
# with a rug plot of the individual observations.
# NOTE(review): sns.distplot (used throughout this notebook) is deprecated since
# seaborn 0.11 in favour of histplot/displot (+ rugplot) — confirm against the
# seaborn version this notebook is pinned to before migrating.
print(df["duration_base"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
sns.distplot(df["duration_base"], kde=False, rug=True)

In [None]:
# Sampled request duration: same summary as for the baseline — descriptive statistics
# with the 10th/25th/50th/75th/90th/99th percentiles, followed by a histogram
# (no KDE curve) with a rug plot.
print(df["duration_sampled"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
sns.distplot(df["duration_sampled"], kde=False, rug=True)

In [None]:
# Compare baseline ("simple") and sampled request durations: one observation per
# merged row, baseline on the x axis and sampled on the y axis.
# The trailing semicolon suppresses the textual repr of the returned plot object.
sns.jointplot(x="duration_base", y="duration_sampled", data=df);

# Number of recommendations

In [None]:
# Baseline ("simple") recommendations: length of each recommendation list
df["recommendations_base_count"] = df["recommendations_base"].apply(len)

# Report how many rows received no baseline recommendation at all, then the
# distribution of list lengths.
print(
    "Number of zero recommendations",
    df.loc[df["recommendations_base_count"] == 0, "user"].count(),
)
print(df["recommendations_base_count"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

sns.distplot(df["recommendations_base_count"], kde=False, rug=True)

In [None]:
# Sampled recommendations: length of each recommendation list
df["recommendations_sampled_count"] = df["recommendations_sampled"].apply(len)

# Report how many rows received no sampled recommendation at all, then the
# distribution of list lengths.
print(
    "Number of zero recommendations",
    df.loc[df["recommendations_sampled_count"] == 0, "user"].count(),
)
print(df["recommendations_sampled_count"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

sns.distplot(df["recommendations_sampled_count"], kde=False, rug=True)

# Number of common recommendations

In [None]:
# Compute common recommended tweets (without respecting rank order)
def set_overlap(row, base_col="recommendations_base", sampled_col="recommendations_sampled"):
    """Return the share of distinct baseline recommendations found in the sampled list.

    Both lists are compared as sets, so rank order and duplicates are ignored.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that holds both lists.
        base_col: Key of the baseline recommendation list.
        sampled_col: Key of the sampled recommendation list.

    Returns:
        float: ``|base ∩ sampled| / |base|``, or ``0.0`` when the baseline list is
        empty (avoids a division by zero and keeps the result type uniform).
    """
    base = set(row[base_col])
    sampled = set(row[sampled_col])
    if not base:
        return 0.0
    return len(base & sampled) / len(base)

# Evaluate set_overlap for every row (axis=1) and store the result as a new column,
# then print its summary statistics and plot a histogram (no KDE) with a rug plot.
df["set_overlap"] = df.apply(set_overlap, axis=1)
print(df["set_overlap"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
sns.distplot(df["set_overlap"], kde=False, rug=True)

In [None]:
# Compute common top 3 recommended tweets (without respecting rank order)
def set_overlap(row, base_col="top_3_base", sampled_col="top_3_sampled"):
    """Return the share of distinct top-3 baseline recommendations found in the sampled top 3.

    Both lists are compared as sets, so rank order and duplicates are ignored.

    NOTE(review): this definition rebinds the name ``set_overlap`` that an earlier
    cell defines for the full recommendation lists; only the default columns differ.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that holds both lists.
        base_col: Key of the baseline top-3 list.
        sampled_col: Key of the sampled top-3 list.

    Returns:
        float: ``|base ∩ sampled| / |base|``, or ``0.0`` when the baseline list is
        empty (avoids a division by zero and keeps the result type uniform).
    """
    base = set(row[base_col])
    sampled = set(row[sampled_col])
    if not base:
        return 0.0
    return len(base & sampled) / len(base)

# Evaluate the top-3 set_overlap for every row (axis=1) and store it as a new column,
# then print its summary statistics and plot a histogram (no KDE) with a rug plot.
df["set_overlap_top_3"] = df.apply(set_overlap, axis=1)
print(df["set_overlap_top_3"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
sns.distplot(df["set_overlap_top_3"], kde=False, rug=True)

# Number of missing recommendations

In [None]:
# Compute count of missing recommendations
def missing_recommendations(row):
    """Count the distinct baseline recommendations that are absent from the sampled list.

    Args:
        row: Mapping-like record with "recommendations_base" and
            "recommendations_sampled" lists.

    Returns:
        int: number of distinct baseline items not present in the sampled list.
    """
    baseline_items = set(row["recommendations_base"])
    sampled_items = set(row["recommendations_sampled"])
    # Items only in the baseline == baseline size minus the shared items.
    return len(baseline_items - sampled_items)

# Evaluate missing_recommendations for every row (axis=1) and store it as a new
# column, then print its summary statistics and plot a histogram with a rug plot.
df["missing_recommendations"] = df.apply(missing_recommendations, axis=1)
print(df["missing_recommendations"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
sns.distplot(df["missing_recommendations"], kde=False, rug=True)

# Compare rankings

In [None]:
# Compute rank biased overlap for similarity between ranked lists
def rank_biased_overlap(row):
    """Return the rank-biased overlap of a row's baseline and sampled recommendation lists.

    The score is delegated to ``rbo.RankingSimilarity(...).rbo()`` with that
    library's default arguments.

    Args:
        row: Mapping-like record with "recommendations_base" and
            "recommendations_sampled" ranked lists.

    Returns:
        Whatever ``RankingSimilarity.rbo()`` returns for the two lists.
    """
    return rbo.RankingSimilarity(
        row["recommendations_base"],
        row["recommendations_sampled"],
    ).rbo()
    
# Evaluate rank_biased_overlap for every row (axis=1) and store it as a new column,
# then print its summary statistics and plot a histogram (no KDE) with a rug plot.
df["rank_biased_overlap"] = df.apply(rank_biased_overlap, axis=1)
print(df["rank_biased_overlap"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
sns.distplot(df["rank_biased_overlap"], kde=False, rug=True)

In [None]:
# Number of baseline recommendations vs. Rank Biased Overlap (list order).
# Relies on the "recommendations_base_count" and "rank_biased_overlap" columns
# computed in earlier cells.
sns.jointplot(x="recommendations_base_count", y="rank_biased_overlap", data=df);

In [None]:
# Set overlap (common recommendations, order ignored) vs. Rank Biased Overlap
# (list order). Relies on the "set_overlap" and "rank_biased_overlap" columns
# computed in earlier cells.
sns.jointplot(x="set_overlap", y="rank_biased_overlap", data=df);

In [None]:
# Compute rank biased overlap for top 3 recommendations
def rank_biased_overlap(row):
    """Return the rank-biased overlap of a row's baseline and sampled top-3 lists.

    The score is delegated to ``rbo.RankingSimilarity(...).rbo()`` with that
    library's default arguments.

    Args:
        row: Mapping-like record with "top_3_base" and "top_3_sampled" ranked lists.

    Returns:
        Whatever ``RankingSimilarity.rbo()`` returns for the two lists.
    """
    similarity = rbo.RankingSimilarity(row["top_3_base"], row["top_3_sampled"])
    return similarity.rbo()

# Evaluate the top-3 rank_biased_overlap for every row (axis=1) and store it as a new
# column, then print its summary statistics and plot a histogram with a rug plot.
df["rank_biased_overlap_top_3"] = df.apply(rank_biased_overlap, axis=1)
print(df["rank_biased_overlap_top_3"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
sns.distplot(df["rank_biased_overlap_top_3"], kde=False, rug=True)

# Inspect Examples

In [None]:
# Rows whose baseline list holds more than one recommendation
# (uses the "recommendations_base_count" column computed in an earlier cell).
df.loc[df["recommendations_base_count"] > 1]