In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [None]:
# Smoothing comparison: one panel per theta, one curve per alpha, saved as a single PDF.
df = pd.read_csv("../results/smooth_comparison.csv")

thetas = [1, 10, 100, 500]
alphas = [0.0, 0.2, 0.4, 0.6, 0.8]
n_alphas = len(alphas)

fig, axis = plt.subplots(nrows=1, ncols=4, figsize=(18, 4))

# The table is consumed in consecutive groups of len(alphas) rows, one group per theta.
table = df.values
for panel, (ax, theta_val) in enumerate(zip(axis, thetas)):
    first_row = panel * n_alphas
    vals = table[first_row:first_row + n_alphas, :]
    for row_idx, alpha_val in enumerate(alphas):
        # Each row is exponentiated before being drawn.
        ax.plot(np.exp(vals[row_idx, :]), label=r"$\alpha$: {0}".format(alpha_val))
    ax.set_yscale("log")
    ax.set_title(r"$\theta$: {0}".format(theta_val), fontsize=20)
    ax.set_ylim(1e-5, 1)
    ax.set_xlabel(r"$r$", fontsize=18)

# A single shared legend, anchored below the row of panels via the third axis.
axis[2].legend(fontsize=18, ncol=len(alphas), loc="lower left", bbox_to_anchor=(-1.75, -0.4))

plt.savefig("../plots/smoothing_comparison.pdf", bbox_inches="tight")

In [None]:
# Shape of `vals` as left behind by the last iteration of the plotting loop in the previous cell.
vals.shape

# Frequency One Hash

In [None]:
# Load the frequency simulation results and keep only the rows of the two compared models.
df = pd.read_csv("../results/frequency_simulation_results.csv")
is_compared_model = df["Model"].isin(["DP", "NGG"])
df = df[is_compared_model]

In [None]:
# Display the frame loaded and filtered in the previous cell.
df

In [None]:
# Median of every remaining column per (DataGen, Params, Model) on the DataGen == "py" runs,
# printed as a LaTeX table.
med_df = df[df.DataGen == "py"].groupby(["DataGen", "Params", "Model"]).median().reset_index()

# Params is a JSON-encoded [theta, alpha] pair. Decode it once and reuse the result for both
# columns. `json` must be imported in the notebook's first cell for this to run on a fresh kernel.
parsed_params = med_df.Params.apply(json.loads)
med_df["theta"] = [pair[0] for pair in parsed_params]
med_df["alpha"] = [pair[1] for pair in parsed_params]

# Drop the columns that are not part of the printed table.
med_df = med_df.drop(["DataGen", "repnum", "Params"], axis=1)
columns = ["theta", "alpha", "Model", "(0, 1]", "(1, 4]", "(4, 16]", "(16, 64]", "(64, 256]", "(256, Inf]"]
med_df = med_df[columns]
print(med_df.to_latex(index=False, float_format="%.2f"))

In [None]:
# One figure per alpha: per-(Params, Model) medians of DP vs NGG plotted against theta,
# one panel per plotted column, each figure saved to its own PDF.
py_df = df[df.DataGen == "py"]
n_panels = 5

for alpha in [0.0, 0.25, 0.5, 0.75]:
    # Params strings to keep for this alpha, one per theta.
    p_list = ["[{0}, {1}]".format(a, alpha) for a in [1.0, 10.0, 100.0, 1000.0]]
    curr_df = py_df[py_df.Params.isin(p_list)]
    # DataGen is a string column that is not a grouping key here; numeric_only=True stops
    # median() from raising on it under pandas >= 2.0 (older versions dropped it silently).
    curr_df = curr_df.groupby(["Params", "Model"]).median(numeric_only=True).reset_index()
    curr_df["theta"] = [x[0] for x in curr_df.Params.apply(json.loads)]
    # Line plots need x in increasing order; do not rely on the string ordering of Params.
    curr_df = curr_df.sort_values("theta")

    fig, axis = plt.subplots(nrows=1, ncols=n_panels, figsize=(18, 3))

    for model in ["DP", "NGG"]:
        mdf = curr_df[curr_df["Model"] == model]
        for i in range(n_panels):
            # Columns 0 and 1 are the grouping keys (Params, Model); plotted columns start at 2.
            col_name = mdf.columns[i + 2]
            axis[i].plot(mdf.theta, mdf[col_name], label=model)
            axis[i].set_title(col_name, fontsize=20)
            axis[i].set_xscale("log")
            axis[i].set_ylim(1, 5000)

    # Label the x axis of every panel (all five, not only the first four).
    for i in range(n_panels):
        axis[i].set_xlabel(r"$\theta$", fontsize=18)

    axis[0].set_ylabel("MAE", fontsize=18)
    axis[2].legend(ncol=3, loc="lower left", bbox_to_anchor=(-0.2, -0.55), fontsize=18)
    plt.savefig("../plots/freq_py_alpha_{0}.pdf".format(alpha), bbox_inches="tight")

In [None]:
# Per-(Params, Model) medians on the DataGen == "zipf" runs, displayed for inspection.
curr_df = df[df.DataGen == "zipf"]
# DataGen is a string column that is not a grouping key here; numeric_only=True stops
# median() from raising on it under pandas >= 2.0 (older versions dropped it silently).
curr_df = curr_df.groupby(["Params", "Model"]).median(numeric_only=True).reset_index()
curr_df

In [None]:
# Zipf frequency results: per-(Params, Model) medians of DP vs NGG plotted against Params
# (x axis labelled c), one panel per plotted column.
curr_df = pd.read_csv("../results/zipf_frequency_simulation_results.csv")
curr_df = curr_df.groupby(["Params", "Model"]).median().reset_index()

fig, axis = plt.subplots(nrows=1, ncols=5, figsize=(18, 3))

for model in ["DP", "NGG"]:
    mdf = curr_df[curr_df["Model"] == model]
    for i in range(5):
        # Columns 0 and 1 are the grouping keys (Params, Model); plotted columns start at 2.
        col_name = mdf.columns[i + 2]
        axis[i].plot(mdf.Params, mdf[col_name], label=model)
        axis[i].set_title(col_name, fontsize=20)
        axis[i].set_ylim(1, 1500)

for i in range(5):
    axis[i].set_xlabel(r"$c$", fontsize=18)

axis[0].set_ylabel("MAE", fontsize=18)
axis[2].legend(ncol=3, loc="lower left", bbox_to_anchor=(-0.2, -0.5), fontsize=18)

# The output path has no placeholder, so it is passed as a plain string; the cell no longer
# needs an `alpha` variable to exist in the kernel.
plt.savefig("../plots/freq_zipf.pdf", bbox_inches="tight")

In [None]:
# Preview the first rows of the current median table.
med_df.head()

# Cardinality 1 Hash

In [None]:
# Load the cardinality plot data and display it.
df = pd.read_csv("../results/card_plot_df.csv")
df

In [None]:
# Largest value in the TrueK column of the current df.
df.TrueK.max()

In [None]:
import json
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Cardinality grid: one row of panels per theta, one column per alpha; each panel draws the
# TrueK, DP and NGG columns against the sample sizes in `ndata`.
df = pd.read_csv("../results/card_plot_df.csv")

# Params is a JSON-encoded [theta, alpha] pair.
decoded_params = df.Params.apply(json.loads)
df["theta"] = [pair[0] for pair in decoded_params]
df["alpha"] = [pair[1] for pair in decoded_params]

ndata = [100, 1000, 10000, 100000]
alphas = [0.0, 0.25, 0.5, 0.75]
thetas = [10.0, 100.0, 1000.0]
# Upper y-limit for each row of panels (one row per theta).
row_ymax = [3500, 7000, 12000]

fig, axis = plt.subplots(nrows=3, ncols=4, figsize=(18, 14))

for i, t in enumerate(thetas):
    for j, a in enumerate(alphas):
        panel = axis[i, j]
        # NOTE(review): rows are plotted against the hard-coded `ndata` values, which assumes
        # they come out of the CSV in that order - confirm.
        curr_df = df[(df.theta == t) & (df.alpha == a)]
        panel.plot(ndata, curr_df.TrueK, color="red", lw=3, label="True")
        for estimator in ("DP", "NGG"):
            panel.plot(ndata, curr_df[estimator], label=estimator)

        panel.set_ylim(-10, row_ymax[i])
        panel.set_xscale("log")
        panel.set_title("PY({0:.0f}, {1:.2f})".format(t, a), fontsize=20)

# NOTE(review): the y-label reads as an absolute error, while the curves drawn above are the
# TrueK / DP / NGG columns themselves - confirm which one is intended.
for left_panel in axis[:, 0]:
    left_panel.set_ylabel(r"$|\hat K - K^{true}|$", fontsize=18)
for panel in axis.flat:
    panel.set_xlabel("$n$", fontsize=18)

axis[2, 2].legend(loc="lower left", ncol=4, bbox_to_anchor=(-1, -0.4), fontsize=18)
plt.savefig("../plots/card.pdf", bbox_inches="tight")

In [None]:
# Load the cardinality simulation results and preview the first rows.
df = pd.read_csv("../results/card_simulation_results.csv")
df.head()

In [None]:
# LaTeX table of per-(DataGen, Params, Model) medians for the DataGen == "py" runs,
# restricted to the DP and NGG models.
df = df[df.Model.isin(["DP", "NGG"])]
med_df = (
    df[df.DataGen == "py"]
    .groupby(["DataGen", "Params", "Model"])
    .median()
    .reset_index()
)

# Params is a JSON-encoded [theta, alpha] pair.
theta_alpha = med_df.Params.apply(json.loads)
med_df["theta"] = [pair[0] for pair in theta_alpha]
med_df["alpha"] = [pair[1] for pair in theta_alpha]

med_df = med_df.drop(columns=["DataGen", "repnum", "Params"])
columns = ["theta", "alpha", "Model", "100", "1000", "10000", "100000"]
latex_table = med_df[columns].to_latex(index=False, float_format="%.2f")
print(latex_table)

In [None]:
# Per-(DataGen, Params, Model) medians of the DataGen == "zipf" cardinality results, displayed for inspection.
df = pd.read_csv("../results/zipf_card_simulation_results.csv")
med_df = df[df.DataGen == "zipf"].groupby(["DataGen", "Params", "Model"]).median().reset_index()
med_df

In [None]:
# LaTeX table of per-(DataGen, Params, Model) medians for the DataGen == "zipf" cardinality runs.
df = pd.read_csv("../results/zipf_card_simulation_results.csv")

zipf_rows = df[df.DataGen == "zipf"]
med_df = zipf_rows.groupby(["DataGen", "Params", "Model"]).median().reset_index()
# Params is kept as-is here (it is printed as the first table column).
med_df = med_df.drop(columns=["DataGen", "repnum"])

columns = ["Params", "Model", "100", "1000", "10000", "100000"]
latex_table = med_df[columns].to_latex(index=False, float_format="%.2f")
print(latex_table)

# Role of J

In [None]:
# Preview the first rows of whatever df currently holds.
df.head()

In [None]:
# Show rows 8-15 (by position) of the current df. The bare `df` on the first line is not the
# cell's last expression, so it displays nothing.
df
df.iloc[range(8, 16)]

In [None]:
# Load the J-effect frequency results and display rows 8-15 (by position).
df = pd.read_csv("../results/jeffect_freq_simulation_results.csv")
# Disabled alternative: group-wise medians instead of a positional row slice.
# med_df = df.groupby(["Params", "Model", "J"]).median().reset_index()
med_df = df.iloc[range(8, 16)]
med_df

In [None]:
# Effect of J on the frequency results: DP vs NGG against J for one (theta, alpha) setting,
# one panel per plotted column.
df = pd.read_csv("../results/jeffect_freq_simulation_results.csv")
# Only rows 16-31 (by position) are used.
med_df = df.iloc[range(16, 32)]

theta = 100
alpha = 0.75
# Same textual form as the Params column, e.g. "[100.0, 0.75]".
params = "[{0:.1f}, {1:.2f}]".format(theta, alpha)

fig, axis = plt.subplots(nrows=1, ncols=5, figsize=(18, 3))

curr_df = med_df[med_df.Params == params]
for model in ["DP", "NGG"]:
    mdf = curr_df[curr_df["Model"] == model]
    for i, panel in enumerate(axis):
        # Panel i shows the i-th column of the frame.
        col_name = mdf.columns[i]
        panel.plot(mdf.J, mdf[col_name], label=model)
        panel.set_title(col_name, fontsize=20)
        panel.set_ylim(10, 500)
        panel.set_xscale("log")

for panel in axis:
    panel.set_xlabel(r"$J$", fontsize=18)
axis[0].set_ylabel("MAE", fontsize=18)

axis[2].legend(ncol=3, loc="lower left", bbox_to_anchor=(-0.35, -0.55), fontsize=18)
plt.savefig("../plots/maes_jeffect_alpha_{0:.2f}.pdf".format(alpha), bbox_inches="tight")

In [None]:
# Per-(Params, Model) medians of the J-effect cardinality results, displayed for inspection.
df = pd.read_csv("../results/jeffect_card_simulation_results.csv")
med_df = df.groupby(["Params", "Model"]).median().reset_index()
med_df

In [None]:
# Display whatever curr_df currently holds (it is assigned in an earlier cell, not here).
curr_df

In [None]:
# Effect of J on the cardinality results: one curve per model for a single (theta, alpha) setting.
df = pd.read_csv("../results/jeffect_card_simulation_results.csv")
med_df = df.groupby(["Params", "Model"]).median().reset_index()

theta = 100
alpha = 0.75
# Same textual form as the Params column, e.g. "[100.0, 0.75]".
params = "[{0:.1f}, {1:.2f}]".format(theta, alpha)

Js = [10, 100, 1000, 10000]

curr_df = med_df[med_df.Params == params]
for model in ["DP", "NGG"]:
    model_rows = curr_df[curr_df.Model == model]
    # First matching row; the values after the two grouping keys are plotted against Js.
    plt.plot(Js, model_rows.values[0, 2:], label=model)

plt.xscale("log")
plt.legend(fontsize=18)

plt.savefig("../plots/card_jeffect_alpha_{0:.2f}.pdf".format(alpha), bbox_inches="tight")

# Frequency Multiview

In [None]:
# Load the three multiview result files (min, prod and cms) into separate frames.
df_min = pd.read_csv("../results/multiview_min_simulation_results.csv")
df_prod = pd.read_csv("../results/multiview_prod_simulation_results.csv")
df_cms = pd.read_csv("../results/multiview_cms_simulation_results.csv")

In [None]:
# Tag each frame with a Rule label so the rows stay distinguishable after concatenation.
df_min["Rule"] = "MIN"
df_prod["Rule"] = "PoE"
df_cms["Rule"] = "CMS"

In [None]:
# Stack the three tagged frames and take the median per (DataGen, Params, J, Rule, Model).
# After reset_index the five grouping keys are the first five columns.
df = pd.concat([df_min, df_prod, df_cms])
df = df.groupby(["DataGen", "Params", "J", "Rule", "Model"]).median().reset_index()

In [None]:
# Preview the first rows of the current df.
df.head()

In [None]:
# LaTeX table of the multiview medians per (DataGen, Params, J, Model, Rule).
stacked = pd.concat([df_min, df_prod, df_cms])
med_df = (
    stacked
    .groupby(["DataGen", "Params", "J", "Model", "Rule"])
    .median()
    .reset_index()
)

# Params is a JSON-encoded [theta, alpha] pair.
theta_alpha = med_df.Params.apply(json.loads)
med_df["theta"] = [pair[0] for pair in theta_alpha]
med_df["alpha"] = [pair[1] for pair in theta_alpha]
med_df = med_df.drop(columns=["DataGen", "repnum", "Params"])

columns = ["theta", "alpha", "J", "Model", "Rule", "(0, 1]", "(1, 4]", "(4, 16]", "(16, 64]",
           "(64, 256]", "(256, Inf]"]
latex_table = med_df[columns].to_latex(index=False, float_format="%.2f")
print(latex_table)

In [None]:
# Same LaTeX table, restricted to the rows with J == 1000 (uses med_df and columns from an earlier cell).
print(med_df[med_df.J == 1000][columns].to_latex(index=False, float_format="%.2f"))

In [None]:
# Multiview frequency figures: one figure per Params setting, one panel per plotted column.
# Each panel draws PoE (solid) and MIN (dashed) for every model, plus a single CMS curve, against J.
paramslist = [
    "[100.0, 0.25]",
    "[100.0, 0.75]"
]

# Defined here so the cell runs on a fresh kernel; these names were otherwise only created
# further down, in the Bigram section.
models = ["DP", "NGG"]
colors = ["steelblue", "orange"]

for pnum, params in enumerate(paramslist):
    fig, axis = plt.subplots(nrows=1, ncols=5, figsize=(18, 3))

    subdf = df[df.Params == params]
    # The per-rule subsets do not depend on the panel, so build them once per figure.
    poe_df = subdf[subdf.Rule == "PoE"]
    min_df = subdf[subdf.Rule == "MIN"]
    cms_df = subdf[subdf.Rule == "CMS"]

    for i in range(5):
        # Plotted columns start at index 5.
        # NOTE(review): assumes df is the frame grouped on five keys earlier in this section,
        # so that columns 0-4 are the grouping keys - confirm.
        col_name = subdf.columns[i + 5]

        for mod, col in zip(models, colors):
            currdf = poe_df[poe_df.Model == mod]
            axis[i].plot(currdf.J, currdf[col_name], color=col, label="PoE - {0}".format(mod))

        for mod, col in zip(models, colors):
            currdf = min_df[min_df.Model == mod]
            axis[i].plot(currdf.J, currdf[col_name], "--", color=col, label="MIN - {0}".format(mod))

        axis[i].plot(cms_df.J, cms_df[col_name], label="CMS", color="purple")

        axis[i].set_title(col_name, fontsize=20)
        axis[i].set_xlabel(r"$J$", fontsize=18)

    axis[0].set_ylabel("MAE", fontsize=18)

    axis[2].legend(loc="lower center", ncol=5, bbox_to_anchor=(0.5,-0.55), fontsize=18)
    plt.savefig("../plots/multiview_{0}.pdf".format(pnum + 1), bbox_inches="tight")

In [None]:
# LaTeX table of the grouped multiview frame, with Params decoded into theta/alpha columns.
# The table is built in a separate variable so the cell can be re-run: overwriting df would
# remove Params and make a second run fail.
param_pairs = df.Params.apply(json.loads)
multiview_table = df.assign(
    theta=[pair[0] for pair in param_pairs],
    alpha=[pair[1] for pair in param_pairs],
)
multiview_table = multiview_table.drop(["DataGen", "repnum", "Params"], axis=1)

# The aggregation-rule column is named "Rule" (set when the three result files are tagged);
# "Model" is included so DP and NGG rows stay distinguishable.
columns = ["theta", "alpha", "J", "Model", "Rule", "(0, 1]", "(1, 4]", "(4, 16]", "(16, 64]", "(64, 256]", "(256, Inf]"]
multiview_table = multiview_table[columns]
print(multiview_table.to_latex(index=False, float_format="%.2f"))

# Cardinality Multi Hash

In [None]:
# Load the multiview cardinality simulation results.
df = pd.read_csv("../results/multiview_card_simulation_results.csv")

In [None]:
# Median per (ndata, PY_THETA, PY_ALPHA, model, mean_fn); after reset_index these five keys are the first five columns.
df = df.groupby(["ndata", "PY_THETA", "PY_ALPHA", "model", "mean_fn"]).median().reset_index()

In [None]:
# Multi-hash cardinality: one panel per (theta, alpha) setting at ndata == 250000, with one
# curve per (model, mean function) combination.
fig, axis = plt.subplots(nrows=1, ncols=4, figsize=(18, 3))

plot_params = [(100, 0.25), (100, 0.75), (1000, 0.25), (1000, 0.75)]
j_values = [50, 100, 500, 1000]

# Colour encodes the model, line style encodes the mean function.
curve_styles = [
    (model, color, mean, ls)
    for model, color in zip(["DP", "NGG"], ["steelblue", "orange"])
    for mean, ls in zip(["avg", "geom"], ["--", ":"])
]

for panel, (theta, alpha) in zip(axis, plot_params):
    pdf = df[(df.PY_THETA == theta) & (df.PY_ALPHA == alpha) & (df.ndata == 250000)]
    for model, color, mean, ls in curve_styles:
        curr_df = pdf[(pdf.model == model) & (pdf.mean_fn == mean)]
        # First matching row; columns 5..8 (after the five grouping keys) are plotted against j_values.
        panel.plot(j_values, curr_df.values[0, 5:9].astype(float),
                   color=color, linestyle=ls, label="{0}-{1}".format(model, mean))

for panel, setting in zip(axis, plot_params):
    panel.set_title("PYP({0}, {1})".format(*setting), fontsize=20)
    panel.set_xlabel("J", fontsize=18)

axis[0].set_ylabel(r"$|K_n - \hat K|$", fontsize=18)
axis[2].legend(loc="lower center", ncol=5, bbox_to_anchor=(-0.2,-0.55), fontsize=18)

# Bigram

In [None]:
# Bigram frequency figure: one panel per plotted column; each panel draws PoE (solid) and
# MIN (dashed) for every model, plus a single CMS curve, against J.
df = pd.read_csv("../results/bigram_maes.csv")

models = ["DP", "NGG"]
colors = ["steelblue", "orange"]

# Row subsets per rule; they are the same for every panel.
poe_df = df[df.Rule == "PoE"]
min_df = df[df.Rule == "MIN"]
cms_df = df[df.Rule == "CMS"]

fig, axis = plt.subplots(nrows=1, ncols=5, figsize=(18, 3))

for i, panel in enumerate(axis):
    # Panel i shows the i-th column of the frame.
    col_name = df.columns[i]

    for mod, col in zip(models, colors):
        currdf = poe_df[poe_df.Model == mod]
        panel.plot(currdf.J, currdf[col_name], color=col, label="PoE - {0}".format(mod))

    for mod, col in zip(models, colors):
        currdf = min_df[min_df.Model == mod]
        panel.plot(currdf.J, currdf[col_name], "--", color=col, label="MIN - {0}".format(mod))

    panel.plot(cms_df.J, cms_df[col_name], label="CMS", color="purple")

    panel.set_title(col_name, fontsize=20)
    panel.set_xlabel(r"$J$", fontsize=18)

axis[0].set_ylabel("MAE", fontsize=18)

axis[2].legend(loc="lower center", ncol=5, bbox_to_anchor=(0.5,-0.55), fontsize=18)
plt.savefig("../plots/bigrams_freq.pdf", bbox_inches="tight")

plt.show()

In [None]:
# Bigrams: the true_k, DP and NGG columns plotted against ndata on a log x axis.
df = pd.read_csv("../results/bigram_cardinality.csv")

# (column, plot keyword arguments) for each curve, in drawing order.
curves = [
    ("true_k", dict(linewidth=3, label="True", color="red")),
    ("DP", dict(label="DP")),
    ("NGG", dict(label="NGG")),
]
for column, style in curves:
    plt.plot(df.ndata, df[column], **style)

plt.xscale("log")
# One tick per observed sample size, labelled with the value itself.
plt.xticks(df.ndata, df.ndata)
plt.xlabel("n", fontsize=18)
plt.legend(loc="lower left", ncol=3, fontsize=18, bbox_to_anchor=(0.0, -0.30))
plt.title("Bigrams - True and Estimated $K_n$", fontsize=20)
plt.savefig("../plots/bigrams_kn.pdf", bbox_inches="tight")

In [None]:
# Reload the bigram cardinality results and display the whole frame.
df = pd.read_csv("../results/bigram_cardinality.csv")
df

# DNA

In [None]:
# Load the DNA results file and preview the first rows.
df = pd.read_csv("../results/dna_maes.csv")
df.head()

In [None]:
# DNA frequency figure: one panel per plotted column; PoE curves are solid, MIN curves are
# dashed, one colour per model. The CMS series is not drawn in this figure.
df = pd.read_csv("../results/dna_maes.csv")

fig, axis = plt.subplots(nrows=1, ncols=5, figsize=(18, 3))

models = ["DP", "NGG"]
colors = ["steelblue", "orange"]

# (rule, extra positional format arguments): PoE uses the default line style, MIN is dashed.
rule_formats = [("PoE", ()), ("MIN", ("--",))]

for rule, fmt_args in rule_formats:
    rule_df = df[df.Rule == rule]
    for mod, col in zip(models, colors):
        currdf = rule_df[rule_df.Model == mod]
        for i in range(5):
            # Panel i shows the i-th column of the frame.
            axis[i].plot(currdf.J, currdf[currdf.columns[i]], *fmt_args,
                         color=col, label="{0} - {1}".format(rule, mod))

for i, panel in enumerate(axis):
    panel.set_title(df.columns[i], fontsize=20)
    panel.set_xlabel(r"$J$", fontsize=18)

axis[0].set_ylabel("MAE", fontsize=18)

axis[2].legend(loc="lower center", ncol=5, bbox_to_anchor=(0.5,-0.55), fontsize=18)
plt.savefig("../plots/dna_freq.pdf", bbox_inches="tight")

plt.show()

CMS > [200, 300, 1000, 5000, 10000], depending on the buckets

In [None]:
# DNA: the true_k, DP and NGG columns plotted against ndata on a log x axis.
df = pd.read_csv("../results/dna_cardinality.csv")

# (column, plot keyword arguments) for each curve, in drawing order.
curves = [
    ("true_k", dict(linewidth=3, label="True", color="red")),
    ("DP", dict(label="DP")),
    ("NGG", dict(label="NGG")),
]
for column, style in curves:
    plt.plot(df.ndata, df[column], **style)

plt.xscale("log")
plt.xlabel("n", fontsize=18)
plt.legend(loc="lower left", ncol=3, fontsize=18, bbox_to_anchor=(0.0, -0.30))
plt.title("DNA - True and Estimated $K_n$", fontsize=20)
plt.savefig("../plots/dna_kn.pdf", bbox_inches="tight")