In [1]:
import numpy as np
from scipy.stats import ks_2samp, ks_1samp, kstest
import json
import pandas as pd
import matplotlib.pyplot as plt
from random import choices
import os

In [2]:
import re
def match_regex_array(files, regex):
    """Return "./sim_results/"-prefixed paths for the names in ``files`` matching ``regex``.

    ``re.findall(regex, name)`` is evaluated for every name. Names without any
    match are dropped; for the others only the first match is kept and prefixed
    with "./sim_results/". The order of ``files`` is preserved.
    """
    matched_paths = []
    for file_name in files:
        hits = re.findall(regex, file_name)
        if hits:
            # Only the first match of each name is used.
            matched_paths.append("./sim_results/" + hits[0])
    return matched_paths

In [5]:
def custom_kstest(ref_dist, test_dist, population, N_bots = 100,
                  ref_size=10**6, test_size=10**3, alpha=0.05):
    """Repeated two-sample KS comparison between two weighted distributions.

    One large reference sample is drawn from ``population`` with weights
    ``ref_dist``. Then, ``N_bots`` times, a small sample is drawn with weights
    ``test_dist`` and compared against the reference sample with
    ``scipy.stats.ks_2samp``.

    Parameters
    ----------
    ref_dist, test_dist : sequence of float
        Sampling weights passed to ``random.choices`` (one per population element).
    population : sequence
        Values to sample from.
    N_bots : int, default 100
        Number of small test samples (KS tests) to run.
    ref_size : int, default 10**6
        Size of the single reference sample.
    test_size : int, default 10**3
        Size of each test sample.
    alpha : float, default 0.05
        A test counts as a rejection when its p-value is strictly below ``alpha``.

    Returns
    -------
    tuple
        ``(mean p-value over the N_bots tests, fraction of tests with p < alpha)``.

    Raises
    ------
    ValueError
        If ``N_bots`` is smaller than 1 (there would be nothing to average).
    """
    if N_bots < 1:
        raise ValueError("N_bots must be at least 1, got {}".format(N_bots))

    # The reference sample is drawn once and reused for every comparison.
    ref_dist_large_samples = choices(population, ref_dist, k=ref_size)
    p_values = np.array([
        ks_2samp(ref_dist_large_samples, choices(population, test_dist, k=test_size)).pvalue
        for _ in range(N_bots)
    ])
    p_values_mean = np.mean(p_values)
    # count_nonzero returns a plain int, so the proportion stays a Python float.
    rejection_proportion = np.count_nonzero(p_values < alpha) / len(p_values)
    return p_values_mean, rejection_proportion

In [6]:
# Load one reference simulation result per dataset. Context managers make sure the
# file handles are closed as soon as the JSON has been parsed.
with open("./sim_results/sim_results_Ulastfm_pop0_a0.5_N2_C5_CPtop_Q0.5_L40.json") as lastfm_file:
    data_lastfm = json.load(lastfm_file)
lastfm_size = len(data_lastfm['p0'])
with open("./sim_results/sim_results_Umovielens1k_pop0_a0.5_N2_C5_CPtop_Q0.5_L1.json") as movielens_file:
    data_movielens = json.load(movielens_file)
movielens_size = len(data_movielens['p0'])
# The sampling populations are the integer indices 0..size-1, where size is the
# length of each file's 'p0' entry.
population_movielens = np.arange(movielens_size)
population_lastfm = np.arange(lastfm_size)

In [7]:
# Consistency check of custom_kstest on the MovieLens run. The reference is always
# pi_bs: testing small samples of pi_bs against it should give a high p-value,
# while testing small samples of pi_final_NA should give a low one.
consistency_cases = (
    ("KStest between million samples bs and a small sample bs (expected high p-value)", 'pi_bs'),
    ("KStest between million samples bs and a small sample NA (expected low p-value)", 'pi_final_NA'),
)
for banner, test_key in consistency_cases:
    print(banner)
    print(custom_kstest(data_movielens['pi_bs'], data_movielens[test_key], population_movielens))

KStest between million samples bs and a small sample bs (expected high p-value)
(0.537764559680241, 0.06)
KStest between million samples bs and a small sample NA (expected low p-value)
(3.6308359886988073e-10, 1.0)


In [17]:
def kstest_file(filename):
    """Run the baseline KS check for one simulation file and save a text summary.

    Loads the JSON at ``filename`` and calls ``custom_kstest`` with the file's
    'pi_bs' entry as both reference and test weights, over the population
    ``0..len(DATA['p0'])-1``. The mean p-value and rejection proportion are then
    written to "./kstests/baseline/<last path component of filename>".

    This is best-effort: any exception is reported with ``print`` and swallowed,
    so a batch loop over many files keeps going. Always returns ``None``.
    """
    output_dir = "./kstests/baseline/"
    try:
        with open(filename, 'r') as f:
            DATA = json.load(f)
        population = np.arange(len(DATA['p0']))
        # Compute first: if the test fails, no empty/truncated result file is left behind.
        p_values_mean, rejection_proportion = custom_kstest(DATA['pi_bs'], DATA['pi_bs'], population)
        # Create the output directory on first use instead of failing on every file.
        os.makedirs(output_dir, exist_ok=True)
        # The context manager closes the result file even if the write raises.
        with open(output_dir + filename.split("/")[-1], "w") as file_result:
            file_result.write("p_values_mean: {}, rejection proportion: {}".format(p_values_mean, rejection_proportion))
    except Exception as e:
        # Name the offending file so a failure in a long batch can be traced.
        print("kstest_file failed for {}: {}".format(filename, e))

In [None]:
files = os.listdir('./sim_results/')
movielens_simulations_files = match_regex_array(files, r'sim_results_Umovielens.*')
lastfm_simulations_files = match_regex_array(files, r'sim_results_Ulastfm.*')
ks_movielens = []
# The progress denominator is the number of files to process. It was previously
# movielens_size, which is the length of 'p0' in one file and unrelated to the
# number of simulation files.
end = len(movielens_simulations_files)
for i, filename in enumerate(movielens_simulations_files):
    print("{}/{}".format(i, end))
    kstest_file(filename)

0/1060
1/1060
2/1060
3/1060
4/1060
5/1060
6/1060
7/1060
8/1060
9/1060
10/1060
11/1060
12/1060
13/1060
14/1060
15/1060
16/1060


In [139]:
from scipy import stats

# Frozen exponential distribution with rate 0.247 (SciPy takes scale = 1 / rate).
rv = stats.expon(scale=1 / 0.247)
# CDF of that distribution evaluated at 0.7; as the last expression it is the cell output.
rv.cdf(0.7)

0.1587782671880641

In [178]:
# One million draws from the exponential distribution with scale 1/0.247.
exp_samples = stats.expon.rvs(scale=1/0.247, size=10**6)

In [179]:
# One-sample KS test of the exponential draws against the CDF of the same
# exponential distribution; args supplies (loc, scale) to stats.expon.cdf.
ks_1samp(exp_samples, stats.expon.cdf, args=(0, 1/0.247))

KstestResult(statistic=0.000453357937850285, pvalue=0.9862728719610566)

In [181]:
# The KS test does not seem to work well with a discrete CDF: the p-value obtained
# with this CDF is not high, although it is expected to be.
# NOTE(review): SciPy's kstest documentation states the test is valid only for
# continuous distributions, which would explain this — confirm.
def cdf_(X):
    """CDF of the discrete distribution ``data_movielens['pi_bs']`` over indices 0..n-1.

    Returns P(index <= x), i.e. the sum of the weights of all indices up to and
    including ``floor(x)``. Values below 0 map to 0.0 and values at or beyond the
    last index map to the total weight.

    Parameters
    ----------
    X : int, float, numpy scalar or array-like
        Point(s) at which to evaluate the CDF.

    Returns
    -------
    float or numpy.ndarray
        A Python float for scalar input, otherwise an array shaped like ``X``.
    """
    pi_bs = np.asarray(data_movielens['pi_bs'], dtype=float)
    # cumulative[k] is the total weight of the first k indices (cumulative[0] == 0).
    # It is computed once per call instead of re-summing a prefix for every sample.
    cumulative = np.concatenate(([0.0], np.cumsum(pi_bs)))
    # floor(x) + 1 indices are <= x (inclusive upper bound: a CDF is P(X <= x)).
    # Clipping handles x < 0 and x beyond the last index.
    counts = np.clip(np.floor(np.asarray(X, dtype=float)).astype(int) + 1, 0, len(pi_bs))
    r = cumulative[counts]
    return float(r) if r.ndim == 0 else r
# Draw one million indices from the MovieLens population, weighted by pi_bs, and
# run a one-sample KS test of those draws against cdf_.
population = np.arange(movielens_size)
samples = choices(population, weights=data_movielens['pi_bs'], k=10**6)
ks_1samp(samples, cdf_)