In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import scipy.stats


df = pd.read_csv('TuberWeights_13.07.18.csv')
df_soraya = df.loc[df['Variety'] == 'Soraya'].copy()
df_venezia = df.loc[df['Variety'] == 'Venezia'].copy()

df_soraya = df_soraya[['MidSize', 'TuberWeight', 'CubeRoot']]
df_venezia = df_venezia[['MidSize', 'TuberWeight', 'CubeRoot']]

def extract_samples(df, elem='TuberWeight'):
    samples = {}
    for mid_size, tuber_weight in zip(df['MidSize'], df[elem]):
        if mid_size in samples:
            samples[mid_size].append(tuber_weight)
        else:
            samples[mid_size] = [tuber_weight]
    return samples

samples_soraya_w = extract_samples(df_soraya)
samples_soraya_cr = extract_samples(df_soraya, 'CubeRoot')

samples_venezia_w = extract_samples(df_venezia)
samples_venezia_cr = extract_samples(df_venezia, 'CubeRoot')
        
true_dist_soraya = [(x, len(samples_soraya_w[x])) for x in samples_soraya_w]
true_dist_venezia = [(x, len(samples_venezia_w[x])) for x in samples_venezia_w]


In [54]:
def create_batches(list_in, alpha=0, theta=1):
    current_index = 0
    batch_list = []
    
    assert (0 <= alpha and alpha < 1) and theta > -alpha
    for n, weight in enumerate(list_in):
        p = np.random.binomial(1, p=(theta+len(batch_list)*alpha)/(n+theta))
        if(p == 1):
            batch_list.append([weight])
        else:
            probabilities = np.asarray([len(x)-alpha for x in batch_list], dtype='float32')
            probabilities = probabilities/probabilities.sum()
            table = np.random.choice(range(len(batch_list)), 1, p=probabilities)[0]
            batch_list[table].append(weight)
    batch_list = [(len(x), np.sum(x)) for x in batch_list]
    return batch_list

# print(create_batches(samples_soraya_w[17.5], alpha=0.2))

band_to_samples_soraya_w = dict([(x, create_batches(samples_soraya_w[x], alpha=0.2)) for x in samples_soraya_w])
band_to_samples_soraya_cr = dict([(x, create_batches(samples_soraya_cr[x], alpha=0.2)) for x in samples_soraya_cr])

band_to_samples_venezia_w = dict([(x, create_batches(samples_venezia_w[x], alpha=0.2)) for x in samples_venezia_w])
band_to_samples_venezia_cr = dict([(x, create_batches(samples_venezia_cr[x], alpha=0.2)) for x in samples_venezia_cr])


In [55]:
def pi_estimate(band_to_samples, smoothing=0):
    pi_sum = {}
    num_potatoes = 0
    for band in band_to_samples:
        total_sum = 0
        for (batch_size, batch_weight) in band_to_samples[band]:
            total_sum += batch_size
        pi_sum[band] = total_sum+smoothing
        num_potatoes += total_sum+smoothing
    pi_dist = {}
    for band in pi_sum:
        pi_dist[band] = pi_sum[band]/num_potatoes
    return pi_dist

def estimate_mu(band_to_samples):
    mu = {}
    for band in band_to_samples:
        total_weight = 0
        total_potatoes = 0
        for (batch_size, batch_weight) in band_to_samples[band]:
            total_weight += batch_weight
            total_potatoes += batch_size
        mu[band] = total_weight/total_potatoes
    return mu

def estimate_var(band_to_samples, mu_estimate):
    var = {}
    for band in band_to_samples:
        square_diff_sum = 0 
        for (batch_size, batch_weight) in band_to_samples[band]:
            square_diff_sum += (batch_weight - batch_size*mu_estimate[band])**2/batch_size
        var[band] = max(square_diff_sum/len(band_to_samples[band]), 1e-2)
    return var


def get_posterior(weight, band_to_samples, pi_estimate, mu_estimate, var_estimate):
    post_prop = {}
    sum_post = 0
    post = {}
    for band in band_to_samples:
        norm_dist = scipy.stats.norm(mu_estimate[band], var_estimate[band]**(1/2))
        post_prop[band] = norm_dist.pdf(weight)*pi_estimate[band]
        sum_post += post_prop[band]

    for band in post_prop:
        post[band] = post_prop[band]/sum_post
    return post

def get_argmax(posterior):
    key_max = None
    max_value = 0.0
    for band in posterior:
        if posterior[band] > max_value:
            max_value = posterior[band]
            key_max = band
    return key_max

def kl(p, q):
    """Kullback-Leibler divergence D(P || Q) for discrete distributions
    Parameters
    ----------
    p, q : array-like, dtype=float, shape=n
    Discrete probability distributions.
    """
    p = np.asarray(p, dtype=np.float)
    q = np.asarray(q, dtype=np.float)

    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

def calculate_KL(y_pred, y_true):
    y_pred = dict(y_pred)
    y_true = dict(y_true)
    
    y_pred_dist = []
    y_true_dist = []
    
    for i in y_pred:
        y_pred_dist.append(y_pred[i])
        y_true_dist.append(y_true[i])
        
    y_pred_dist = np.asarray(y_pred_dist)
    y_pred_dist = y_pred_dist/y_pred_dist.sum()
    
    y_true_dist = np.asarray(y_true_dist)
    y_true_dist = y_true_dist/y_true_dist.sum()
    
    return kl(y_true_dist, y_pred_dist)

def calculate_W(y_pred, y_true):
    y_pred = dict(y_pred)
    y_true = dict(y_true)
    
    y_pred_dist = []
    y_true_dist = []
    
    for i in y_pred:
        y_pred_dist.append(y_pred[i])
        y_true_dist.append(y_true[i])
        
    y_pred_dist = np.asarray(y_pred_dist)
    y_pred_dist = y_pred_dist/y_pred_dist.sum()
    
    y_true_dist = np.asarray(y_true_dist)
    y_true_dist = y_true_dist/y_true_dist.sum()
    
    return scipy.stats.wasserstein_distance(y_true_dist, y_pred_dist)



In [56]:
def get_pseudo_counts(band_to_samples, count_df):
    pi = pi_estimate(band_to_samples, smoothing=0)
    mu = estimate_mu(band_to_samples)
    var = estimate_var(band_to_samples, mu)
    
    potato_counts = {}

    for weight in count_df:
        posterior = get_posterior(weight, band_to_samples, pi, mu, var)
        for band in posterior:
            if band in potato_counts:
                potato_counts[band]+= posterior[band]
            else:
                potato_counts[band] = posterior[band]
    return potato_counts

In [57]:
pseudo_v_w = get_pseudo_counts(band_to_samples_venezia_w, df_venezia['TuberWeight'])
pseudo_v_cr = get_pseudo_counts(band_to_samples_venezia_cr, df_venezia['CubeRoot'])

print('True Distribution : \n', dict(true_dist_venezia))
print()
print('Estimated distribution based on TuborWeight :\n', pseudo_v_w)
print('KL Divergence of categorical distribution :\n', calculate_KL(pseudo_v_w, true_dist_venezia))
print('W Distance of categorical distribution :\n', calculate_W(pseudo_v_w, true_dist_venezia))
print()
print('Estimated distribution based on CubeRoot :\n', pseudo_v_cr)
print('KL Divergence of categorical distribution :\n', calculate_KL(pseudo_v_cr, true_dist_venezia))
print('W Distance of categorical distribution :\n', calculate_W(pseudo_v_cr, true_dist_venezia))

True Distribution : 
 {17.5: 28, 22.5: 59, 27.5: 72, 32.5: 54, 37.5: 63, 42.5: 25, 47.5: 2}

Estimated distribution based on TuborWeight :
 {17.5: 26.870576673341414, 22.5: 61.023965772208626, 27.5: 76.32950092256951, 32.5: 41.10413114277664, 37.5: 70.4808530211447, 42.5: 27.190972467959096, 47.5: 1.6143638852873784e-150}
KL Divergence of categorical distribution :
 2.2829322359399264
W Distance of categorical distribution :
 0.014808954633912543

Estimated distribution based on CubeRoot :
 {17.5: 30.873252520808702, 22.5: 53.746416596272155, 27.5: 80.4873216877555, 32.5: 42.71441158231843, 37.5: 66.67151183037393, 42.5: 26.562363908785354, 47.5: 1.9447218736859675}
KL Divergence of categorical distribution :
 0.007842190690132363
W Distance of categorical distribution :
 0.01564776044104051


In [66]:
for x in pseudo_v_w:
    print(round(pseudo_v_cr[x],2))

30.87
53.75
80.49
42.71
66.67
26.56
1.94


In [59]:
pseudo_s_w = get_pseudo_counts(band_to_samples_soraya_w, df_soraya['TuberWeight'])
pseudo_s_cr = get_pseudo_counts(band_to_samples_soraya_cr, df_soraya['CubeRoot'])

print('True Distribution : \n', dict(true_dist_soraya))
print()
print('Estimated distribution based on TuborWeight :\n', pseudo_s_w)
print('KL Divergence of categorical distribution :\n', calculate_KL(pseudo_s_w, true_dist_soraya))
print('W Distance of categorical distribution :\n', calculate_W(pseudo_s_w, true_dist_soraya))

print()
print('Estimated distribution based on CubeRoot :\n', pseudo_s_cr)
print('KL Divergence of categorical distribution :\n', calculate_KL(pseudo_s_cr, true_dist_soraya))
print('W Distance of categorical distribution :\n', calculate_W(pseudo_s_cr, true_dist_soraya))


True Distribution : 
 {17.5: 11, 22.5: 3, 27.5: 2, 32.5: 3, 37.5: 13, 42.5: 16, 47.5: 37, 52.5: 46, 57.5: 22, 62.5: 11, 67.5: 1}

Estimated distribution based on TuborWeight :
 {17.5: 10.759406082964086, 22.5: 3.2509340202938404, 27.5: 1.500107259585127, 32.5: 3.9491272247045384, 37.5: 14.207404274656588, 42.5: 15.353980196160027, 47.5: 34.22395247551483, 52.5: 44.31947870555991, 57.5: 24.74723000928103, 62.5: 11.689772407610773, 67.5: 0.998607343669275}
KL Divergence of categorical distribution :
 0.003630350777927744
W Distance of categorical distribution :
 0.0064401850540460204

Estimated distribution based on CubeRoot :
 {17.5: 9.341209068777237, 22.5: 4.6420718632899955, 27.5: 2.0169878797348617, 32.5: 3.920397409488778, 37.5: 13.24101268605029, 42.5: 15.045432193586851, 47.5: 41.87506031067342, 52.5: 35.97947366089835, 57.5: 25.632384600454095, 62.5: 12.243060596946329, 67.5: 1.0629097300998527}
KL Divergence of categorical distribution :
 0.015368694943750988
W Distance of cate

9.34
4.64
2.02
3.92
13.24
15.05
41.88
35.98
25.63
12.24
1.06
