In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy.stats as ss
from scipy.interpolate import interp1d
from fractions import Fraction
from empiricaldist import Pmf, Cdf
import matplotlib.pyplot as plt

In [2]:
def read_data(filename):
    """Load one CSV file from the ``data`` folder and return it transposed.

    The file is looked up under ``<current working directory>/data``.
    The first column becomes the index and the second physical line of the
    file is skipped while parsing.

    Parameters
    ----------
    filename : str or path-like
        Name of the CSV file inside the ``data`` folder.

    Returns
    -------
    pandas.DataFrame
        The parsed table with every row containing a missing value removed,
        then transposed (rows of the file become columns).
    """
    csv_path = Path.cwd().joinpath('data', filename)
    raw = pd.read_csv(csv_path, skiprows=[1], index_col=0)
    # Drop incomplete rows *before* transposing, exactly as the file layout requires.
    complete_rows = raw.dropna()
    return complete_rows.transpose()

In [3]:
# Load the two showcase files with read_data; the file names carry the
# years 2011 and 2012.
df2011 = read_data('showcases.2011.csv')
df2012 = read_data('showcases.2012.csv')

In [4]:
# Stack both frames row-wise; ignore_index=True discards the original row
# labels and renumbers the combined rows from 0.
df = pd.concat([df2011, df2012], ignore_index=True)
# Last expression: preview the first rows as the cell output.
df.head()

Unnamed: 0,Showcase 1,Showcase 2,Bid 1,Bid 2,Difference 1,Difference 2
0,50969.0,45429.0,42000.0,34000.0,8969.0,11429.0
1,21901.0,34061.0,14000.0,59900.0,7901.0,-25839.0
2,32815.0,53186.0,32000.0,45000.0,815.0,8186.0
3,44432.0,31428.0,27000.0,38000.0,17432.0,-6572.0
4,24273.0,22320.0,18750.0,23000.0,5523.0,-680.0


In [5]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(313, 6)

In [6]:
def kde_from_sample(sample, qs):
    """Build a normalized Pmf from a Gaussian KDE of ``sample``.

    Parameters
    ----------
    sample : array-like
        Observations used to fit ``scipy.stats.gaussian_kde``.
    qs : array-like
        Points at which the fitted density is evaluated; they become the
        quantities of the returned Pmf.

    Returns
    -------
    Pmf
        Pmf over ``qs`` holding the evaluated densities after
        ``normalize()`` has been called on it.
    """
    density = ss.gaussian_kde(sample).evaluate(qs)
    pmf = Pmf(density, qs)
    # The raw values are densities; normalize() is called before returning.
    pmf.normalize()
    return pmf

In [32]:
# Price grid: 81 evenly spaced points from 0 to 80000 (a step of 1000).
# NOTE(review): the name `qs` is rebound to a different grid by the cell that
# builds kde_diff1/kde_diff2, so its value depends on which cell ran last.
qs = np.linspace(0, 80000, 81)
# Discretized KDE of each showcase column, evaluated on the price grid.
prior1 = kde_from_sample(df['Showcase 1'], qs)
prior2 = kde_from_sample(df['Showcase 2'], qs)
# Last expression: display the first prior as the cell output.
prior1

Unnamed: 0,probs
0.0,7.084244e-20
1000.0,2.993201e-18
2000.0,1.045021e-16
3000.0,3.017285e-15
4000.0,7.211458e-14
...,...
76000.0,3.504494e-17
77000.0,1.015177e-18
78000.0,2.419653e-20
79000.0,4.745246e-22


In [17]:
# Bid minus showcase value for each row: positive means the bid was higher
# than the showcase value. Note this is the opposite sign of the
# 'Difference 1' / 'Difference 2' columns shown in df.head().
sample_diff1 = df['Bid 1'] - df['Showcase 1']
sample_diff2 = df['Bid 2'] - df['Showcase 2']

In [18]:
# Difference grid: 61 evenly spaced points from -40000 to 20000 (a step of 1000).
# NOTE(review): this rebinds `qs`, which another cell uses for the 0..80000
# price grid; consider a distinct name to avoid order-dependent state.
qs = np.linspace(-40000, 20000, 61)
# Discretized KDE of each bid-minus-showcase sample on that grid.
# These two results are not referenced by the other visible cells.
kde_diff1 = kde_from_sample(sample_diff1, qs)
kde_diff2 = kde_from_sample(sample_diff2, qs)

In [19]:
# Summary statistics of the bid-minus-showcase samples.
# Series.std() uses pandas' default ddof=1 (sample standard deviation).
mean_diff1 = sample_diff1.mean()
std_diff1 = sample_diff1.std()
mean_diff2 = sample_diff2.mean()
std_diff2 = sample_diff2.std()

In [33]:
# Normal error models centered at 0 with the sample standard deviations;
# the sample means (mean_diff1 / mean_diff2) are not used here.
error_dist1 = ss.norm(0, std_diff1)
error_dist2 = ss.norm(0, std_diff2)
# Last expression: spot check of the CDF at -100, shown as the cell output.
error_dist1.cdf(-100)

0.49421835464466

In [34]:
# Fixed example guesses, one per showcase.
guess1 = 23000
guess2 = 38000
# Guess minus each quantity of the corresponding prior: one error value per
# entry of prior1.qs / prior2.qs.
error1 = guess1 - prior1.qs
error2 = guess2 - prior2.qs

In [35]:
# Density of each error value under the corresponding normal error model;
# one likelihood value per quantity of the prior.
likelihood1 = error_dist1.pdf(error1)
likelihood2 = error_dist2.pdf(error2)

In [31]:
# Unnormalized posterior: prior times likelihood, element by element.
posterior1 = prior1 * likelihood1
# normalize() is called for its effect on posterior1; its return value is unused.
posterior1.normalize()
# Last expression: preview the first entries as the cell output.
posterior1.head()

Unnamed: 0,probs
0.0,4.671763e-22
1000.0,3.166443e-20
2000.0,1.7365500000000002e-18


In [36]:
# Unnormalized posterior: prior times likelihood, element by element.
posterior2 = prior2 * likelihood2
# normalize() is called for its effect on posterior2; its return value is unused.
posterior2.normalize()
# Last expression: preview the first entries as the cell output.
posterior2.head()

Unnamed: 0,probs
0.0,8.205771e-21
1000.0,2.499936e-19
2000.0,6.4999520000000004e-18


In [37]:
def prob_overbid(sample_diff):
    """Return the fraction of entries in ``sample_diff`` that are positive.

    Parameters
    ----------
    sample_diff : array-like supporting elementwise ``>``
        Differences (e.g. a NumPy array or pandas Series).

    Returns
    -------
    float
        Mean of the boolean mask ``sample_diff > 0``.
    """
    overbid_mask = sample_diff > 0
    return np.mean(overbid_mask)

In [39]:
def prob_worse_than(diff, sample_diff):
    """Return the fraction of entries in ``sample_diff`` strictly below ``diff``.

    Parameters
    ----------
    diff : number
        Threshold to compare against.
    sample_diff : array-like supporting elementwise ``<``
        Differences (e.g. a NumPy array or pandas Series).

    Returns
    -------
    float
        Mean of the boolean mask ``sample_diff < diff``; entries equal to
        ``diff`` are not counted.
    """
    below_mask = sample_diff < diff
    return np.mean(below_mask)

In [40]:
def compute_prob_win(diff, sample_diff):
    """Probability of winning for a given ``diff`` against an opponent sample.

    Parameters
    ----------
    diff : number
        The player's difference; a positive value always yields 0.
    sample_diff : array-like
        The opponent's differences, passed to ``prob_overbid`` and
        ``prob_worse_than``.

    Returns
    -------
    number
        0 when ``diff > 0``; otherwise the fraction of opponent differences
        that are positive plus the fraction that are strictly below ``diff``.
    """
    # A positive diff short-circuits to zero without touching the sample.
    if diff > 0:
        return 0

    # Past the guard diff is not positive, so "opponent diff > 0" and
    # "opponent diff < diff" cannot both hold for the same entry; the two
    # fractions are therefore simply added.
    return prob_overbid(sample_diff) + prob_worse_than(diff, sample_diff)

In [41]:
# 121 evenly spaced diffs from -30000 to 5000.
xs = np.linspace(-30000, 5000, 121)
# Win probability at each diff, measured against the second sample of
# bid-minus-showcase differences.
ys = [compute_prob_win(x, sample_diff2) for x in xs]

In [42]:
def total_prob_win(bid, posterior, sample_diff):
    """Posterior-weighted probability of winning with ``bid``.

    Parameters
    ----------
    bid : number
        The bid being evaluated.
    posterior : mapping-like with ``.items()``
        Yields ``(price, probability)`` pairs.
    sample_diff : array-like
        Opponent differences forwarded to ``compute_prob_win``.

    Returns
    -------
    number
        Sum over all pairs of ``probability * compute_prob_win(bid - price,
        sample_diff)``; 0 when ``posterior`` has no entries.
    """
    win_prob = 0
    # Accumulate in iteration order so the floating-point sum is reproducible.
    for showcase_price, weight in posterior.items():
        win_prob += weight * compute_prob_win(bid - showcase_price, sample_diff)
    return win_prob

In [43]:
# Candidate bids: the same quantities the first posterior is defined over.
bids = posterior1.qs
# For each candidate bid, the posterior-weighted win probability measured
# against the second sample of bid-minus-showcase differences.
probs = [total_prob_win(bid, posterior1, sample_diff2) for bid in bids]
# Index the results by bid.
prob_win_series = pd.Series(probs, index=bids)
# Last expression: display the series as the cell output.
prob_win_series

0.0        2.971297e-01
1000.0     2.983131e-01
2000.0     2.998251e-01
3000.0     3.017554e-01
4000.0     3.042448e-01
               ...     
76000.0    9.298362e-30
77000.0    8.740531e-32
78000.0    6.621979e-34
79000.0    4.043155e-36
80000.0    1.982049e-38
Length: 81, dtype: float64

In [44]:
def compute_gain(bid, price, sample_diff):
    """Gain for ``bid`` when the true value is ``price``.

    Parameters
    ----------
    bid : number
        The bid being evaluated.
    price : number
        The assumed true price.
    sample_diff : array-like
        Opponent differences forwarded to ``compute_prob_win``.

    Returns
    -------
    number
        ``price`` times the win probability, doubled when ``bid - price``
        lies in the closed interval [-250, 0].
    """
    diff = bid - price
    prob = compute_prob_win(diff, sample_diff)

    # NOTE(review): the doubling presumably models a rule that awards both
    # showcases for a bid within 250 below the price - confirm with the author.
    in_bonus_window = -250 <= diff <= 0
    if not in_bonus_window:
        return price * prob
    return 2 * price * prob

In [45]:
def expected_gain(bid, posterior, sample_diff):
    """Posterior-weighted gain for ``bid``.

    Parameters
    ----------
    bid : number
        The bid being evaluated.
    posterior : mapping-like with ``.items()``
        Yields ``(price, probability)`` pairs.
    sample_diff : array-like
        Opponent differences forwarded to ``compute_gain``.

    Returns
    -------
    number
        Sum over all pairs of ``probability * compute_gain(bid, price,
        sample_diff)``; 0 when ``posterior`` has no entries.
    """
    gain = 0
    # Accumulate in iteration order so the floating-point sum is reproducible.
    for showcase_price, weight in posterior.items():
        gain += weight * compute_gain(bid, showcase_price, sample_diff)
    return gain