In [1]:
import pandas as pd
import numpy as np
import pickle
import csv
import time
import os
from numba import njit
os.chdir('..')
from qmc import QMC
from mmnl import MMNL
os.chdir('./data')
import matplotlib.pyplot as plt

Load data in the format used by k. Train.

In [2]:
def load_data(path):
    dat = np.load(path)
    X = dat[:, :-1]
    Y = np.reshape(dat[:, -1], (-1, 1))
    return X, Y
X, Y = load_data('data.npy')

In [151]:
print(X)

[[  1.          0.          0.        ...   3.7         5.2000001
    3.4000002]
 [  1.          0.          0.        ...   4.3000001   5.2000001
    4.4      ]
 [  1.          0.          0.        ...   2.5         4.6
    4.8      ]
 ...
 [300.          0.          0.        ...   3.5999998   5.0000001
    2.5      ]
 [300.          0.          0.        ...   3.7         5.0000001
    2.8000001]
 [300.          0.          0.        ...   3.7         3.5
    3.4000002]]


In [3]:
def utilities(X, beta):
    #performs matrix product to obtain the probability of every row
    #X should be in format [display, feature, price]
    try:
        assert(X.shape == (11192, 5) and beta.shape == (3,300))
    except AssertionError:
        raise AssertionError('Ga X ff in juiste format gooien. X: %s, beta: %s' %(X.shape,beta.shape))
    beta_choice = np.zeros((3,11192))
    for i in range(11192):
        id = int(X[i,0])
        beta_choice[:,i] = beta[:,id-1]
    eps = np.random.gumbel(size=(11192,))
    P = (X[:,2:]@ beta_choice)[:,0] + eps
#     try:
#         p = P[0:4]
#         check = p/np.sum(p)
#         assert(np.sum(check) == 1. or np.sum(check) == 1)
#     except AssertionError:
#         raise AssertionError('Kansen van eerste aankoop sommeren niet naar 1 %f'%(np.sum(check)))

    Y = []
    for i in range(0,len(P), 4):
        choice = np.argmax(P[i:i+4])
#         p = P[i:i+4]
#         check = p/np.sum(p)
#         print(np.sum(check))
        Y.append(int(choice))
    return np.array(Y)

In [4]:
# @njit
def probs(X,beta):
    #performs matrix product to obtain the probability of every row
    #X should be in format [display, feature, price]
    if not (X.shape == (11192, 5) and beta.shape == (3,300)):
        raise AssertionError('Ga X ff in juiste format gooien.')
    beta_choice = np.zeros((3,11192))
    for i in range(11192):
        id = int(X[i,0])
        beta_choice[:,i] = beta[:,id-1]   
    P = np.exp((X[:,2:]@ beta_choice)[:,0])
        
    if not (P.shape == (11192,)):
        raise AssertionError('Product van X en beta gaat niet goed.')
#     try:
#         p = P[0:4]
#         check = p/np.sum(p)
#         assert(np.sum(check) == 1. or np.sum(check) == 1)
#     except AssertionError:
#         raise AssertionError('Kansen van eerste aankoop sommeren niet naar 1 %f'%(np.sum(check)))

    Y = np.zeros((2798,))
    for i in range(0,11192, 4):
        sum = P[i:i+4].sum(axis=0)
        Y[i//4] = (P[i:i+4]/sum).argmax(axis=0)
#         p = P[i:i+4]
#         check = p/np.sum(p)
#         print(np.sum(check))
    return Y

In [160]:
%%time
def dgp(X: np.ndarray, D, method):
    #X: dataset
    #D: amount of datasets
    np.random.seed(123)
    theta = np.array([1.5,  1.,  -1.1,  0.4,  0.1,  0.6])
    Y_array = np.zeros((2798,D))
    P = np.zeros((11192,D))
    if method == 'QMC':
        delta = QMC((300,3,D))
    elif method == 'SMC':
        delta = np.random.standard_normal((300,3,D))
    print(method)
    beta = theta[:3].reshape(-1,1) + delta * theta[3:].reshape(-1,1)
    for t in range(X.shape[0]):
        obs = int(X[t,0])-1
        x = np.zeros((4,3))
        k = 0
        for i in range(1, 10, 4):
            x[:,k] = np.array([X[obs, i + j] for j in range(4)])
            k+=1
        for brand in range(4):
            P[t*4+brand,:] = (x[brand,:] @ beta[obs,:,:] +np.random.gumbel())
    
    for d in range(D):
        Y = np.zeros((2798,))
        for i in range(0,11192, 4):
            Y[i//4] = np.argmax(P[i:i+4,d])
        Y_array[:,d] = Y
            
            
    print(Y_array)
    return Y_array, P
    

Wall time: 0 ns


In [162]:
%%time
D = 10
Y_dgp, P= dgp(X,D,'SMC')

SMC
[[3. 3. 3. ... 3. 3. 3.]
 [2. 2. 3. ... 3. 2. 3.]
 [1. 1. 3. ... 3. 3. 3.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]
Wall time: 165 ms


In [132]:
for p in range(P.shape[1]):
    print(P[:4,p])

[0.04059729 0.31788023 0.01029562 0.63122685]
[0.04692733 0.3237661  0.01294858 0.61635798]
[0.14304796 0.33345306 0.08136662 0.44213235]
[0.27515738 0.20427372 0.33560387 0.18496504]
[0.20373264 0.29887196 0.15780141 0.33959399]
[0.24055119 0.26235982 0.2270288  0.2700602 ]
[0.13206429 0.33688725 0.07073797 0.46031049]
[0.22281235 0.28186039 0.1904917  0.30483556]
[0.10946538 0.34106814 0.05131376 0.49815271]
[0.1671012  0.32294091 0.10770037 0.40225751]


In [None]:
pickle.dump( dicter, open( "%i_%s_dgp.p"%(D,method), "wb" ) )

In [12]:
dgp_tip = pickle.load(open('1000_Y_dgp_tim.p', 'rb'))

In [13]:
dgp_tip

array([[3., 3., 0., ..., 0., 1., 2.],
       [3., 3., 1., ..., 0., 1., 3.],
       [1., 1., 1., ..., 0., 3., 1.],
       ...,
       [3., 3., 3., ..., 3., 3., 3.],
       [3., 3., 3., ..., 3., 3., 1.],
       [2., 2., 1., ..., 2., 1., 2.]])

In [69]:
np.sum(np.abs(Y_dgp - dgp_tip[:,:500]))

1059308.0

In [166]:
P[:4,:]

array([[ -1.57445926,   0.50482387,  -6.19249801,  -0.36430837,
         -7.07030717,  -5.36979068,  -1.10598619,  -7.55275526,
         -3.9961862 ,  -5.98804004],
       [ -3.88960745,  -2.21714059,  -7.60411688,  -2.91622521,
         -8.31018077,  -6.94237403,  -3.51279216,  -8.69823684,
         -5.83751825,  -7.43966156],
       [ -4.4219143 ,  -2.07142028,  -9.64230603,  -3.05391761,
        -10.63461206,  -8.71228903,  -3.89233603, -11.17998817,
         -7.15951872,  -9.41117963],
       [  1.99893109,   3.53579263,  -1.41440209,   2.8933905 ,
         -2.0632176 ,  -0.80631403,   2.34519382,  -2.41980969,
          0.2089589 ,  -1.26328098]])