# First, we declare some helper functions.

### Component mappings
We map from solver component (a component described using year, ct, en, sv, gb and component code) to a general component (described using component type and component code)

In [None]:
from graphlib.types import Types
import numpy as np

def solver_comps_to_comps(solver_cmps):
    """Translate solver components into general ``(car_type, type, code)`` tuples.

    Every solver component is a string made of three ``'__'``-separated
    fields; the second field is the component type and the third is the
    payload.  A ``Types.PNO12`` payload has the form ``ct_en_sv_gb`` and is
    expanded into four tuples (car type, engine, sales version, gearbox).
    Any other payload is sliced by position: characters 4-6 are the car type
    and everything from character 12 onwards is the component code.

    :param solver_cmps: iterable of solver component strings
    :return: list of ``(car_type, component_type, code)`` tuples
    :raises ValueError: if a component is not a string with exactly three
        ``'__'``-separated fields
    """
    cmps = []
    for scmp in solver_cmps:
        try:
            _, t, c = scmp.split('__')
        except (ValueError, AttributeError) as err:
            # Report the offending component instead of an opaque message.
            raise ValueError(
                "solver component {!r} is not of the form "
                "'<prefix>__<type>__<code>'".format(scmp)
            ) from err

        if t == Types.PNO12:
            ct, en, sv, gb = c.split("_")
            cmps.extend([
                (ct, Types.CAR_TYPE, ct),
                (ct, Types.ENGINE, en),
                (ct, Types.SALES_VERSION, sv),
                (ct, Types.GEARBOX, gb),
            ])
        else:
            # Only the car type (chars 4-6) and the trailing code are needed.
            cmps.append((c[4:7], t, c[12:]))

    return cmps

def solver_comps_to_vector(solver_cmps):
    """Sum the component-space weights of all general components of a car.

    The solver components are first translated with
    ``solver_comps_to_comps``; each resulting tuple is looked up in
    ``comp_space['comp_idx']`` and the matching row of the module-level
    ``weights`` is accumulated.

    :param solver_cmps: iterable of solver component strings
    :return: the accumulated weight (``0`` when there are no components)
    """
    comp_idx = comp_space['comp_idx']
    return sum(
        (weights[comp_idx[cmp]] for cmp in solver_comps_to_comps(solver_cmps)),
        0,
    )

### Solver
The solver function takes a weight vector $w$, a weight matrix $ws$ (containing a weight per component), the car type $ct$, the rules set (A_ubs,...,b_eqs) and a mapping from component index $i$ to a component tuple

In [None]:
from gurobipy import *
from tqdm import tqdm_notebook
import cvxpy as cp

def is_satisfied(c, A_ub, b_ub, A_eq, b_eq):
    """Check whether a component vector fulfils all rule constraints.

    :param c: candidate component vector, or ``None`` when no solution exists
    :param A_ub: matrix of the inequality constraints ``A_ub . c <= b_ub``
    :param b_ub: right-hand side of the inequality constraints
    :param A_eq: matrix of the equality constraints ``A_eq . c == b_eq``
    :param b_eq: right-hand side of the equality constraints
    :return: ``True`` if every inequality and equality holds, else ``False``
    """
    # ``solve`` hands back ``None`` when it found no solution; such a vector
    # can never satisfy the rules (and ``np.dot`` would fail on it).
    if c is None:
        return False

    ub_ok = np.all(np.dot(A_ub, c).T <= b_ub)
    # NOTE: exact comparison; a solver vector with rounding noise will fail here.
    eq_ok = np.all(np.dot(A_eq, c).T == b_eq)

    return bool(ub_ok and eq_ok)

def get_solver_data(ct, css):
    """Return the solver inputs stored for car type ``ct``.

    :param ct: car type key into ``css``
    :param css: mapping from car type to an object exposing ``i2c``,
        ``A_ubs``, ``b_ubs``, ``A_eqs`` and ``b_eqs``
    :return: tuple ``(i2c, A_ubs, b_ubs, A_eqs, b_eqs)``
    """
    entry = css[ct]
    return (entry.i2c, entry.A_ubs, entry.b_ubs, entry.A_eqs, entry.b_eqs)
    
def solve(w, ws, ct, A_ubs, b_ubs, A_eqs, b_eqs, i2c):
    """Find the rule-compliant car whose weight is closest to ``w``.

    A boolean vector ``x`` with one entry per solver component is optimised
    so that ``norm(ws[ct].T @ x - w)`` is minimal while ``A_ubs @ x <= b_ubs``
    and ``A_eqs @ x == b_eqs`` hold.  The problem is solved with Gurobi
    through CVXPY.

    :param w: target weight (point) to approximate
    :param ws: weights per car type; ``ws[ct]`` holds one weight per component
    :param ct: car type whose weights and rules are used
    :param A_ubs: inequality constraint matrix
    :param b_ubs: inequality right-hand side
    :param A_eqs: equality constraint matrix
    :param b_eqs: equality right-hand side
    :param i2c: index-to-component mapping; it is indexed with a list of
        positions (assumes it supports list indexing - TODO confirm)
    :return: ``(status, items, x, objective)``; the last three entries are
        ``None`` when the solver produced no solution
    """
    n = len(i2c)

    # Define and solve the CVXPY problem.
    x = cp.Variable(n, boolean=True)
    objective = cp.Minimize(cp.norm(ws[ct].T @ x - w))
    constraints = [A_ubs @ x <= b_ubs, A_eqs @ x == b_eqs]
    prob = cp.Problem(objective, constraints)
    prob.solve(solver=cp.GUROBI)

    # No value on the variable means the solver found no solution.  Check for
    # that explicitly rather than swallowing every exception, so that genuine
    # errors (e.g. a broken index mapping) are not silently hidden.
    if x.value is None:
        return (prob.status, None, None, None)

    selected = np.flatnonzero(x.value == 1).tolist()
    items = i2c[selected]
    return (prob.status, items, x.value, prob.value)

### The $css$
The $css$ contains the rule set and all components at the solver-component level, and it is shared by all algorithms. It is indexed by car type; each entry holds the component-to-index mapping and its inverse.

In [None]:
from graphlib.component_graph import ComponentGraph
from graphlib.component_solver import ComponentSolver

# Car types for which a rule system is set up.
cts = ["225","236","536","525","246","256","235","234","227","526","224"]
# Component graph reached through the bolt URL below.
cg = ComponentGraph("bolt://localhost:7687", "neo4j", "")

# One ComponentSolver per car type, keyed by the car type code.
css = dict((ct, ComponentSolver(cg.setup_rule_system(ct))) for ct in cts)

### Mappings
We need to define some mappings. First, we need to construct a dict of all the components on a general level, then give an index to each one of those (c2i, i2c). We also will need a mapping from solver components to general components (cs2cf).

In [None]:
from graphlib.MDict import MDict

# Collect every general component (type, code) that occurs in any car type's
# solver components, and remember how each solver component maps to them.
cmps = set()
cs2cf = {}
for ct in css.keys():
    for cmp in css[ct].c2i.keys():
        _, t, c = cmp.split('__')
        if t == Types.PNO12:
            # Use a separate name for the car type parsed from the payload so
            # the outer loop variable ``ct`` is not overwritten.
            p_ct, en, sv, gb = c.split('_')
            n_cmps = [
                (Types.CAR_TYPE, p_ct),
                (Types.ENGINE, en),
                (Types.SALES_VERSION, sv),
                (Types.GEARBOX, gb),
            ]
            cmps.update(n_cmps)
            cs2cf[cmp] = n_cmps
        else:
            # The component code starts at character 12 of the payload.
            n_c = (t, c[12:])
            cmps.add(n_c)
            cs2cf[cmp] = n_c

# NOTE: the order of a set is arbitrary, so the indices assigned below are
# only meaningful within the current session.
cmps_list = list(cmps)

# Index <-> general component mappings.
i2c = dict(enumerate(cmps_list))
c2i = {c: i for i, c in i2c.items()}

i2c = MDict(i2c)
c2i = MDict(c2i)
cs2cf = MDict(cs2cf)

### Global vars

In [None]:
# Every car type currently handled.
cts = ["225","236","536","525","246","256","235","234","227","526","224"]

# Car type <-> position mappings (position = place in ``cts``).
ct2i = {ct: i for i, ct in enumerate(cts)}
i2ct = dict(enumerate(cts))

### The test function
The test function takes a list of contexts and a list of cars, a `map_car2vec` function that maps a car to its vector over the general component indices, a weight matrix, the ct model that predicts a car type, and a point model that predicts a point in the component space (cmp2vec or distributional).

In [None]:
def test(ctxs_test, cars_test, ws, ct_model, p_model, map_car2vec=None, n_samples=None, ct_vecs=None):
    """Evaluate the predict-then-solve pipeline and return its mean squared error.

    For each (context, car) pair a car type and a target point are obtained,
    either from ``ct_model`` / ``p_model`` or from the precomputed
    ``ct_vecs``.  The solver then builds the closest rule-compliant car, which
    is compared with the true car in the general component space.  Samples
    for which the solver yields no satisfying vector are reported and left
    out of the loss.

    :param ctxs_test: sequence of context vectors
    :param cars_test: sequence of cars, aligned with ``ctxs_test``
    :param ws: weights per car type, passed on to ``solve``
    :param ct_model: model whose ``predict`` gives car type probabilities
        (unused when ``ct_vecs`` is given)
    :param p_model: model whose ``predict`` gives the target point
        (unused when ``ct_vecs`` is given)
    :param map_car2vec: callable mapping a car to its true component vector
    :param n_samples: number of leading pairs to use; all pairs when ``None``
    :param ct_vecs: optional sequence of ``(car_type, point)`` pairs used
        instead of the two models
    :return: mean squared error over the satisfied samples (``nan`` if none)
    :raises ValueError: if ``map_car2vec`` is not supplied
    """
    if map_car2vec is None:
        raise ValueError("'map_car2vec' must be supplied")

    not_satisfied = []
    y_s, y_preds = [], []

    pairs = list(zip(ctxs_test, cars_test))
    if n_samples is None:
        n_samples = len(pairs)
    current_pairs = pairs[:n_samples]
    np.random.shuffle(current_pairs)

    for i, (ctx, car) in enumerate(tqdm_notebook(current_pairs)):
        y = map_car2vec(car)

        if ct_vecs is None:
            # Sample a car type from the predicted distribution, then predict
            # the target point from the one-hot car type and the context.
            ct_probs = ct_model.predict(ctx.reshape(1, -1))[0]
            ct = np.random.choice(cts, p=ct_probs)
            ct_pred = np.zeros((len(ct2i)))
            ct_pred[ct2i[ct]] = 1
            p_pred = p_model.predict([[ct_pred], [ctx]]).T[0]
        else:
            ct, p_pred = ct_vecs[i]

        i2c_, A_ubs, b_ubs, A_eqs, b_eqs = get_solver_data(ct, css)
        _, scmps, x, _ = solve(p_pred, ws, ct, A_ubs, b_ubs, A_eqs, b_eqs, i2c_)

        # ``solve`` returns ``x = None`` when it finds nothing; treat that as
        # "not satisfied" instead of passing ``None`` to the rule check.
        if x is None or not is_satisfied(x, A_ubs, b_ubs, A_eqs, b_eqs):
            print("Item {} was not satisfied".format(i))
            not_satisfied.append((i, False))
            continue

        # Mark every general component of the solved car.
        y_pred = np.zeros((len(i2c)))
        for cs in scmps:
            cf = cs2cf[cs]
            idx = c2i[cf]
            y_pred[idx] = 1

        y_s.append(y)
        y_preds.append(y_pred)

    if not y_s:
        return np.nan

    # Squared *difference*: the second positional argument of np.square is
    # the output buffer, not a second operand.
    loss = np.mean(np.square(np.array(y_s) - np.array(y_preds)))

    return loss

# Algorithm 1b
Here we will make use of the item2vec (1d) weights and solver to find a decent car to a user context. First, the algorithm predicts a car type $ct$ and then a vector $v$ in the car space. The solver then takes $v$ and $ct$ as input and finds the nearest car. 

First, we get the component space along with the translated cars and ctx pairs.

## Dataset 1: component2vec
First, we load the components generated from the component2vec method along with the weights for each component

In [None]:
import pickle
import numpy as np

# Dimensionality of the component space to load.
dims = 1

# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
# The files are opened with ``with`` so the handles are closed again.
with open("data/training_test_data/item2vec_dataset.pkl", "rb") as fh:
    X_train, X_test, y_train_cmps, y_test_cmps = pickle.load(fh)

with open("data/component_space_{}d_no_packages.pickle".format(dims), 'rb') as fh:
    comp_space = pickle.load(fh)

In [None]:
# Number of training cars per car type; the car type is the first three
# characters of the payload of each car's first component.
d = {}
for cmps in y_train_cmps:
    _, _, cmp = cmps[0].split('__')
    ct = cmp[:3]
    d[ct] = d.get(ct, 0) + 1

# Number of occurrences of each exact component combination.
e = {}
for cmps in y_train_cmps:
    u_cmps = tuple(cmps)
    e[u_cmps] = e.get(u_cmps, 0) + 1

# Number of distinct cars in the training set.
len(e)

In [None]:
# Number of optional components on each test car.
n_opts = []
for cmps in y_test_cmps:
    fields = (c.split("__") for c in cmps)
    count = sum(1 for _, t, _ in fields if t == Types.OPT)
    n_opts.append(count)

# Average number of options per test car.
print(np.mean(n_opts))

### Create a weight matrix for solver components 
Here we create the weight matrix from the component space (generated from component2vec) such that it maps to each solver component in the $css$

In [None]:
# Weight of every solver component, per car type, in the order in which
# ``css[ct].i2c.values()`` yields the components.
# NOTE(review): ``weights`` is assigned in a later cell of this file; that
# cell has to be run before this one.
ws_cmp2vec = {}
for ct in cts:
    ct_weights = []
    for c in css[ct].i2c.values():
        _, t, cm = c.split('__')
        cms = cm.split('_')
        z = 0
        if len(cms) > 1:
            # Underscore-separated payload: ct_en_sv_gb.  The parsed car type
            # gets its own name so the loop variable ``ct`` stays intact and
            # the weight ends up in the list of the car type being processed.
            c_ct, en, sv, gb = cms
            for m in [(c_ct, Types.CAR_TYPE, c_ct), (c_ct, Types.ENGINE, en),
                      (c_ct, Types.SALES_VERSION, sv), (c_ct, Types.GEARBOX, gb)]:
                z += weights[comp_space['comp_idx'][m]]
        else:
            # Positional payload: chars 4-6 are the car type, 12+ the code.
            c_ct, ocu = cms[0][4:7], cms[0][12:]
            z += weights[comp_space['comp_idx'][(c_ct, t, ocu)]]

        ct_weights.append(z)

    ws_cmp2vec[ct] = np.array(ct_weights)

In [None]:
# Show the component-space keys, the shape of the context matrix and the
# number of training cars.
comp_space.keys(), X_train.shape, len(y_train_cmps)

Define some key variables. Move weight space to positive side

In [None]:
# Shift all weights up by |smallest weight| so that no weight is negative.
weights = np.abs(np.min(comp_space['weights'])) + comp_space['weights']
# Show the minimum after and before the shift.
np.min(weights), np.min(comp_space['weights'])

### Build train-test dataset for ct model
Here, we do not find the average car type within each context but rather compute for every car type in the context.

In [None]:
# One-hot car type target for every training car.  The car type is read from
# characters 9-11 of the car's first component string.
y_ct_train = []
for cmps in y_train_cmps:
    try:
        ct = cmps[0][9:12]
        ct_idx = ct2i[ct]
    except (KeyError, IndexError, TypeError):
        # Unknown car type or malformed car: report it and skip the row.
        # NOTE(review): a skipped row makes y_ct_train shorter than X_train,
        # so the two are no longer aligned - confirm this never triggers.
        print(cmps)
        continue

    z = np.zeros((len(ct2i)))
    z[ct_idx] = 1
    y_ct_train.append(z)

y_ct_train = np.array(y_ct_train)

# Same encoding for the test cars; here an unknown car type raises.
y_ct_test = []
for cmps in y_test_cmps:
    ct = cmps[0][9:12]

    ct_idx = ct2i[ct]
    z = np.zeros((len(ct2i)))
    z[ct_idx] = 1
    y_ct_test.append(z)
y_ct_test = np.array(y_ct_test)

In [None]:
# Show the first training context together with its one-hot car type.
X_train[0], y_ct_train[0]

## Create car type model
This will take a context as input and predict one car type

In [None]:
import keras.backend as K


def euclidean_distance_loss(y_true, y_pred):
    """
    Euclidean (L2) distance between prediction and target.
    https://en.wikipedia.org/wiki/Euclidean_distance
    :param y_true: tensor holding the targets
    :param y_pred: tensor holding the predictions, same shape as y_true
    :return: tensor with the distance taken over the last axis
    """
    squared_diff = K.square(y_pred - y_true)
    return K.sqrt(K.sum(squared_diff, axis=-1))


In [None]:
from keras.models import Model as KModel
from keras.layers import Dense, Input, concatenate
from keras.callbacks import TensorBoard,ReduceLROnPlateau,EarlyStopping
from keras.optimizers import Adam, sgd

def get_car_type_model(input_size, output_size):
    """Build and compile the car type classifier.

    The network has two hidden ReLU layers of 1000 units each and a softmax
    output layer.  It is compiled with categorical cross-entropy, SGD with a
    learning rate of 1e-5, and accuracy as metric.

    :param input_size: number of features in a context vector
    :param output_size: number of car types (size of the softmax output)
    :return: the compiled Keras model, named ``CAR_TYPE_MODEL``
    """
    input_r = Input(shape=(input_size,))

    hidden = Dense(1000, activation='relu')(input_r)
    hidden = Dense(1000, activation='relu')(hidden)
    output_layer = Dense(output_size, activation='softmax')(hidden)

    # The name is passed to the constructor rather than assigned afterwards.
    model = KModel(inputs=[input_r], outputs=output_layer, name="CAR_TYPE_MODEL")
    model.compile(loss='categorical_crossentropy', optimizer=sgd(lr=0.00001), metrics=['accuracy'])

    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

Compile car type model

In [None]:
# Input width = number of context features, output width = number of car types.
ct_model = get_car_type_model(X_train.shape[1], y_ct_train.shape[1])

### Fit car type model

In [None]:
# Train the car type model for at most 1000 epochs.
# NOTE: both callbacks watch the training loss ("loss"), not the validation loss,
# even though validation data is supplied.
ct_model.fit(X_train, y_ct_train, validation_data=(X_test, y_ct_test), epochs=1000, callbacks=[
    EarlyStopping(monitor="loss", patience=5),
    ReduceLROnPlateau(monitor='loss',verbose=1)])

## Create point model training sets
We need to map each component in the training sets to a point in the component space. Since it is created using the l2-norm, we use the same.

In [None]:
# Target point of every training car: the l2-norm over the weights of all of
# its general components.
y_p_train = []
for cmps in y_train_cmps:
    z = []
    for key in solver_comps_to_comps(cmps):
        z.append(weights[comp_space['comp_idx'][key]])

    z_norm = np.linalg.norm(z)
    y_p_train.append(z_norm)

# Same computation for the test cars.
y_p_test = []
for i, cmps in enumerate(y_test_cmps):
    z = []
    for key in solver_comps_to_comps(cmps):
        z.append(weights[comp_space['comp_idx'][key]])

    z_norm = np.linalg.norm(z)
    y_p_test.append(z_norm)

y_p_train = np.array(y_p_train)
y_p_test = np.array(y_p_test)

In [None]:
# Mean, variance, maximum and minimum of the training target points.
np.mean(y_p_train), np.var(y_p_train), np.max(y_p_train), np.min(y_p_train)

In [None]:
from seaborn import kdeplot

# Kernel density plot of the training target points.
kdeplot(y_p_train)

Prepare the ct train and test set

In [None]:
# One-hot car type inputs for the point model.  The car type is the first
# three characters of the payload of each car's first component, which has
# to be a PNO12 component.
X_ct_train = []
X_ct_test = []

for cmps in y_train_cmps:
    _, t, c = cmps[0].split("__")
    if t != Types.PNO12:
        raise Exception("no, no")

    ct = c[:3]
    z = np.zeros((len(ct2i)))
    idx = ct2i[ct]
    z[idx] = 1
    X_ct_train.append(z)

for cmps in y_test_cmps:
    _, t, c = cmps[0].split("__")
    if t != Types.PNO12:
        raise Exception("no, no")

    ct = c[:3]
    z = np.zeros((len(ct2i)))
    idx = ct2i[ct]
    z[idx] = 1
    X_ct_test.append(z)

X_ct_train, X_ct_test = np.array(X_ct_train), np.array(X_ct_test)

In [None]:
from keras.layers import concatenate

def get_point_model(ct_input_size, ctx_input_size, output_size, loss='mse'):
    """Build and compile the two-input point model.

    The one-hot car type and the context each pass through their own hidden
    ReLU layer of 1000 units; the two results are concatenated and fed
    through another hidden ReLU layer of 1000 units and a ReLU output layer.
    The model is compiled with SGD at a learning rate of 1e-5.

    :param ct_input_size: width of the car type input
    :param ctx_input_size: width of the context input
    :param output_size: width of the output layer
    :param loss: loss passed to ``compile`` (name or callable)
    :return: the compiled Keras model, named ``POINT_MODEL``
    """
    input_l = Input(shape=(ct_input_size,))
    input_r = Input(shape=(ctx_input_size,))

    hidden_l = Dense(1000, activation='relu')(input_l)
    hidden_r = Dense(1000, activation='relu')(input_r)

    merged = concatenate([hidden_l, hidden_r])

    hidden = Dense(1000, activation='relu')(merged)
    output_layer = Dense(output_size, activation='relu')(hidden)

    # The name is passed to the constructor rather than assigned afterwards.
    model = KModel(inputs=[input_l, input_r], outputs=output_layer, name="POINT_MODEL")
    model.compile(loss=loss, optimizer=sgd(lr=0.00001))

    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [None]:
# Point model with a single output, trained with the Euclidean distance loss.
point_model = get_point_model(y_ct_train.shape[1], X_train.shape[1], 1, loss=euclidean_distance_loss)

In [None]:
# Train the point model on (one-hot car type, context) -> target point.
# The epoch limit is effectively unbounded; training ends through early stopping,
# which (like the LR schedule) watches the training loss, not the validation loss.
p_history = point_model.fit([X_ct_train, X_train], y_p_train, validation_data=([X_ct_test, X_test], y_p_test), epochs=1000000, callbacks=[
    EarlyStopping(monitor="loss", patience=5),
    ReduceLROnPlateau(monitor='loss',verbose=1)])

## Generate car vectors from test set
First, we take the test set $y$ containing weights and regenerate cars using the solver. We then compare the generated component vectors with the correct in the test set.

In [None]:
def car2idxs_cmpvec(car):
    """Map a car to an indicator vector over the general components.

    The car is translated with ``cs2cf``; only the first entry of that
    translation is looked up in ``c2i`` and marked with 1.

    :param car: car given as solver components (used as key into ``cs2cf``)
    :return: vector of length ``len(c2i)`` with ones at the looked-up indices
    """
    general_cmps = cs2cf[car]
    vec = np.zeros((len(c2i)))
    vec[c2i[general_cmps[0]]] = 1

    return vec

Algorithm 1. MSE <b>after solver</b>

There is a risk that the solver doesn't successfully find a satisfied components vector. In that case, we just discard those examples

In [None]:
# Run the pipeline on 100 test samples.
# NOTE(review): test() as defined in this file returns a single loss value, so
# this two-name unpacking looks stale - confirm which return shape is intended.
z_ys, z_yps = test(X_test, y_test_cmps, ws_cmp2vec, ct_model, point_model, map_car2vec=car2idxs_cmpvec, n_samples=100)

In [None]:
# Mean squared error between the true and the predicted component vectors.
np.mean(np.square(z_ys - z_yps))

## Test Random Car Picker
The algorithm works as follows. First, we model the number of optional components per car as a truncated normal distribution $D$. Then, for every new car, we randomly draw an integer $k$ from $D$ and choose $k$ optional components at random. For each mandatory component type, we randomly pick one component with equal probability (or pick one PNO34).

First, calculate distribution for optional components

In [None]:
# Number of optional components on each training car.
nD = []
for cmps in y_train_cmps:
    fields = (cmp.split('__') for cmp in cmps)
    opts = [(t, c) for _, t, c in fields if t == Types.OPT]
    n = len(opts)
    nD.append(n)
nD = np.array(nD)

In [None]:
# Lower bound, upper bound, mean and standard deviation of the option counts.
# ``sd`` is used as the scale of a truncated normal distribution, which
# expects a standard deviation, so np.std is used rather than np.var.
l, h, mu, sd = np.min(nD), np.max(nD), np.mean(nD), np.std(nD)
l, h, mu, sd

In [None]:
from scipy.stats import truncnorm

# Truncated normal over [l, h]; truncnorm expects the bounds in units of
# ``scale`` relative to ``loc``, hence the (bound - mu) / sd conversion.
opt_dist = truncnorm((l - mu) / sd, (h - mu) / sd, loc=mu, scale=sd)

In [None]:
# Draw one sample from the option-count distribution and round it to an integer.
n_opts = int(round(opt_dist.rvs()))
n_opts

Now, calculate how many items for each mandatory component type

In [None]:
# Group the solver components of every car type: PNO12 components go into
# pno12s[ct], all others into ct_cpms[ct][component type].
cts = list(css.keys())  # rebinds the module-level list of car types
pno12s = MDict({})
ct_cpms = MDict({})
for ct in css.keys():
    ct_cpms[ct] = MDict({})
    pno12s[ct] = []
    for i in css[ct].i2c.keys():
        c_full = css[ct].i2c[i]
        # The component type is the middle '__'-separated field.
        _,t,c = c_full.split('__')
        if t == Types.PNO12:
            pno12s[ct].append(c_full)
        else:
            # First component of this type for this car type: start a new list.
            if not t in ct_cpms[ct]:
                ct_cpms[ct][t] = []
                
            ct_cpms[ct][t].append(c_full)

In [None]:
# Show the colour components collected for car type 227.
ct_cpms['227'][Types.COL]

In [None]:
# Pick one colour component of car type 225 uniformly at random.
np.random.choice(ct_cpms['225'][Types.COL])

### Test random algorithm
Now we are ready to test. We create a test set of size $|z_{ys}|$ and then compute mean squared error with $z_{ys}$

In [None]:
# Generate one random car per row of z_ys: a random car type, one random
# PNO12, colour and upholstery, and k randomly chosen options.
z_ryps = []
z_ryps_ss = []
m_types = [Types.COL, Types.UPH]  # not read anywhere in this cell

for i in range(z_ys.shape[0]):
    z_p_cmps = []
    
    ct = np.random.choice(cts)
    pno12 = np.random.choice(pno12s[ct])
    col = np.random.choice(ct_cpms[ct][Types.COL])
    uph = np.random.choice(ct_cpms[ct][Types.UPH])
    
    # Number of options, drawn from the option-count distribution.
    k = int(round(opt_dist.rvs()))
    # np.random.choice samples with replacement by default, so an option can repeat.
    opts = np.random.choice(ct_cpms[ct][Types.OPT], k)
    
    z_p_cmps.append(pno12)
    z_p_cmps.append(col)
    z_p_cmps.append(uph)
    z_p_cmps += opts.tolist()
    # Indicator vector over the solver components of this car type.
    z_s = np.zeros((len(css[ct].i2c)))
    z_s[css[ct].c2i[z_p_cmps]] = 1
        
    # NOTE(review): ``ws`` is not assigned anywhere in the visible file -
    # presumably a weight mapping such as ws_cmp2vec; confirm.
    w = np.sum(ws[ct][np.argwhere(z_s == 1)])
    
    z_ryps_ss.append((ct, w))
    # NOTE(review): ``z_p`` is not assigned anywhere in the visible file - confirm
    # which vector is meant to be collected here.
    z_ryps.append(z_p)

In [None]:
# Stack the random cars into one array and show its shape.
z_ryps = np.array(z_ryps)
z_ryps.shape

Test result <b>without solver</b>

In [None]:
# Mean squared error between the true cars and the random cars.
np.mean((z_ys - z_ryps)**2)

### Test RCP with solver

In [None]:
# Feed the weight of each random car through the solver and translate the
# solved car into the general component space.
sample_size = 100
y_preds = []
not_satisfied = []
for i in tqdm_notebook(range(len(z_ryps_ss[:sample_size]))):
    ct, w = z_ryps_ss[i]
    # get_solver_data returns five values and solve takes eight arguments;
    # the calls are written to match those definitions.
    ct_i2c, A_ubs, b_ubs, A_eqs, b_eqs = get_solver_data(ct, css)
    # NOTE(review): ``ws`` is not assigned anywhere in the visible file -
    # presumably a weight mapping such as ws_cmp2vec; confirm.
    p, scmps, x, d = solve(w, ws, ct, A_ubs, b_ubs, A_eqs, b_eqs, ct_i2c)

    # solve returns x = None when it finds nothing; that counts as unsatisfied.
    is_sat = x is not None and is_satisfied(x, A_ubs, b_ubs, A_eqs, b_eqs)
    if not is_sat:
        print("Item {} was not satisfied".format(i))

    not_satisfied.append((i, is_sat))

    # Every sample gets a row so y_preds stays aligned with not_satisfied;
    # an unsolved sample keeps an all-zero row.
    y_pred = np.zeros((len(i2c)))
    if scmps is not None:
        cmps = [(sc[1], sc[2]) for sc in solver_comps_to_comps(scmps)]
        y_pred[c2i[cmps]] = 1

    y_preds.append(y_pred)

In [None]:
# Satisfaction flag per sample; multiplying both sides by it zeroes the rows
# of unsatisfied samples before the mean squared error is taken.
ns = [x[1] for x in not_satisfied]
np.mean(((z_ys[:sample_size].T*ns).T - (np.array(y_preds).T*ns).T)**2)

## Dataset 2: component distribution

In [None]:
import pandas

# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
# The files are opened with ``with`` so the handles are closed again.
with open("data/training_test_data/train.pickle", 'rb') as fh:
    df_train = pickle.load(fh)

with open("data/training_test_data/test.pickle", 'rb') as fh:
    df_test = pickle.load(fh)

In [None]:
# Columns selected from df_train; the cells below address them by position
# in this list, so the order matters.
columns = ['TYPECODE', 'ENGINECODE', 'SALESVERSIONCODE', 'COLOUR', 'UPHOLSTERY',
       'GEARBOXCODE', 'OPT_CODES','MODEL_YEAR', 'ENGINECODE_dist',
       'TYPECODE_dist', 'SALESVERSIONCODE_dist', 'GEARBOXCODE_dist',
       'COLOUR_dist', 'UPHOLSTERY_dist', 'OPT_CODES_dist', 'WORKING_PNO34',
       'ENGINECODE_translated', 'TYPECODE_translated',
       'SALESVERSIONCODE_translated', 'GEARBOXCODE_translated',
       'COLOUR_translated', 'UPHOLSTERY_translated', 'OPT_CODES_translated']

In [None]:
# Collect the distribution values of every (car type, component type, code)
# component seen in the training data.
c2d = {}
m_types = [Types.ENGINE, Types.SALES_VERSION, Types.GEARBOX, Types.COL, Types.UPH]
for r in df_train[columns].values:
    #my = r[7]
    ct = r[17]
    # The car type entry holds a single value (last row wins), not a list.
    c2d[(ct, Types.CAR_TYPE, ct)] = r[9]

    # Mandatory components: translated codes paired with their *_dist values.
    for t, c, d in zip(m_types, r[[16,18,19,20,21]], r[[8,10,11,12,13]]):
        c2d.setdefault((ct, t, c), []).append(d)

    # Optional components: codes from column 6, values from column 14.
    for c, d in zip(r[6], r[14]):
        c2d.setdefault((ct, Types.OPT, c), []).append(d)

In [None]:
# Count how often each exact car (mandatory codes plus all options) occurs.
e = {}
for cmps in df_train[['ENGINECODE_translated', 'TYPECODE_translated',
       'SALESVERSIONCODE_translated', 'GEARBOXCODE_translated',
       'COLOUR_translated', 'UPHOLSTERY_translated', 'OPT_CODES_translated']].values:
    # The last column holds the options; flatten it into the key.
    u_cmps = tuple(cmps[:-1].tolist() + list(cmps[-1]))
    e[u_cmps] = e.get(u_cmps, 0) + 1

# Number of distinct cars.
len(e)

In [None]:
# Replace the collected values of every component by their mean (in place).
for k in list(c2d):
    c2d[k] = np.mean(c2d[k])

In [None]:
# Names of the DataFrame columns that together form a context vector.
context_columns = [
     'antal_inpendlare',
     'antal_utpendlare',
     'förvärvs-arbetande',
     'ej_förvärvs-arbetande',
     'äganderätt/småhus',
     'bostadsrätt',
     'hyresrätt',
     'övriga_inkl._uppgift_saknas',
     'förgymnasial',
     'gymnasial',
     'eftergymnasial_mindre_än_3_år',
     'eftergymnasial_3_år_eller_längre_inkl._forskarutbildning',
     '0-6_år',
     '7-15_år',
     '16-19_år',
     '20-24_år',
     '25-44_år',
     '45-64_år',
     '65-w_år',
     'låg_inkomst',
     'medellåg_inkomst',
     'medelhög_inkomst',
     'hög_inkomst',
     'medianinkomst',
     'sammanboende_med_barn',
     'sammanboende_utan_barn',
     'ensamstående_med_barn',
     'ensamstående_utan_barn',
     'övriga_hushåll',
     'låg_köpkraft',
     'medellåg_köpkraft',
     'medelhög_köpkraft',
     'hög_köpkraft',
     'median_köpkraft',
     'jordbruk,_skogsbruk,_jakt_och_fiske',
     'tillverkning_och_utvinning',
     'energi_och_miljöverksamhet',
     'byggverksamhet',
     'handel',
     'transport_och_magasinering',
     'hotell-_och_restaurangverksamhet',
     'information_och_kommunikation',
     'finans-_och_försäkringsverksamhet',
     'fastighetsverksamhet',
     'företagstjänster',
     'offentlig_förvaltning_och_försvar',
     'utbildning',
     'vård_och_omsorg,_sociala_tjänster',
     'kulturella_och_personliga_tjänster_m.m.',
     'okänd_verksamhet',
     '0_barn',
     '1_barn',
     '2_barn',
     '3+_barn'
]


In [None]:
# Context matrix and car rows for training. Each car row is ordered
# (car type, engine, sales version, gearbox, colour, upholstery, options).
ctxs_train = df_train[context_columns].values
cars_train = df_train[['TYPECODE_translated', 'ENGINECODE_translated', 'SALESVERSIONCODE_translated', 'GEARBOXCODE_translated','COLOUR_translated', 'UPHOLSTERY_translated', 'OPT_CODES_translated']].values

# Same selection for the test set.
ctxs_test = df_test[context_columns].values
cars_test = df_test[['TYPECODE_translated', 'ENGINECODE_translated', 'SALESVERSIONCODE_translated', 'GEARBOXCODE_translated','COLOUR_translated', 'UPHOLSTERY_translated', 'OPT_CODES_translated']].values

xy_ct_train here will be used in the first approximation function to train and predict a car type given the
context. In the second approximation function we get the context <i>and</i> the car type as input and predict a point
in the component distribution space.

In [None]:
def build_datasets(ctxs, cars):
    """Build the model inputs and targets from contexts and cars.

    For every car the ``c2d`` values of its car type and of its mandatory
    components are collected and reduced to their l2-norm, which is the
    target point.  The options of the car are not used.  ``m_types``,
    ``c2d`` and ``ct2i`` are read from module scope when the function runs.

    :param ctxs: iterable of context vectors
    :param cars: iterable of cars, each unpacking into
        ``(ct, en, sv, gb, col, uph, opts)``
    :return: ``(contexts, one-hot car types, target points)`` as arrays
    """
    contexts, ct_onehots, points = [], [], []
    for ctx, car in zip(ctxs, cars):
        ct, en, sv, gb, col, uph, opts = car

        dists = [c2d[(ct, Types.CAR_TYPE, ct)]]
        dists.extend(c2d[(ct, t, c)] for t, c in zip(m_types, [en, sv, gb, col, uph]))

        onehot = np.zeros((len(ct2i)))
        onehot[ct2i[ct]] = 1

        contexts.append(ctx)
        ct_onehots.append(onehot)
        points.append(np.linalg.norm(dists))

    return np.array(contexts), np.array(ct_onehots), np.array(points)

In [None]:
# Build the Dataset 2 training arrays. This rebinds X_train to the Dataset 2 contexts.
X_train, xy_ct_train, yd_p_train = build_datasets(ctxs_train, cars_train)
# Show the shapes of the arrays that were just built (yd_p_train, not the
# Dataset 1 target y_p_train).
X_train.shape, xy_ct_train.shape, yd_p_train.shape

In [None]:
# Mean, variance, minimum and maximum of the Dataset 2 target points.
np.mean(yd_p_train), np.var(yd_p_train), np.min(yd_p_train), np.max(yd_p_train)

In [None]:
# Kernel density plot of the Dataset 2 target points.
kdeplot(yd_p_train)

In [None]:
# Car type model for Dataset 2: context in, car type distribution out.
dct_model = get_car_type_model(X_train.shape[1], xy_ct_train.shape[1])

In [None]:
# Point model for Dataset 2 with a single output and the Euclidean distance loss.
dp_model = get_point_model(xy_ct_train.shape[1], X_train.shape[1], 1, loss=euclidean_distance_loss)

In [None]:
# Train the Dataset 2 car type model; no validation data is passed, and both
# callbacks watch the training loss. Training ends through early stopping.
dct_history = dct_model.fit(X_train, xy_ct_train, epochs=1000000, callbacks=[
    EarlyStopping(monitor="loss", patience=5),
    ReduceLROnPlateau(monitor='loss',verbose=1)])

In [None]:
# Train the Dataset 2 point model on its own targets: yd_p_train is the
# target array that build_datasets produced together with X_train and
# xy_ct_train (y_p_train belongs to Dataset 1).
dp_history = dp_model.fit([xy_ct_train, X_train], yd_p_train, epochs=1000000, callbacks=[
    EarlyStopping(monitor="loss", patience=5),
    ReduceLROnPlateau(monitor='loss',verbose=1)])

In [None]:
# Mean and variance of y_p_train.
# NOTE(review): y_p_train is the Dataset 1 target; this section otherwise works
# with yd_p_train - confirm which one is meant.
np.mean(y_p_train), np.var(y_p_train)

### Build ws matrix
Since we need weights to the solver in solver-component format, we iterate through all the components in the css and find the corresponding weight in the c2d dictionary. However, there might be components in css but not in c2d (the component has not been sold), then we just set the $d$ to 0.

In [None]:
# Weight of every solver component, per car type, taken from c2d.  A
# component that is missing from c2d gets the weight 0.
# NOTE(review): the weights follow the iteration order of css[ct].c2i.keys();
# presumably that matches the solver's component indices - confirm.
ws_c2d = MDict()

typs = [Types.CAR_TYPE, Types.ENGINE, Types.SALES_VERSION, Types.GEARBOX]
for ct in css.keys():
    ct_weights = []
    for cs in css[ct].c2i.keys():
        _, t, cmps = cs.split('__')
        if t == Types.PNO12:
            # The parsed car type and the zipped type get their own names so
            # the loop variables ``ct`` and ``t`` are not overwritten.
            c_ct, en, sv, gb = cmps.split('_')
            w = [c2d.get((c_ct, typ, c), 0) for typ, c in zip(typs, [c_ct, en, sv, gb])]
            # A PNO12 bundles four components; its weight is their l2-norm.
            ct_weights.append(np.linalg.norm(w))
        else:
            # Positional payload: chars 4-6 are the car type, 12+ the code.
            c_ct, c = cmps[4:7], cmps[12:]
            ct_weights.append(c2d.get((c_ct, t, c), 0))

    ws_c2d[ct] = np.array(ct_weights)

In [None]:
# Show the solver-component weights of car type 225.
ws_c2d['225']

### Test 2

In [None]:
def car2idxs_dist(car):
    """Map a Dataset 2 car to an indicator vector over the general components.

    All entries of ``car`` but the last are paired, in order, with the types
    car type, engine, sales version, gearbox, colour and upholstery; the last
    entry holds the option codes.  The ``c2i`` indices of all of these
    components are set to 1.

    :param car: sequence whose last entry is an iterable of option codes
    :return: vector of length ``len(c2i)``
    """
    mandatory_types = [Types.CAR_TYPE, Types.ENGINE, Types.SALES_VERSION,
                       Types.GEARBOX, Types.COL, Types.UPH]
    vec = np.zeros((len(c2i)))

    idxs = c2i[list(zip(mandatory_types, car[:-1]))]
    idxs += c2i[[(Types.OPT, code) for code in car[-1]]]
    vec[idxs] = 1

    return vec

In [None]:
# Evaluate the Dataset 2 models with the solver on 50 test samples.
test(ctxs_test, cars_test, ws_c2d, dct_model, dp_model, car2idxs_dist, n_samples=50)

In [None]:
def get_dist(dataset, get_key = lambda k: k):
    """Count how often each key occurs in ``dataset``.

    :param dataset: iterable of items
    :param get_key: callable mapping an item to a hashable key; the item
        itself is used by default
    :return: dict mapping each key to its number of occurrences
    """
    dist = {}
    for c in dataset:
        # Compute the key once per item rather than on every lookup.
        key = get_key(c)
        dist[key] = dist.get(key, 0) + 1

    return dist

In [None]:
# Number of distinct target points in Dataset 1 and in Dataset 2.
len(get_dist(y_p_train)),len(get_dist(yd_p_train))

In [None]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
# Oversample the training frame, using the translated car type as class label.
ros = RandomOverSampler()

types_df = ['TYPECODE']
for key in types_df:
    # Skips option columns; never triggers for the single key listed above.
    if 'OPT' in key:
        continue
    X_res, y_res = ros.fit_resample(df_train, df_train[f'{key}_translated'])
    # Rebuild a DataFrame carrying the original column names.
    resampled = pd.DataFrame(X_res,columns=df_train.columns)

In [None]:
# Number of rows per car type after oversampling.
resampled['TYPECODE_translated'].value_counts()

## Random using component distribution data

In [None]:
# Number of options on each training car (entry 6 of a car row holds the options).
nD = np.array([len(cmps[6]) for cmps in cars_train])

In [None]:
# Lower bound, upper bound, mean and standard deviation of the option counts.
# ``sd`` is used as the scale of a truncated normal distribution, which
# expects a standard deviation, so np.std is used rather than np.var.
l, h, mu, sd = np.min(nD), np.max(nD), np.mean(nD), np.std(nD)
l, h, mu, sd

In [None]:
# Truncated normal over [l, h]; truncnorm expects the bounds in units of
# ``scale`` relative to ``loc``, hence the (bound - mu) / sd conversion.
opt_dist = truncnorm((l - mu) / sd, (h - mu) / sd, loc=mu, scale=sd)

In [None]:
# Generate one random car per test car and record (car type, weight), where
# the weight is the l2-norm of the ws_c2d weights of the chosen components.
z_ryps = []
z_ryps_ss = []  # stays empty: the append below is commented out
w_vecs = []
m_types = [Types.COL, Types.UPH]  # rebinds the module-level m_types; not read in this cell

for i in range(len(y_test_cmps)):
    z_p_cmps = []
    
    ct = np.random.choice(cts)
    pno12 = np.random.choice(pno12s[ct])
    col = np.random.choice(ct_cpms[ct][Types.COL])
    uph = np.random.choice(ct_cpms[ct][Types.UPH])
    
    # Number of options, drawn from the option-count distribution.
    k = int(round(opt_dist.rvs()))
    # np.random.choice samples with replacement by default, so an option can repeat.
    opts = np.random.choice(ct_cpms[ct][Types.OPT], k)
    
    z_p_cmps.append(pno12)
    z_p_cmps.append(col)
    z_p_cmps.append(uph)
    z_p_cmps += opts.tolist()
    
    # Indicator vector over the solver components of this car type.
    z_s = np.zeros((len(css[ct].i2c)))
    z_s[css[ct].c2i[z_p_cmps]] = 1
        
    w = np.linalg.norm(ws_c2d[ct][np.argwhere(z_s == 1)])
    w_vecs.append((ct,w))
    #z_ryps_ss.append((ct, w))
    z_ryps.append(z_s)

In [None]:
# Shuffle the (context, car) test pairs and split them back into two sequences.
pairs = list(zip(ctxs_test, cars_test))
np.random.shuffle(pairs)
# ct_tst holds the contexts, cs_tst the cars.
ct_tst, cs_tst = zip(*pairs)

# No models are passed: car type and target point come from the random w_vecs.
test(ct_tst, cs_tst, ws_c2d, None, None, map_car2vec=car2idxs_dist, n_samples=10, ct_vecs=w_vecs)

The loss of 0.024… is a new result that is not in the paper; it was left out because of a results deadline.