In [None]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
from tqdm import tqdm
import seaborn as sns

In [None]:
# Input table (one row per transaction, one column per model prediction) and
# the destination of the cumulative-loss table written later in the notebook.
# `file_name` is also reused as the prefix of the figure / weight outputs.
file_name = "../results/experiment_1/trained_models.csv"
loss_file_name = "../results/experiment_1/losses.csv"

# The first CSV column becomes the row index.
df = pd.read_csv(file_name, index_col=0)

# Prediction columns treated as experts, kept in alphabetical order so that
# column positions are stable across the notebook.
models = sorted([
    'logistic_regression_lt_is', 'neural_network_lt_is',
    'random_forest_lt_is', 'support_vector_machine_lt_is', 'xgboost_lt_is',
    'logistic_regression_lt_th', 'neural_network_lt_th',
    'random_forest_lt_th', 'support_vector_machine_lt_th', 'xgboost_lt_th',
    'logistic_regression_sf_is', 'neural_network_sf_is',
    'random_forest_sf_is', 'support_vector_machine_sf_is', 'xgboost_sf_is',
    'logistic_regression_sf_th', 'neural_network_sf_th',
    'random_forest_sf_th', 'support_vector_machine_sf_th', 'xgboost_sf_th',
    'logistic_regression_st_is', 'neural_network_st_is',
    'random_forest_st_is', 'support_vector_machine_st_is', 'xgboost_st_is',
    'logistic_regression_st_th', 'neural_network_st_th',
    'random_forest_st_th', 'support_vector_machine_st_th', 'xgboost_st_th',
])

In [None]:
def c10(amount):
    """Cost of predicting 0 when the true label is 1: the amount, as a float."""
    cost = float(amount)
    return cost

def c01():
    """Fixed cost of predicting 1 when the true label is not 1."""
    fixed_cost = 100.0
    return fixed_cost

def single_loss(true_label, predicted_label, amount):
    """Cost-sensitive loss of one hard prediction.

    Args:
        true_label: ground-truth label of the transaction.
        predicted_label: predicted label; must compare equal to 0 or 1.
        amount: transaction amount, forwarded to ``c10``.

    Returns:
        0.0 when the labels agree (difference below 1e-6), ``c10(amount)``
        when ``true_label`` is 1 and the prediction differs, and ``c01()``
        otherwise.

    Raises:
        AssertionError: if ``predicted_label`` is neither 0 nor 1.
    """
    # The offending value goes into the exception message itself; the previous
    # `print(...)` message evaluated to None and only wrote to stdout.
    assert predicted_label == 0 or predicted_label == 1, (
        "predicted_label must be 0 or 1, got {!r}".format(predicted_label)
    )
    if abs(true_label - predicted_label) < 1e-6:
        return 0.0
    if true_label == 1:
        return c10(amount)
    return c01()

def f_losses(true_label, predictions, weights, amount, sogliazza=False):
    """Losses of every expert followed by the loss of the weighted ensemble.

    Args:
        true_label: ground-truth label of the transaction.
        predictions: one hard prediction per expert.
        weights: weight of each expert, aligned with ``predictions``.
        amount: transaction amount, forwarded to ``single_loss``.
        sogliazza: when true, the ensemble entry is the loss of the weighted
            prediction thresholded at 0.5; otherwise it is the weighted sum of
            the individual losses.

    Returns:
        A list with one loss per expert plus the ensemble loss as last item.
    """
    per_expert = [single_loss(true_label, pred, amount) for pred in predictions]
    if sogliazza:
        # Score the hard label obtained by thresholding the weighted vote.
        weighted_vote = np.sum(predictions*weights)
        ensemble = single_loss(true_label, weighted_vote>0.5, amount)
    else:
        # Weighted combination of the individual losses.
        ensemble = np.sum(weights*per_expert)
    return per_expert + [ensemble]

def grad_loss(true_label, predictions, amount):
    """Per-expert update signal passed to ``MWU.update`` in this notebook.

    Computes ``(c01() * (1 - true_label) - c10(amount) * true_label)`` and
    scales ``predictions`` by it, so the result has the shape of
    ``predictions``.
    """
    false_positive_term = c01()*(1-true_label)
    false_negative_term = c10(amount)*true_label
    return (false_positive_term-false_negative_term)*predictions

In [None]:
class MWU:
    """Multiplicative-weights aggregator over ``n`` experts.

    Keeps a weight vector that starts uniform and is re-weighted after every
    round by ``exp(-eta * loss)``. For ``eta >= 10`` the update degenerates to
    follow-the-leader: all weight goes to the expert with the smallest
    cumulative loss.
    """

    def __init__(self, eta, n):
        """Create the aggregator.

        Args:
            eta: learning rate; values >= 10 select follow-the-leader updates.
            n: number of experts.
        """
        # Current weights (uniform at start, summing to 1).
        self.w = np.ones(n)/n
        self.eta = eta
        # Running sum of every vector passed to `update`.
        self.cumLoss = np.zeros(n)

    def predict(self, predictions):
        """Return the weighted sum of the experts' predictions."""
        return np.sum(self.w*predictions)

    def update(self, grad_loss):
        """Re-weight the experts from this round's per-expert loss vector.

        Args:
            grad_loss: array with one entry per expert.
        """
        self.cumLoss += grad_loss
        if self.eta>=10:
            # Follow-the-leader: put all the mass on the best expert so far.
            # (The weight must be 1, otherwise the whole vector is zero and
            # every subsequent prediction collapses to 0.)
            leader = np.argmin(self.cumLoss)
            self.w = np.zeros_like(self.w)
            self.w[leader] = 1
        else:
            # Exponential re-weighting followed by renormalisation.
            self.w = self.w*np.exp(-self.eta*grad_loss)
            self.w /= np.sum(self.w)

In [None]:
N = len(df)      # number of rounds (rows of df)
M = len(models)  # number of experts

# Row t will receive the weights in force at round t; starts uniform.
Weights = np.ones((N, M))/M

# Loss histories start with a placeholder 0 that is dropped after the
# cumulative sums have been computed.
losses = {k:[0] for k in models}
L = [0]
preds = []

# Learning rate sqrt(2 ln M / (Linf * N)).
# NOTE(review): Linf looks like a scale / upper bound for the per-round loss;
# confirm the value against the data.
Linf = 6000
lr = np.sqrt(2*np.log(M)/(Linf*N))

# Passing eta = 0 instead (MWU(0, M)) keeps the weights uniform, because the
# exponential update then multiplies every weight by 1.
alg = MWU(lr, M)

In [None]:
# Online pass over the transactions: predict with the current weights, record
# the losses, then update the weights with this round's feedback.
for idx in tqdm(range(N)):
    trans = df.iloc[idx]
    true_label = trans["true"]
    amount = trans["Amount"]
    predictions = np.array([trans[m] for m in models])

    # Everything below is recorded with the weights used for this round,
    # i.e. before `alg.update` changes them.
    model_prediction = alg.predict(predictions)
    loss_vector = f_losses(true_label, predictions, alg.w, amount, sogliazza=False)
    L.append(loss_vector[-1])
    preds.append(model_prediction)
    Weights[idx] = alg.w

    # loss_vector holds one loss per expert followed by the ensemble loss;
    # zip stops after the per-expert entries.
    for m, expert_loss in zip(models, loss_vector):
        losses[m].append(expert_loss)

    grad = grad_loss(true_label, predictions, amount)
    alg.update(grad)

# Turn the per-round losses into cumulative curves.
for m in models:
    losses[m] = np.cumsum(losses[m])
L = np.cumsum(L)

In [None]:
# Unweighted majority vote: 1 when strictly more than half of the experts
# predict 1. Computed column-wise instead of with a per-row lambda;
# skipna=False keeps a missing prediction from being silently ignored
# (the comparison is then False, giving 0 as before).
vote_counts = df[models].sum(axis=1, skipna=False)
df["majority"] = (vote_counts > len(models)/2).astype(int)

# Cumulative cost-sensitive loss of the majority vote, one entry per row.
# Iterating the three columns avoids building a Series per row, and `total`
# lets tqdm show real progress.
loss_majority = np.cumsum([
    single_loss(true_label, vote, amount)
    for true_label, vote, amount in tqdm(
        zip(df["true"], df["majority"], df["Amount"]), total=len(df)
    )
])

In [None]:
# Stack the per-expert cumulative curves (one row per expert) and average
# them across experts at every time step.
loss_matrix = np.array([losses[name] for name in models])
mean_loss = np.mean(loss_matrix, axis=0)
print(mean_loss.shape)

In [None]:
# Drop the leading placeholder 0 so the ensemble curve has one entry per row.
# NOTE: not idempotent - re-running this cell removes another element.
L = L[1:]

In [None]:
# Drop the entry that corresponds to the leading placeholder 0 of each curve.
# NOTE: not idempotent - re-running this cell removes another element.
mean_loss = mean_loss[1:]

In [None]:
# Remove the leading placeholder entry from every stored curve.
for name in list(losses):
    losses[name] = losses[name][1:]


In [None]:
# Add the ensemble, majority-vote and mean-expert curves next to the experts.
losses.update({
    "MWU": L,
    "MAJORITY": loss_majority,
    "MEAN": mean_loss,
})

In [None]:
# One column per curve; the summary below is the cell's displayed output.
loss_df = pd.DataFrame.from_dict(losses)
loss_df.describe()

In [None]:
# Persist the cumulative-loss table (index included) to loss_file_name.
loss_df.to_csv(loss_file_name)

In [None]:
# Number of rounds to plot; lower it (e.g. 50000) to plot only a prefix.
TOT = len(L)

T = np.arange(TOT)
# Subsample the curves for plotting. The step is clamped to at least 1:
# for TOT < 2001 the integer division gives 0, which np.arange rejects.
idx = np.arange(1, TOT, max(1, TOT//2001))

In [None]:
plt.figure()

# Five sequential colormaps with six shades each: with the alphabetically
# sorted model names this gives every model family its own hue.
color_names = ['Purples', 'Blues', "Reds", 'Greens', 'Oranges']
cmaps = [plt.get_cmap(name) for name in color_names]
shade_levels = np.linspace(0.3, 1.0, 6)
colors = [rgba for cmap in cmaps for rgba in cmap(shade_levels)]

# Cumulative loss of every expert, then of the MWU ensemble on top.
for position, model_name in enumerate(sorted(models)):
    plt.plot(T[idx], losses[model_name][idx], label=model_name, color=colors[position])
plt.plot(T[idx], L[idx], "*-", label="MWU", markevery=100)
# Optional fixed y-range: plt.ylim(0, 2*10**7)
plt.ylabel("$L_T$")
plt.legend(bbox_to_anchor=(1,1), loc="upper left")
plt.grid()
plt.savefig(file_name+"perf.pdf", bbox_inches='tight')

In [None]:
plt.figure()
# Stacked area chart of the expert weights at the sampled rounds
# (one band per expert, coloured like the loss curves).
weight_history = Weights[idx].T
plt.stackplot(T[idx], weight_history, colors=colors)
plt.savefig(file_name+"weights.pdf")

In [None]:
# Display the weight history: row t holds the weights used at round t.
Weights

In [None]:
# Persist the full weight history in NumPy's binary format.
weights_path = file_name+"Weights.npy"
with open(weights_path, "wb") as handle:
    np.save(handle, Weights)