In [12]:
import quapy as qp

# Configuration: every tunable value of the covariate-shift experiment lives here.
# Seed handed to np.random.RandomState, the generator used for all sampling.
seed = 2032

# Means of the normal distribution the test covariates are drawn from;
# the experiment is repeated once for each value in this list.
test_mus=[-1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]
# Standard deviation of the test covariates (shared by every test mean).
test_std = 2

train_mu = 1
train_std = 2 # mean and std of the normal distribution the training covariates are drawn from
mu_neg = 0
std_neg = 0.5 # mean and std of the distribution describing negative examples
mu_pos = 2 
std_pos = 0.5 # mean and std of the distribution describing positive examples

# Error measure; it is called as error_function(true_prevalence, estimated_prevalence).
error_function = qp.error.mae

ntrain = 500 # number of examples in each training bag
ntest = 500 # number of examples in each test bag
nreps = 10 # number of times a training bag is drawn (and all quantifiers refitted) per test mean
n_test_samples = 50 # number of test bags drawn per training repetition

In [13]:
import math

def compute_probabilities(x, mu1, std1, mu2, std2):
    """Return, for each value in ``x``, the probability of the second Gaussian.

    The result is ``pdf2 / (pdf1 + pdf2)``, where ``pdf1`` and ``pdf2`` are the
    densities of N(mu1, std1) and N(mu2, std2) evaluated at ``x``. This is the
    same quantity as ``1 - r / (1 + r)`` with ``r = pdf1 / pdf2``.

    The computation is done on log-densities so that points far in the tails,
    where both densities underflow to 0.0, still yield a finite probability
    instead of a division by zero.

    Parameters
    ----------
    x : array-like
        One value per example, shaped ``(n,)`` or ``(n, 1)``.
    mu1, std1 : float
        Mean and standard deviation of the first Gaussian.
    mu2, std2 : float
        Mean and standard deviation of the second Gaussian.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(x)`` with values in [0, 1].

    Raises
    ------
    ValueError
        If ``std1`` or ``std2`` is not strictly positive, or if ``x`` holds
        more than one value per example.
    """
    values = np.asarray(x, dtype=float).reshape(len(x))

    # Log-densities up to the shared -log(sqrt(2*pi)) term, which cancels in the ratio.
    log_pdf1 = -math.log(std1) - (values - mu1) ** 2 / (2 * std1 ** 2)
    log_pdf2 = -math.log(std2) - (values - mu2) ** 2 / (2 * std2 ** 2)

    # 1 / (1 + pdf1 / pdf2) == exp(-log(1 + exp(log_pdf1 - log_pdf2)));
    # logaddexp evaluates the inner log without overflowing.
    return np.exp(-np.logaddexp(0.0, log_pdf1 - log_pdf2))


def generate_dataset_covariate_shift(rng, n_examples, x_mu, x_std, mu1, std1, mu2, std2):
    """Sample a one-feature binary dataset with normally distributed covariates.

    Covariates are drawn from N(x_mu, x_std). Each label is then drawn at
    random: an example is labelled 1 with the probability returned by
    ``compute_probabilities`` for its covariate, where N(mu1, std1) describes
    the negatives and N(mu2, std2) the positives.

    Parameters
    ----------
    rng : random generator exposing ``randn`` and ``rand``
        (for example ``np.random.RandomState``).
    n_examples : number of examples to generate.
    x_mu, x_std : mean and standard deviation of the covariates.
    mu1, std1 : mean and standard deviation describing negative examples.
    mu2, std2 : mean and standard deviation describing positive examples.

    Returns
    -------
    (covariates, labels): an array of shape ``(n_examples, 1)`` and an integer
    array of 0/1 labels of length ``n_examples``.
    """
    # Draw order matters for reproducibility: covariates first, label coins second.
    covariates = x_mu + x_std * rng.randn(n_examples, 1)
    positive_probs = compute_probabilities(covariates, mu1, std1, mu2, std2)
    draws = rng.rand(n_examples)
    labels = (draws < positive_probs).astype(int)
    return covariates, labels

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
import numpy as np



# One seeded generator shared by every sampling call, so a full run is reproducible.
rng = np.random.RandomState(seed)


def _new_base_classifier():
    """Return a fresh, unfitted logistic regression so no two quantifiers share one."""
    return LogisticRegression(max_iter=1000)


_aggregative = qp.method.aggregative

# Quantifiers under comparison, keyed by the short name used in the result files.
quant_methods = {
    "CC": _aggregative.CC(_new_base_classifier()),
    "PCC": _aggregative.PCC(_new_base_classifier()),
    "ACC": _aggregative.ACC(_new_base_classifier(), val_split=5, n_jobs=-1),
    "PACC": _aggregative.PACC(_new_base_classifier(), val_split=5, n_jobs=-1),
    "HDy": _aggregative.HDy(_new_base_classifier()),
    # EMQ is given a calibrated classifier rather than the raw logistic regression.
    "EMQ": _aggregative.EMQ(CalibratedClassifierCV(_new_base_classifier(), n_jobs=-1)),
    "MLPE": qp.method.non_aggregative.MaximumLikelihoodPrevalenceEstimation(),
}


In [15]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from quapy.data.base import LabelledCollection

# Column order of every per-method results table.
result_columns = ["train_rep", "test_sample", "test_mu", "p_train", "p_test", "error"]

# Rows are collected in plain lists and converted to DataFrames once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
result_rows = {method_name: [] for method_name in quant_methods}

for test_mu in test_mus:
    print('#Test mu=', test_mu, 'Rep#', end=' ')
    for rep in range(nreps):
        print(rep+1, end=' ')
        # generate_dataset_covariate_shift takes the negative-class distribution as
        # (mu1, std1) and the positive-class one as (mu2, std2). Passing them by
        # keyword keeps label 1 tied to mu_pos/std_pos as the configuration states.
        x_train, y_train = generate_dataset_covariate_shift(
            rng, ntrain, train_mu, train_std,
            mu1=mu_neg, std1=std_neg, mu2=mu_pos, std2=std_pos)
        train = LabelledCollection(x_train, y_train)
        train_prevalence = train.prevalence()

        # Every quantifier is refitted on the freshly drawn training bag.
        for quantifier in quant_methods.values():
            quantifier.fit(train)

        for n_test_sample in range(n_test_samples):
            x_test, y_test = generate_dataset_covariate_shift(
                rng, ntest, test_mu, test_std,
                mu1=mu_neg, std1=std_neg, mu2=mu_pos, std2=std_pos)
            test = LabelledCollection(x_test, y_test)
            # The true prevalence is the same for all quantifiers: compute it once per bag.
            true_prevalence = test.prevalence()

            for quant_name, quantifier in quant_methods.items():
                estimated_prevalence = quantifier.quantify(test.X)
                result_rows[quant_name].append({
                    'train_rep': rep,
                    'test_sample': n_test_sample,
                    'test_mu': test_mu,
                    'p_train': train_prevalence[1],
                    'p_test': true_prevalence[1],
                    'error': error_function(true_prevalence, estimated_prevalence),
                })
    print("")

# One DataFrame per quantification method, keyed by method name.
experiment_results = {
    method_name: pd.DataFrame(rows, columns=result_columns)
    for method_name, rows in result_rows.items()
}


#Test mu= -1.5 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= -1 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= -0.5 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= 0 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= 0.5 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= 1 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= 1.5 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= 2 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= 2.5 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= 3 Rep# 1 2 3 4 5 6 7 8 9 10 
#Test mu= 3.5 Rep# 1 2 3 4 5 6 7 8 9 10 


In [16]:
# Build the timestamp once so every CSV of this run carries the same tag, even if
# the minute rolls over while the files are being written.
date_string = f'{datetime.now():%Y_%m_%d_%H_%M}'

# to_csv does not create missing directories, so make sure the target exists.
output_dir = Path("results/covariate")
output_dir.mkdir(parents=True, exist_ok=True)

for quant_name in quant_methods:
    # One CSV per quantification method: results_<timestamp>_<method>.csv
    experiment_results[quant_name].to_csv(output_dir / ("results_%s_%s.csv" % (date_string, quant_name)))