In [16]:
import os
import openml
import joblib
import shutil
import numpy as np
import pandas as pd
import seaborn as sb
from tqdm.notebook import tqdm
from copy import deepcopy as copy
from joblib import Parallel, delayed
from matplotlib import pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.preprocessing import RobustScaler as RS

# OpenML credentials are read from the environment so that the secret is not
# stored in (and shared with) the notebook. When OPENML_APIKEY is not set,
# openml's own configuration is left untouched.
_openml_apikey = os.environ.get("OPENML_APIKEY")
if _openml_apikey:
    openml.config.apikey = _openml_apikey
# Root folder used for the experiment data.
data_folder = "/data/pereirabarataap/journal/"
# Use the fully qualified option names: abbreviated names are resolved by
# pattern matching and raise an OptionError as soon as more than one pandas
# option matches the pattern.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Datasets

We use [openML](https://www.openml.org/) to get some (10) datasets for our experiments.

The datasets must respect the following:
* no missing values
* be between 20 and 100 features wide
* be between 100 and 5000 instances long
* classification task-related
    * 2 class-specific
    * at least 100 instances per class

In [81]:
# OpenML catalogue as a frame (transposed so that datasets are rows);
# entries with any missing metadata field are discarded.
datasets_df = pd.DataFrame.from_dict(openml.datasets.list_datasets()).T.dropna(how="any")

# Boolean row mask implementing the selection criteria listed above, plus the
# exclusion of datasets whose name contains an underscore.
loc = (
    datasets_df["NumberOfMissingValues"].eq(0)
    & datasets_df["NumberOfClasses"].eq(2)
    & datasets_df["MinorityClassSize"].ge(100)
    & datasets_df["NumberOfInstances"].le(5000)
    & datasets_df["NumberOfFeatures"].le(100)
    & datasets_df["NumberOfFeatures"].ge(20)
    & ~datasets_df["name"].str.contains("_")
)

# Candidate dataset ids, smallest datasets first.
dids = (
    datasets_df
    .loc[loc]
    .sort_values(by=["NumberOfInstances"])
    ["did"]
)
dids.tolist()  # not the last expression of the cell, so nothing is displayed

# Print each candidate's id and description for manual inspection.
for did in dids:
    dataset = openml.datasets.get_dataset(did)
    print(did, dataset.description, "\n\n", sep="\n")

DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.


59
**Author**: Space Physics Group, Applied Physics Laboratory, Johns Hopkins University. Donated by Vince Sigillito.  
**Source**: [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/ionosphere)  
**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html) 

**Johns Hopkins University Ionosphere database**  
This radar data was collected by a system in Goose Bay, Labrador.  This system consists of a phased array of 16 high-frequency antennas with a total transmitted power on the order of 6.4 kilowatts.  See the paper for more details.  

### Attribute information
Received signals were processed using an autocorrelation function whose arguments are the time of a pulse and the pulse number.  There were 17 pulse numbers for the Goose Bay system.  Instances in this database are described by 2 attributes per pulse number, corresponding to the complex values returned by the function resulting from the complex electromagnetic signal.

The targets were fr

DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.


1494
**Author**: Kamel Mansouri, Tine Ringsted, Davide Ballabio  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation)  
**Please cite**: Mansouri, K., Ringsted, T., Ballabio, D., Todeschini, R., Consonni, V. (2013). Quantitative Structure - Activity Relationship models for ready biodegradability of chemicals. Journal of Chemical Information and Modeling, 53, 867-878 


QSAR biodegradation Data Set 

* Abstract: 

Data set containing values for 41 attributes (molecular descriptors) used to classify 1055 chemicals into 2 classes (ready and not ready biodegradable).


* Source:

Kamel Mansouri, Tine Ringsted, Davide Ballabio (davide.ballabio '@' unimib.it), Roberto Todeschini, Viviana Consonni, Milano Chemometrics and QSAR Research Group (http://michem.disat.unimib.it/chm/), UniversitÃ  degli Studi Milano â€“ Bicocca, Milano (Italy)


* Data Set Information:

The QSAR biodegradation dataset was built in the Milano Chemometrics and QSAR Research Group (UniversitÃ

DEBUG:openml.datasets.dataset:Saved dataset 1067: kc1 to file /home/pereirabarataap/.openml/cache/org/openml/www/datasets/1067/dataset.pkl.py3


1067
**Author**: Mike Chapman, NASA  
**Source**: [tera-PROMISE](http://openscience.us/repo/defect/mccabehalsted/kc1.html) - 2004  
**Please cite**: Sayyad Shirabad, J. and Menzies, T.J. (2005) The PROMISE Repository of Software Engineering Databases. School of Information Technology and Engineering, University of Ottawa, Canada.  
  
**KC1 Software defect prediction**  
One of the NASA Metrics Data Program defect data sets. Data from software for storage management for receiving and processing ground data. Data comes from McCabe and Halstead features extractors of source code.  These features were defined in the 70s in an attempt to objectively characterize code features that are associated with software quality.

### Attribute Information  

1. loc             : numeric % McCabe's line count of code
2. v(g)            : numeric % McCabe "cyclomatic complexity"
3. ev(g)           : numeric % McCabe "essential complexity"
4. iv(g)           : numeric % McCabe "design complexity"
5. n  

DEBUG:openml.datasets.dataset:Saved dataset 958: segment to file /home/pereirabarataap/.openml/cache/org/openml/www/datasets/958/dataset.pkl.py3


958
**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

Binarized version of the original data set (see version 1). The multi-class target feature is converted to a two-class nominal target feature by re-labeling the majority class as positive ('P') and all others as negative ('N'). Originally converted by Quan Sun.





DEBUG:openml.datasets.dataset:Saved dataset 1487: ozone-level-8hr to file /home/pereirabarataap/.openml/cache/org/openml/www/datasets/1487/dataset.pkl.py3


1487
**Author**: Kun Zhang, Wei Fan, XiaoJing Yuan

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/ozone+level+detection)

**Please cite**:   

Forecasting skewed biased stochastic ozone days: analyses, solutions and beyond, Knowledge and Information Systems, Vol. 14, No. 3, 2008. 


1 . Abstract: 
Two ground ozone level data sets are included in this collection. One is the eight hour peak set (eighthr.data), the other is the one hour peak set (onehr.data). Those data were collected from 1998 to 2004 at the Houston, Galveston and Brazoria area.

2. Source:

Kun Zhang, zhang.kun05 '@' gmail.com, Department of Computer Science, Xavier University of Lousiana 
Wei Fan, wei.fan '@' gmail.com, IBM T.J.Watson Research 
XiaoJing Yuan, xyuan '@' uh.edu, Engineering Technology Department, College of Technology, University of Houston 


3. Data Set Information:

All the attribute start with T means the temperature measured at different time throughout the day; and those starts with WS 

DEBUG:openml.datasets.dataset:Saved dataset 953: splice to file /home/pereirabarataap/.openml/cache/org/openml/www/datasets/953/dataset.pkl.py3
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.


953
**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

Binarized version of the original data set (see version 1). The multi-class target feature is converted to a two-class nominal target feature by re-labeling the majority class as positive ('P') and all others as negative ('N'). Originally converted by Quan Sun.



3
**Author**: Alen Shapiro
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn))    
**Please cite**: [UCI citation policy](https://archive.ics.uci.edu/ml/citation_policy.html)  

1. Title: Chess End-Game -- King+Rook versus King+Pawn on a7
    (usually abbreviated KRKPA7).  The pawn on a7 means it is one square
    away from queening.  It is the King+Rook's side (white) to move.
 
 2. Sources:
     (a) Database originally generated and described by Alen Shapiro.
     (b) Donor/Coder: Rob Holte (holte@uottawa.bitnet).  The database
         was supplied to Holte by Peter Clark of the Turing Institute
         in G

DEBUG:openml.datasets.dataset:Saved dataset 41156: ada to file /home/pereirabarataap/.openml/cache/org/openml/www/datasets/41156/dataset.pkl.py3


41156
The goal of this challenge is to expose the research community to real world datasets of interest to 4Paradigm. All datasets are formatted in a uniform way, though the type of data might differ. The data are provided as preprocessed matrices, so that participants can focus on classification, although participants are welcome to use additional feature extraction procedures (as long as they do not violate any rule of the challenge). All problems are binary classification problems and are assessed with the normalized Area Under the ROC Curve (AUC) metric (i.e. 2*AUC-1).
                   The identity of the datasets and the type of data is concealed, though its structure is revealed. The final score in  phase 2 will be the average of rankings  on all testing datasets, a ranking will be generated from such results, and winners will be determined according to such ranking.
                   The tasks are constrained by a time budget. The Codalab platform provides computational resou

DEBUG:openml.datasets.dataset:Saved dataset 44: spambase to file /home/pereirabarataap/.openml/cache/org/openml/www/datasets/44/dataset.pkl.py3


44
**Author**: Mark Hopkins, Erik Reeber, George Forman, Jaap Suermondt    
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/spambase)   
**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)

SPAM E-mail Database  
The "spam" concept is diverse: advertisements for products/websites, make money fast schemes, chain letters, pornography... Our collection of spam e-mails came from our postmaster and individuals who had filed spam.  Our collection of non-spam e-mails came from filed work and personal e-mails, and hence the word 'george' and the area code '650' are indicators of non-spam.  These are useful when constructing a personalized spam filter.  One would either have to blind such non-spam indicators or get a very wide collection of non-spam to generate a general purpose spam filter.
 
For background on spam:  
Cranor, Lorrie F., LaMacchia, Brian A.  Spam! Communications of the ACM, 41(8):74-83, 1998.  

### Attribute Information:  
The last column deno

DEBUG:openml.datasets.dataset:Saved dataset 40701: churn to file /home/pereirabarataap/.openml/cache/org/openml/www/datasets/40701/dataset.pkl.py3


40701
**Author**: Unknown  
**Source**: [PMLB](https://github.com/EpistasisLab/penn-ml-benchmarks/tree/master/datasets/classification), [BigML](https://bigml.com/user/francisco/gallery/dataset/5163ad540c0b5e5b22000383), Supposedly from UCI but I can't find it there.  
**Please cite**:   

A dataset relating characteristics of telephony account features and usage and whether or not the customer churned. Originally used in [Discovering Knowledge in Data: An Introduction to Data Mining](http://secs.ac.in/wp-content/CSE_PORTAL/DataMining_Daniel.pdf).





DEBUG:openml.datasets.dataset:Saved dataset 979: waveform-5000 to file /home/pereirabarataap/.openml/cache/org/openml/www/datasets/979/dataset.pkl.py3


979
**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

Binarized version of the original data set (see version 1). The multi-class target feature is converted to a two-class nominal target feature by re-labeling the majority class as positive ('P') and all others as negative ('N'). Originally converted by Quan Sun.





In [94]:
# The 10 dataset ids retained after manually inspecting the candidates.
dids = [59, 1063, 1510, 40705, 31, 1494, 1504, 1487, 3, 44]
# Show their metadata ordered by size, without the missing-value counters.
(
    datasets_df
    .loc[dids]
    .sort_values(by=["NumberOfInstances"])
    .drop(columns=["NumberOfInstancesWithMissingValues", "NumberOfMissingValues"])
)

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfSymbolicFeatures
59,59,ionosphere,1,1,active,ARFF,225,2,126,2,35,351,34,1
1063,1063,kc2,1,2,active,ARFF,415,2,107,2,22,522,21,1
1510,1510,wdbc,1,64,active,ARFF,357,2,212,2,31,569,30,1
40705,40705,tokyo1,1,869,active,ARFF,613,2,346,2,45,959,42,3
31,31,credit-g,1,1,active,ARFF,700,10,300,2,21,1000,7,14
1494,1494,qsar-biodeg,1,64,active,ARFF,699,2,356,2,42,1055,41,1
1504,1504,steel-plates-fault,1,64,active,ARFF,1268,2,673,2,34,1941,33,1
1487,1487,ozone-level-8hr,1,64,active,ARFF,2374,2,160,2,73,2534,72,1
3,3,kr-vs-kp,1,1,active,ARFF,1669,3,1527,2,37,3196,0,37
44,44,spambase,1,1,active,ARFF,2788,2,1813,2,58,4601,57,1


## Feature and Target Manipulation
Before testing how different methods perform in crosslier detection, we need to generate crossliers to be detected.<br>
The experimental setup can be divided into:
* symmetric
    * both classes have the same proportion of crossliers
* asymmetric
    * the proportion of crossliers in both classes can be different

#### Symmetric
There are 2 parameters in this setup:
* <font size="4">$\rho_y$</font> $\in \{.01,.02,.03,.04,.05,.06,.07,.08,.09,.1\}$
* <font size="4">$\rho_x$</font> $\in \{0,  .05, .1,.15, .2, .25, .3, .35, .4, .45\}$

<font size="4">$\rho_y$</font> represents the proportion of instances with labels swapped in each class.<br>
<font size="4">$\rho_x$</font> represents the proportion of attributes of which the values are replaced in label-swapped instances.<br>

#### Asymmetric
There are 4 parameters in this setup:
* <font size="4">$\rho_{y+}$</font> $\in \{.01, .02,.03, .04,.05, .06,.07, .08,.09,  .1\}$
* <font size="4">$\rho_{y-}$</font> $\in \{.01, .02,.03, .04,.05, .06,.07, .08,.09,  .1\}$
* <font size="4">$\rho_{x+}$</font> $\in \{0,   .05, .1, .15, .2, .25, .3, .35, .4, .45\}$
* <font size="4">$\rho_{x-}$</font> $\in \{0,   .05, .1, .15, .2, .25, .3, .35, .4, .45\}$

<font size="4">$\rho_{y+}$</font> represents the proportion of instances originally belonging to the positive class whose labels are swapped.<br>
<font size="4">$\rho_{y-}$</font> represents the proportion of instances originally belonging to the negative class whose labels are swapped.<br>
<font size="4">$\rho_{x+}$</font> represents the proportion of attributes whose values are replaced in label-swapped instances that originally belong to the positive class.<br>
<font size="4">$\rho_{x-}$</font> represents the proportion of attributes whose values are replaced in label-swapped instances that originally belong to the negative class.<br>

The replacement simulates real-world fraud in which manipulation of feature values towards those of another label further masks the real label.<br>
Replacement values are drawn from <b>univariate</b> distributions with parameters estimated from the respective attributes belonging to the class being mimicked.<br>
To note, for both <i>symmetric</i> and <i>asymmetric</i> setups:
* Each dataset will have <b>50</b> random initialisations for each set of parameters
* Attributes to be replaced:
    * are chosen by either:
        * <b>random</b> selection, where attributes are selected randomly for each instance
        * <b>category</b> selection, where each class has a set of random attributes to mimic (1 set of attributes per class)
        * <b>consistent</b> selection, where a single set of random attributes is selected for all label-swapped instances regardless of class
    * are modelled as:
        * normal distributions $\mathcal{N}(\hat{\mu}, \hat{\sigma})$ for numerical attributes
        * multinomial distributions for categorical attributes
    * have distribution parameters either:
        * <b>clean</b>, estimated <i>a priori</i>, i.e. from the data before the label swaps are applied
        * <b>noisy</b>, estimated <i>a posteriori</i>, i.e. from the data after the label swaps are applied
    



In [93]:
def in_apply(y):
    """Rewrite a dummy-variable name so that it contains neither "<" nor ",".

    Required for XGBoost, which cannot handle dummy-variable names
    containing those characters: every "<" becomes "lt" and every ","
    becomes "c".

    Parameters
    ----------
    y : object
        Value to sanitise, typically a column name (str).

    Returns
    -------
    object
        The sanitised value, or ``y`` itself when it does not support
        string-style ``replace`` (e.g. numbers, ``None``, bytes).
    """
    try:
        return y.replace("<", "lt").replace(",", "c")
    except (AttributeError, TypeError):
        # AttributeError: y has no .replace (numbers, None, ...).
        # TypeError: y has a .replace that does not accept str arguments.
        # Anything else (notably KeyboardInterrupt / SystemExit) propagates
        # instead of being silently swallowed.
        return y
    
def parallel_symmetric_setup(seed, did_folder, X, y, df, attribute_names, categorical_indicator, y_pos, y_neg, X_clean_pos, X_clean_neg):
    seed_folder = did_folder + "seed=" + str(seed) + "/"
    try:
        os.mkdir(seed_folder)
    except:
        shutil.rmtree(seed_folder)
        os.mkdir(seed_folder)
    np.random.seed(seed)
    for roh_y_prct in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
        roh_y_folder = seed_folder + "roh_y=" + str(roh_y_prct) + "/"
        try:
            os.mkdir(roh_y_folder)
        except:
            shutil.rmtree(roh_y_folder)
            os.mkdir(roh_y_folder)
        roh_y = roh_y_prct/100
        # number of instances to label swap
        roh_y_pos_n = int(np.ceil(roh_y*X_clean_pos.shape[0]))
        roh_y_neg_n = int(np.ceil(roh_y*X_clean_neg.shape[0]))
        for roh_x_prct in [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]:
            roh_x_folder = roh_y_folder + "roh_x=" + str(roh_x_prct) + "/"
            try:
                os.mkdir(roh_x_folder)
            except:
                shutil.rmtree(roh_x_folder)
                os.mkdir(roh_x_folder)
            roh_x = roh_x_prct/100
            # number of attributes to swap
            roh_x_n = int(np.ceil(roh_x*X.shape[1]))
            # label-swap instance locations (index)
            roh_y_pos_loc = np.random.choice(X_clean_pos.index, roh_y_pos_n, replace=False) # will be present in X_noisy_neg
            roh_y_neg_loc = np.random.choice(X_clean_neg.index, roh_y_neg_n, replace=False) # will be present in X_noisy_pos    
            if roh_x_n == 0:
                # no attribute replacement is required, only label swap
                corrupted_df = copy(df)
                corrupted_df.loc[roh_y_pos_loc, "class"] = y_neg
                corrupted_df.loc[roh_y_neg_loc, "class"] = y_pos
                # saving corrupted df
                joblib.dump(corrupted_df, roh_x_folder + "corrupted_df.pkl")
                corrupted_df.to_csv(roh_x_folder + "corrupted_df.csv", index=False)
            else:
                X_noisy_pos = pd.concat((
                    copy(X.loc[roh_y_neg_loc]), # getting swap examples from other class
                    copy(X_clean_pos.drop(labels=roh_y_pos_loc)) # dropping examples that are given to other class
                ))
                X_noisy_neg = pd.concat((
                    copy(X.loc[roh_y_pos_loc]), # getting swap examples from other class
                    copy(X_clean_neg.drop(labels=roh_y_neg_loc)) # dropping examples that are given to other class
                ))
                # random selection
                # each label-swapped instance has its own set of random attributes to replace
                random_folder = roh_x_folder + "random/"
                try:
                    os.mkdir(random_folder)
                except:
                    shutil.rmtree(random_folder)
                    os.mkdir(random_folder)
                random_clean_X = copy(X)
                random_noisy_X = copy(X)
                for roh_y_pos_index in roh_y_pos_loc:
                    replace_attribute_indexs = np.random.choice(range(X.shape[1]), roh_x_n, replace=False)
                    replace_attribute_names = attribute_names[replace_attribute_indexs]
                    replace_categorical_indicators = categorical_indicator[replace_attribute_indexs]
                    # for each attribute being replaced
                    clean_replacement_values = []
                    noisy_replacement_values = []
                    for i in range(roh_x_n):
                        replace_attribute_name = replace_attribute_names[i]
                        replace_categorical_indicator = replace_categorical_indicators[i]
                        if replace_categorical_indicator:
                            # is categorical
                            clean_replacement_value = np.random.choice(X_clean_neg[replace_attribute_name], 1)
                            noisy_replacement_value = np.random.choice(X_noisy_neg[replace_attribute_name], 1)
                        else:
                            # is numerical
                            clean_mu = np.mean(X_clean_neg[replace_attribute_name])
                            clean_sig = np.std(X_clean_neg[replace_attribute_name], ddof=1)
                            clean_replacement_value = np.random.normal(clean_mu, clean_sig, 1)
                            noisy_mu = np.mean(X_noisy_neg[replace_attribute_name])
                            noisy_sig = np.std(X_noisy_neg[replace_attribute_name], ddof=1)
                            noisy_replacement_value = np.random.normal(noisy_mu, noisy_sig, 1)
                        clean_replacement_values += [clean_replacement_value[0]]
                        noisy_replacement_values += [noisy_replacement_value[0]]
                    random_clean_X.loc[roh_y_pos_index, replace_attribute_names] = clean_replacement_values
                    random_noisy_X.loc[roh_y_pos_index, replace_attribute_names] = noisy_replacement_values
                for roh_y_neg_index in roh_y_neg_loc:
                    replace_attribute_indexs = np.random.choice(range(X.shape[1]), roh_x_n, replace=False)
                    replace_attribute_names = attribute_names[replace_attribute_indexs]
                    replace_categorical_indicators = categorical_indicator[replace_attribute_indexs]
                    # for each attribute being replaced
                    clean_replacement_values = []
                    noisy_replacement_values = []
                    for i in range(roh_x_n):
                        replace_attribute_name = replace_attribute_names[i]
                        replace_categorical_indicator = replace_categorical_indicators[i]
                        if replace_categorical_indicator:
                            # is categorical
                            clean_replacement_value = np.random.choice(X_clean_pos[replace_attribute_name], 1)
                            noisy_replacement_value = np.random.choice(X_noisy_pos[replace_attribute_name], 1)
                        else:
                            # is numerical
                            clean_mu = np.mean(X_clean_pos[replace_attribute_name])
                            clean_sig = np.std(X_clean_pos[replace_attribute_name], ddof=1)
                            clean_replacement_value = np.random.normal(clean_mu, clean_sig, 1)
                            noisy_mu = np.mean(X_noisy_pos[replace_attribute_name])
                            noisy_sig = np.std(X_noisy_pos[replace_attribute_name], ddof=1)
                            noisy_replacement_value = np.random.normal(noisy_mu, noisy_sig, 1)
                        clean_replacement_values += [clean_replacement_value[0]]
                        noisy_replacement_values += [noisy_replacement_value[0]]
                    random_clean_X.loc[roh_y_neg_index, replace_attribute_names] = clean_replacement_values
                    random_noisy_X.loc[roh_y_neg_index, replace_attribute_names] = noisy_replacement_values
                corrupted_clean_df = pd.concat((copy(random_clean_X), copy(y)), axis=1)
                corrupted_clean_df.loc[roh_y_pos_loc, "class"] = y_neg
                corrupted_clean_df.loc[roh_y_neg_loc, "class"] = y_pos
                corrupted_noisy_df = pd.concat((copy(random_noisy_X), copy(y)), axis=1)
                corrupted_noisy_df.loc[roh_y_pos_loc, "class"] = y_neg
                corrupted_noisy_df.loc[roh_y_neg_loc, "class"] = y_pos
                joblib.dump(corrupted_clean_df, random_folder + "corrupted_clean_df.pkl")
                joblib.dump(corrupted_noisy_df, random_folder + "corrupted_noisy_df.pkl")
                corrupted_clean_df.to_csv(random_folder + "corrupted_clean_df.csv", index=False)
                corrupted_noisy_df.to_csv(random_folder + "corrupted_noisy_df.csv", index=False)

                # category selection
                # each class has a set of random attributes to mimic (1 set of attributes per class)
                category_folder = roh_x_folder + "category/"
                try:
                    os.mkdir(category_folder)
                except:
                    shutil.rmtree(category_folder)
                    os.mkdir(category_folder)
                category_clean_X = copy(X)
                category_noisy_X = copy(X)
                replace_attribute_indexs = np.random.choice(range(X.shape[1]), roh_x_n, replace=False)
                replace_attribute_names = attribute_names[replace_attribute_indexs]
                replace_categorical_indicators = categorical_indicator[replace_attribute_indexs]
                for roh_y_pos_index in roh_y_pos_loc:
                    clean_replacement_values = []
                    noisy_replacement_values = []
                    for i in range(roh_x_n):
                        replace_attribute_name = replace_attribute_names[i]
                        replace_categorical_indicator = replace_categorical_indicators[i]
                        if replace_categorical_indicator:
                            # is categorical
                            clean_replacement_value = np.random.choice(X_clean_neg[replace_attribute_name], 1)
                            noisy_replacement_value = np.random.choice(X_noisy_neg[replace_attribute_name], 1)
                        else:
                            # is numerical
                            clean_mu = np.mean(X_clean_neg[replace_attribute_name])
                            clean_sig = np.std(X_clean_neg[replace_attribute_name], ddof=1)
                            clean_replacement_value = np.random.normal(clean_mu, clean_sig, 1)
                            noisy_mu = np.mean(X_noisy_neg[replace_attribute_name])
                            noisy_sig = np.std(X_noisy_neg[replace_attribute_name], ddof=1)
                            noisy_replacement_value = np.random.normal(noisy_mu, noisy_sig, 1)
                        clean_replacement_values += [clean_replacement_value[0]]
                        noisy_replacement_values += [noisy_replacement_value[0]]
                    category_clean_X.loc[roh_y_pos_index, replace_attribute_names] = clean_replacement_values
                    category_noisy_X.loc[roh_y_pos_index, replace_attribute_names] = noisy_replacement_values                    
                replace_attribute_indexs = np.random.choice(range(X.shape[1]), roh_x_n, replace=False)
                replace_attribute_names = attribute_names[replace_attribute_indexs]
                replace_categorical_indicators = categorical_indicator[replace_attribute_indexs]
                for roh_y_neg_index in roh_y_neg_loc:
                    clean_replacement_values = []
                    noisy_replacement_values = []
                    for i in range(roh_x_n):
                        replace_attribute_name = replace_attribute_names[i]
                        replace_categorical_indicator = replace_categorical_indicators[i]
                        if replace_categorical_indicator:
                            # is categorical
                            clean_replacement_value = np.random.choice(X_clean_pos[replace_attribute_name], 1)
                            noisy_replacement_value = np.random.choice(X_noisy_pos[replace_attribute_name], 1)
                        else:
                            # is numerical
                            clean_mu = np.mean(X_clean_pos[replace_attribute_name])
                            clean_sig = np.std(X_clean_pos[replace_attribute_name], ddof=1)
                            clean_replacement_value = np.random.normal(clean_mu, clean_sig, 1)
                            noisy_mu = np.mean(X_noisy_pos[replace_attribute_name])
                            noisy_sig = np.std(X_noisy_pos[replace_attribute_name], ddof=1)
                            noisy_replacement_value = np.random.normal(noisy_mu, noisy_sig, 1)
                        clean_replacement_values += [clean_replacement_value[0]]
                        noisy_replacement_values += [noisy_replacement_value[0]]
                    category_clean_X.loc[roh_y_neg_index, replace_attribute_names] = clean_replacement_values
                    category_noisy_X.loc[roh_y_neg_index, replace_attribute_names] = noisy_replacement_values
                corrupted_clean_df = pd.concat((copy(category_clean_X), copy(y)), axis=1)
                corrupted_clean_df.loc[roh_y_pos_loc, "class"] = y_neg
                corrupted_clean_df.loc[roh_y_neg_loc, "class"] = y_pos
                corrupted_noisy_df = pd.concat((copy(category_noisy_X), copy(y)), axis=1)
                corrupted_noisy_df.loc[roh_y_pos_loc, "class"] = y_neg
                corrupted_noisy_df.loc[roh_y_neg_loc, "class"] = y_pos
                joblib.dump(corrupted_clean_df, category_folder + "corrupted_clean_df.pkl")
                joblib.dump(corrupted_noisy_df, category_folder + "corrupted_noisy_df.pkl")
                corrupted_clean_df.to_csv(category_folder + "corrupted_clean_df.csv", index=False)
                corrupted_noisy_df.to_csv(category_folder + "corrupted_noisy_df.csv", index=False)

                # consistent selection
                # same attributes are selected for every label-swapped instance regardless of class
                consistent_folder = roh_x_folder + "consistent/"
                try:
                    os.mkdir(consistent_folder)
                except:
                    shutil.rmtree(consistent_folder)
                    os.mkdir(consistent_folder)
                consistent_clean_X = copy(X)
                consistent_noisy_X = copy(X)
                replace_attribute_indexs = np.random.choice(range(X.shape[1]), roh_x_n, replace=False)
                replace_attribute_names = attribute_names[replace_attribute_indexs]
                replace_categorical_indicators = categorical_indicator[replace_attribute_indexs]
                for roh_y_pos_index in roh_y_pos_loc:
                    clean_replacement_values = []
                    noisy_replacement_values = []
                    for i in range(roh_x_n):
                        replace_attribute_name = replace_attribute_names[i]
                        replace_categorical_indicator = replace_categorical_indicators[i]
                        if replace_categorical_indicator:
                            # is categorical
                            clean_replacement_value = np.random.choice(X_clean_neg[replace_attribute_name], 1)
                            noisy_replacement_value = np.random.choice(X_noisy_neg[replace_attribute_name], 1)
                        else:
                            # is numerical
                            clean_mu = np.mean(X_clean_neg[replace_attribute_name])
                            clean_sig = np.std(X_clean_neg[replace_attribute_name], ddof=1)
                            clean_replacement_value = np.random.normal(clean_mu, clean_sig, 1)
                            noisy_mu = np.mean(X_noisy_neg[replace_attribute_name])
                            noisy_sig = np.std(X_noisy_neg[replace_attribute_name], ddof=1)
                            noisy_replacement_value = np.random.normal(noisy_mu, noisy_sig, 1)
                        clean_replacement_values += [clean_replacement_value[0]]
                        noisy_replacement_values += [noisy_replacement_value[0]]
                    consistent_clean_X.loc[roh_y_pos_index, replace_attribute_names] = clean_replacement_values
                    consistent_noisy_X.loc[roh_y_pos_index, replace_attribute_names] = noisy_replacement_values                    
                for roh_y_neg_index in roh_y_neg_loc:
                    clean_replacement_values = []
                    noisy_replacement_values = []
                    for i in range(roh_x_n):
                        replace_attribute_name = replace_attribute_names[i]
                        replace_categorical_indicator = replace_categorical_indicators[i]
                        if replace_categorical_indicator:
                            # is categorical
                            clean_replacement_value = np.random.choice(X_clean_pos[replace_attribute_name], 1)
                            noisy_replacement_value = np.random.choice(X_noisy_pos[replace_attribute_name], 1)
                        else:
                            # is numerical
                            clean_mu = np.mean(X_clean_pos[replace_attribute_name])
                            clean_sig = np.std(X_clean_pos[replace_attribute_name], ddof=1)
                            clean_replacement_value = np.random.normal(clean_mu, clean_sig, 1)
                            noisy_mu = np.mean(X_noisy_pos[replace_attribute_name])
                            noisy_sig = np.std(X_noisy_pos[replace_attribute_name], ddof=1)
                            noisy_replacement_value = np.random.normal(noisy_mu, noisy_sig, 1)
                        clean_replacement_values += [clean_replacement_value[0]]
                        noisy_replacement_values += [noisy_replacement_value[0]]
                    consistent_clean_X.loc[roh_y_neg_index, replace_attribute_names] = clean_replacement_values
                    consistent_noisy_X.loc[roh_y_neg_index, replace_attribute_names] = noisy_replacement_values
                corrupted_clean_df = pd.concat((copy(consistent_clean_X), copy(y)), axis=1)
                corrupted_clean_df.loc[roh_y_pos_loc, "class"] = y_neg
                corrupted_clean_df.loc[roh_y_neg_loc, "class"] = y_pos
                corrupted_noisy_df = pd.concat((copy(consistent_noisy_X), copy(y)), axis=1)
                corrupted_noisy_df.loc[roh_y_pos_loc, "class"] = y_neg
                corrupted_noisy_df.loc[roh_y_neg_loc, "class"] = y_pos
                joblib.dump(corrupted_clean_df, consistent_folder + "corrupted_clean_df.pkl")
                joblib.dump(corrupted_noisy_df, consistent_folder + "corrupted_noisy_df.pkl")
                corrupted_clean_df.to_csv(consistent_folder + "corrupted_clean_df.csv", index=False)
                corrupted_noisy_df.to_csv(consistent_folder + "corrupted_noisy_df.csv", index=False)

In [104]:
# SYMMETRIC
# SETUP
# For every selected OpenML dataset this cell (re)creates a per-dataset folder,
# stores the clean data frame there (pickle + csv), and then runs
# `parallel_symmetric_setup` (defined elsewhere in this notebook) once per seed.
# NOTE: the whole output tree is wiped and rebuilt on every run of this cell.


def _fresh_dir(path):
    """Create `path` as an empty directory, deleting any previous content.

    Missing parent directories are created as well. Unlike a bare
    ``try: os.mkdir(...) except: shutil.rmtree(...)`` this does not swallow
    unrelated errors (permissions, KeyboardInterrupt, missing parents).
    """
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)


data_folder = "/data/pereirabarataap/journal/symmetric/dids/"
_fresh_dir(data_folder)

# one worker task (and one worker process, see n_jobs below) per seed
seeds = range(50)
for did in tqdm(dids):
    did_folder = data_folder + "did=" + str(did) + "/"
    _fresh_dir(did_folder)

    dataset = openml.datasets.get_dataset(did)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format='dataframe',
        target=dataset.default_target_attribute
    )
    categorical_indicator = np.array(categorical_indicator)

    # `in_apply` is defined elsewhere in this notebook; it is applied to every
    # single cell value of X. The lambda arguments are named so that they do
    # not shadow the label series `y`.
    X = X.apply(lambda column: column.apply(lambda value: in_apply(value)))

    # Original attribute names are replaced by their positional index (as str).
    attribute_names = np.array(range(X.shape[1])).astype(str)
    X.columns = attribute_names

    # pd.concat builds a new frame, the target column is then renamed "class".
    df = pd.concat((X, y), axis=1)
    df.columns = attribute_names.tolist() + ["class"]
    y = df["class"].copy()

    joblib.dump(df, did_folder+"df.pkl")
    df.to_csv(did_folder+"df.csv", index=False)

    # The label that appears first in `y` is treated as the positive class.
    class_labels = y.unique()
    if len(class_labels) != 2:
        raise ValueError(
            "did=" + str(did) + ": expected exactly 2 classes, found "
            + str(len(class_labels))
        )
    y_pos, y_neg = class_labels
    X_clean_pos = X.loc[y==y_pos].copy() # clean positive attributes
    X_clean_neg = X.loc[y==y_neg].copy() # clean negative attributes

    # The loky backend serialises every argument to the worker processes, so
    # deep-copying them in the parent beforehand is unnecessary.
    Parallel(n_jobs=len(seeds), backend="loky")(delayed(parallel_symmetric_setup)(
        seed=seed,
        did_folder=did_folder,
        X=X,
        y=y,
        df=df,
        attribute_names=attribute_names,
        categorical_indicator=categorical_indicator,
        y_pos=y_pos,
        y_neg=y_neg,
        X_clean_pos=X_clean_pos,
        X_clean_neg=X_clean_neg,
    ) for seed in seeds)
    

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.
DEBUG:openml.datasets.dataset:Data pickle file already exists and is up to date.



