# Generate all simulation data

# In [1]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd


# Seed NumPy's global random state once so that every np.random call made
# further down this script yields the same simulated data on each run.
np.random.seed(42)

### Classification 1

# In [2]:
def sigmoid(x):
    """Return ``exp(2x) / (exp(2x) + exp(-2x))`` element-wise.

    The expression is algebraically equal to ``0.5 * (1 + tanh(2x))``; the
    tanh form is used because it cannot overflow, whereas evaluating the
    exponentials directly yields ``inf / inf = nan`` for large positive
    inputs.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s). The argument is never modified (the previous
        implementation doubled a caller's ndarray in place via ``x *= 2``).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in ``[0, 1]`` with the same shape as ``x``.
    """
    # Build a scaled copy instead of scaling the caller's array in place.
    z = 2 * np.asarray(x, dtype=float)
    return 0.5 * (1.0 + np.tanh(z))

def permute(X, noisy_features):
    """Return a copy of ``X`` in which every flagged column is shuffled.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array; it is copied, never modified.
    noisy_features : sequence of int
        One flag per column of ``X``. Columns whose flag equals 1 are
        independently shuffled with ``np.random.permutation``; all other
        columns are left as they are.

    Returns
    -------
    numpy.ndarray
        The partially shuffled copy of ``X``.
    """
    shuffled = X.copy()
    # Visit flagged columns in ascending order so the global RNG is consumed
    # in a fixed, reproducible sequence.
    flagged = [col for col in range(X.shape[1]) if noisy_features[col] == 1]
    for col in flagged:
        shuffled[:, col] = np.random.permutation(shuffled[:, col])
    return shuffled

def f(X, noisy_features):
    """Sample one binary label per row of ``X``.

    The probability of label 1 for a row is ``sigmoid`` applied to the mean
    of that row over the columns whose ``noisy_features`` flag is 0.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix.
    noisy_features : numpy.ndarray
        One flag per column of ``X``; columns flagged 0 drive the label.

    Returns
    -------
    numpy.ndarray
        1-D array of sampled 0/1 labels, one per row.
    """
    signal_columns = X[:, noisy_features == 0]
    probs = sigmoid(np.mean(signal_columns, 1))
    draws = []
    # One np.random.choice call per row, in row order, so the RNG stream is
    # consumed in a fixed sequence.
    for prob in probs:
        draws.append(np.random.choice([0, 1], 1, p=[1 - prob, prob]))
    return np.array(draws).flatten()

# The enhancer design matrices are identical for every replicate, so parse the
# CSV files once rather than on each of the 40 iterations. The original y
# files are not read: the labels are re-simulated from X by f() below.
raw_X_train = np.array(pd.read_csv('02_enhancer/X_train.csv', header=0, index_col=0))
raw_X_test = np.array(pd.read_csv('02_enhancer/X_test.csv', header=0, index_col=0))
n, m = raw_X_train.shape
names = np.arange(m)  # kept as a notebook-level name; not used in this cell
n_features = m

for ind in range(40):
    # Flag every feature as noisy (1) except 5 randomly chosen signal
    # features (0).
    noisy_features = np.ones((n_features, ), dtype=int)
    noisy_features[np.random.choice(range(n_features), 5, replace=False)] = 0

    # Scale with statistics fitted on the training matrix only; the scaler
    # returns new arrays, so the raw matrices stay intact for the next
    # replicate.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(raw_X_train)
    X_test = scaler.transform(raw_X_test)

    # Shuffle the noisy columns, then simulate labels from the signal columns.
    X_train = permute(X_train, noisy_features)
    X_test = permute(X_test, noisy_features)
    y_train = f(X_train, noisy_features)
    y_test = f(X_test, noisy_features)

    np.savetxt('04_aggregate/classification1/permuted{}_X_train.csv'.format(ind), X_train, delimiter=',')
    np.savetxt('04_aggregate/classification1/permuted{}_y_train.csv'.format(ind), y_train, delimiter=',')
    np.savetxt('04_aggregate/classification1/permuted{}_X_test.csv'.format(ind), X_test, delimiter=',')
    np.savetxt('04_aggregate/classification1/permuted{}_y_test.csv'.format(ind), y_test, delimiter=',')
    np.savetxt('04_aggregate/classification1/permuted{}_noisy_features.csv'.format(ind), noisy_features, delimiter=',')

### Regression 1

# In [3]:
def f(X, noisy_features):
    """Simulate a noisy continuous response from the signal columns of ``X``.

    The noiseless response of a row is its mean over the columns whose
    ``noisy_features`` flag is 0. Gaussian noise with mean 0 and scale
    ``np.std(response) ** 2 * 100`` is then added.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix.
    noisy_features : numpy.ndarray
        One flag per column of ``X``; columns flagged 0 drive the response.

    Returns
    -------
    numpy.ndarray
        1-D array with one simulated response per row.
    """
    signal = np.mean(X[:, noisy_features == 0], axis=1)
    noise_scale = np.std(signal) ** 2 * 100
    noise = np.random.normal(0, noise_scale, signal.shape)
    return signal + noise

# The enhancer design matrices are identical for every replicate, so parse the
# CSV files once rather than on each of the 40 iterations. The original y
# files are not read: the responses are re-simulated from X by f() below.
raw_X_train = np.array(pd.read_csv('02_enhancer/X_train.csv', header=0, index_col=0))
raw_X_test = np.array(pd.read_csv('02_enhancer/X_test.csv', header=0, index_col=0))
n, m = raw_X_train.shape
names = np.arange(m)  # kept as a notebook-level name; not used in this cell
n_features = m

for ind in range(40):
    scaler = MinMaxScaler()

    # Flag every feature as noisy (1) except 5 randomly chosen signal
    # features (0).
    noisy_features = np.ones((n_features, ), dtype=int)
    noisy_features[np.random.choice(range(n_features), 5, replace=False)] = 0

    # Scale with statistics fitted on the training matrix only; the scaler
    # returns new arrays, so the raw matrices stay intact for the next
    # replicate.
    X_train = scaler.fit_transform(raw_X_train)
    X_test = scaler.transform(raw_X_test)

    # Shuffle the noisy columns, then simulate responses from the signal
    # columns.
    X_train = permute(X_train, noisy_features)
    X_test = permute(X_test, noisy_features)
    y_train = f(X_train, noisy_features)
    y_test = f(X_test, noisy_features)

    np.savetxt('04_aggregate/regression1/permuted{}_X_train.csv'.format(ind), X_train, delimiter=',')
    np.savetxt('04_aggregate/regression1/permuted{}_y_train.csv'.format(ind), y_train, delimiter=',')
    np.savetxt('04_aggregate/regression1/permuted{}_X_test.csv'.format(ind), X_test, delimiter=',')
    np.savetxt('04_aggregate/regression1/permuted{}_y_test.csv'.format(ind), y_test, delimiter=',')
    np.savetxt('04_aggregate/regression1/permuted{}_noisy_features.csv'.format(ind), noisy_features, delimiter=',')

### Classification 2

# In [4]:
def sigmoid(x):
    """Return ``exp(x) / (exp(x) + exp(-x))`` element-wise.

    The expression is algebraically equal to ``0.5 * (1 + tanh(x))``; the
    tanh form is used because it cannot overflow, whereas evaluating the
    exponentials directly yields ``inf / inf = nan`` for large positive
    inputs.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s). The argument is never modified.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in ``[0, 1]`` with the same shape as ``x``.
    """
    z = np.asarray(x, dtype=float)
    return 0.5 * (1.0 + np.tanh(z))

def f(X, noisy_features):
    """Sample one binary label per row of ``X``.

    For each row, the mean over the columns flagged 0 in ``noisy_features``
    is passed through ``sigmoid`` to get the probability of label 1, and a
    single 0/1 draw is made with that probability.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix.
    noisy_features : numpy.ndarray
        One flag per column of ``X``; columns flagged 0 drive the label.

    Returns
    -------
    numpy.ndarray
        1-D array of sampled 0/1 labels, one per row.
    """
    row_means = np.mean(X[:, noisy_features == 0], 1)
    probs = sigmoid(row_means)
    labels = []
    # Draw row by row so the global RNG is consumed in a fixed order.
    for prob in probs:
        labels.append(np.random.choice([0, 1], 1, p=[1 - prob, prob]))
    return np.array(labels).flatten()

n_samples = 1000
n_features = 50

for ind in range(40):
    # Synthetic design matrices: column i is sampled uniformly from the
    # integers 0 .. i+1, train column first, then test column.
    X_train = np.empty((n_samples, n_features))
    X_test = np.empty((n_samples, n_features))
    for i in range(n_features):
        levels = np.arange(i + 2)
        X_train[:, i] = np.random.choice(levels, n_samples)
        X_test[:, i] = np.random.choice(levels, n_samples)

    # Every feature is flagged noisy (1) except 5 signal features (0), which
    # are drawn without replacement from the first n_features // 5 columns.
    noisy_features = np.ones(n_features, dtype=int)
    signal_idx = np.random.choice(np.arange(n_features // 5), 5, replace=False)
    noisy_features[signal_idx] = 0

    # Min-max scale using statistics fitted on the training matrix only.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Simulate labels from the signal columns.
    y_train = f(X_train, noisy_features)
    y_test = f(X_test, noisy_features)

    # (path template, array, number format) for each file of this replicate.
    outputs = [
        ('04_aggregate/classification2/permuted{}_X_train.csv', X_train, '%10.5f'),
        ('04_aggregate/classification2/permuted{}_y_train.csv', y_train, '%d'),
        ('04_aggregate/classification2/permuted{}_X_test.csv', X_test, '%10.5f'),
        ('04_aggregate/classification2/permuted{}_y_test.csv', y_test, '%d'),
        ('04_aggregate/classification2/permuted{}_noisy_features.csv', noisy_features, '%d'),
    ]
    for template, data, number_format in outputs:
        np.savetxt(template.format(ind), data, delimiter=',', fmt=number_format)

### Regression 2

# In [5]:
def f(X, noisy_features):
    """Return a noisy continuous response computed from ``X``.

    Each row's response starts as the mean of its entries in the columns
    flagged 0 in ``noisy_features``; zero-mean Gaussian noise with scale
    ``np.std(response) ** 2 * 100`` is added on top.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix.
    noisy_features : numpy.ndarray
        One flag per column of ``X``; columns flagged 0 drive the response.

    Returns
    -------
    numpy.ndarray
        1-D array with one simulated response per row.
    """
    keep = noisy_features == 0
    clean = np.mean(X[:, keep], 1)
    spread = np.std(clean) ** 2 * 100
    return clean + np.random.normal(0, spread, clean.shape)

n_samples = 1000
n_features = 50

for ind in range(40):
    # Synthetic design matrices: column i is sampled uniformly from the
    # integers 0 .. i+1, train column first, then test column.
    X_train = np.empty((n_samples, n_features))
    X_test = np.empty((n_samples, n_features))
    for i in range(n_features):
        levels = np.arange(i + 2)
        X_train[:, i] = np.random.choice(levels, n_samples)
        X_test[:, i] = np.random.choice(levels, n_samples)

    scaler = MinMaxScaler()

    # Every feature is flagged noisy (1) except 5 signal features (0), which
    # are drawn without replacement from the first n_features // 5 columns.
    noisy_features = np.ones(n_features, dtype=int)
    signal_idx = np.random.choice(np.arange(n_features // 5), 5, replace=False)
    noisy_features[signal_idx] = 0

    # Min-max scale using statistics fitted on the training matrix only.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Shuffle the noisy columns, then simulate responses from the signal
    # columns.
    X_train = permute(X_train, noisy_features)
    X_test = permute(X_test, noisy_features)
    y_train = f(X_train, noisy_features)
    y_test = f(X_test, noisy_features)

    # (path template, array, number format) for each file of this replicate.
    outputs = [
        ('04_aggregate/regression2/permuted{}_X_train.csv', X_train, '%10.5f'),
        ('04_aggregate/regression2/permuted{}_y_train.csv', y_train, '%10.5f'),
        ('04_aggregate/regression2/permuted{}_X_test.csv', X_test, '%10.5f'),
        ('04_aggregate/regression2/permuted{}_y_test.csv', y_test, '%10.5f'),
        ('04_aggregate/regression2/permuted{}_noisy_features.csv', noisy_features, '%d'),
    ]
    for template, data, number_format in outputs:
        np.savetxt(template.format(ind), data, delimiter=',', fmt=number_format)