In [1]:

#more description: https://github.com/dakshmittal30/Uncertainty-Quantification/blob/main/gen_selection_bias_datasets.ipynb
import numpy as np
from csv import writer
from scipy import stats
import pandas as pd

directory = '/shared/share_mala/yuanzhe/adaptive_sampling/pipeline_datasets/'


def get_feat_importance(model, X_col, top_n=10): # get most important features, input is a randomforest classifier/regressor
    """Return the names of the most important features of a fitted model.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing a ``feature_importances_`` attribute
        (e.g. a random-forest classifier/regressor).
    X_col : sequence of str
        Feature names, in the same order as ``model.feature_importances_``.
    top_n : int, default 10
        How many features to keep.

    Returns
    -------
    list
        Up to ``top_n`` entries of ``X_col``, ordered from most to least important.
    """
    importances = pd.Series(model.feature_importances_, index=X_col)
    # nlargest sorts by importance in descending order; only the labels are returned
    return list(importances.nlargest(top_n).index)


def generate_prop_score(coeff, X_data, k):
    """Turn a linear score ``X_data @ coeff`` into rank-based propensity scores.

    Each row's linear score is replaced by its empirical-CDF value (the fraction
    of rows whose score is <= its own), mapped linearly to a logit
    ``-k + 2*k*ecdf`` and passed through the logistic function. For positive
    ``k`` the logits therefore lie in ``(-k, k]``; a larger ``k`` gives a
    stronger separation between low- and high-score rows.

    Parameters
    ----------
    coeff : array-like
        Coefficients of the linear score, one per column of ``X_data``
        (shape ``(N,)`` or ``(N, 1)``).
    X_data : array-like or DataFrame
        Feature matrix with one row per observation (shape ``(n, N)``).
    k : float
        Half-width of the logit range.

    Returns
    -------
    list of float
        One propensity score in ``[0, 1]`` per row of ``X_data``.
    """
    # Local import: expit is the overflow-safe logistic function 1 / (1 + exp(-x)).
    from scipy.special import expit

    # X_data dim = (n*N), coeff dim = (N*1); flatten so a column-vector coeff also works
    raw_score = np.asarray(X_data @ coeff, dtype=float).ravel()
    if raw_score.size == 0:
        return []

    centered = raw_score - raw_score.mean()  # normalize the prop score

    # Empirical CDF: for each value, the share of values <= it. Sorting once and
    # binary-searching is O(n log n) instead of comparing every pair of rows.
    ecdf = np.searchsorted(np.sort(centered), centered, side='right') / centered.size

    logits = -k + 2 * k * ecdf
    # expit avoids the inf/inf = nan that exp(x)/(1+exp(x)) produces for large logits
    return list(expit(logits))



def generate_selection_bias_random(data, data_name, X_col, seed,k, coeff = [],extra_text=''):
    '''
    Split `data` into a "selected" and a "not selected" CSV using a random propensity score.

    Steps: seed NumPy's global RNG, drop near-constant features, z-score the
    remaining ones, build a propensity score per row with generate_prop_score,
    draw one Bernoulli(prop score) per row and keep the rows whose draw is 1.

    input:
        data        dataframe containing X + Y; rows are selected by position, so any index works
        data_name   data key, used as prefix of the output file names
        X_col       list of columns the propensity score is built from
                    (to include an intercept term in the logistic model, add a constant
                    column to `data` and list it in X_col)
        seed        seed for np.random
        k           strength of the selection, forwarded to generate_prop_score
        coeff       propensity coefficients, one per kept column of X_col;
                    if empty or None they are drawn from Uniform[-1, 1]
        extra_text  free text appended to the output file names

    output: nothing is returned. Two CSV files are written under
    `{directory}/biased_new/` (selected / not selected rows) and one row describing
    the run is appended to `{directory}summaries/summary_selection_bias.csv`.
    '''
    np.random.seed(seed)

    # filter out features with low variability (only keep features that are not too concentrated)
    X_col = [col for col in X_col if np.std(data[col]) >= 0.001]

    # standardizes selected features in the input data
    X_data = stats.zscore(data[X_col])

    N = len(X_col) #dim of X, which is the dim of prop score coeff

    # generate random coefficients if coeff is empty.
    # len() is used instead of `coeff == []`, which is elementwise (and therefore
    # ambiguous or an error) when coeff is a NumPy array.
    if coeff is None or len(coeff) == 0:
        coeff = np.random.uniform(-1,1,N) #random coeff, it is now Uni[-1,1] but we can try fancier version later

    # generating propensity scores for each data
    prop_score = generate_prop_score(coeff, X_data, k)

    # converting propensity scores to binary values: one Bernoulli draw per row, in row order
    # (kept as one call per row so the sequence of RNG calls is unchanged)
    if_sampled = np.array([np.random.binomial(size=1, n=1, p=q)[0] for q in prop_score])

    # rows whose draw is 1 are kept; the mask is positional, so it does not
    # depend on the labels of data.index
    selected_mask = if_sampled.astype(bool)

    # saving 2 csv files: one with selected rows and one with non-selected rows
    df = data[selected_mask]
    df.to_csv(f"{directory}/biased_new/{data_name}_random_prop_score_selected_{str(seed)}_{str(k)}_{extra_text}_.csv", index=False)

    df_not = data[~selected_mask]
    df_not.to_csv(f"{directory}/biased_new/{data_name}_random_prop_score_not_selected_{str(seed)}_{str(k)}_{extra_text}_.csv", index=False)

    #the following records our random prop score coeff
    # newline='' is what the csv module expects for files handed to csv.writer
    with open(directory + 'summaries/summary_selection_bias.csv', 'a', newline='') as f_object:
        writer_object = writer(f_object)
        # 'seed' appears twice; left as-is so appended rows keep the same layout as earlier ones
        row = ['seed',seed,'data_name',data_name,'seed',seed,'X_col',X_col,'random_coeff',coeff,'k',k,'before_shape',data.shape,'after_shape',df.shape]
        writer_object.writerow(row)

In [2]:
# Training set the biased subsets are drawn from.
# data_name is the CSV file name without its extension; it prefixes every output file.
data_name = 'input_dim_1_train_init_data_mean_0.0ln_1.0sig_0.1no.2000'
train_csv = 'input_dim_1_train_init_data_mean_0.0ln_1.0sig_0.1no.2000.csv'
X_col = ['Column0']  # the only feature the propensity score is built from
df = pd.read_csv(f"{directory}{train_csv}")

In [3]:
 # One biased/unbiased CSV pair per selection strength k, all with the same seed.
 for k_val in (0.05, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0):
     generate_selection_bias_random(
         data=df,
         data_name=data_name,
         X_col=X_col,
         seed=2,
         k=k_val,
         coeff=[],  # empty -> coefficients are drawn at random inside the function
         extra_text='',
     )

In [5]:
# Selected (biased) subset written by generate_selection_bias_random for seed=2, k=16.0
df_2 = pd.read_csv(directory + '/biased_new/input_dim_1_train_init_data_mean_0.0ln_1.0sig_0.1no.2000_random_prop_score_selected_2_16.0__.csv')

# For each column print the mean over the full data, then over the selected subset
for column in ('Column0', 'EVENT_LABEL'):
    print(np.mean(df[column]))
    print(np.mean(df_2[column]))


-0.09563386725780001
-0.9236370532151394
-0.0058072998057590005
-0.00718789758756773
