In [27]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [29]:
# Number of independent train/test splits; each value in range(num_random_seeds)
# is used as the random_state of one split.
num_random_seeds = 5
# Number of models fit per split; model i is seeded with i.
# NOTE(review): the progress bars saved in this notebook's output show 10
# iterations per seed, so the stored output was presumably produced with a
# smaller value than the one below — re-run to refresh the output.
num_bootstrap_models = 1000

In [30]:
# Read the cleaned HELOC table, then separate the target column from the
# remaining columns, which serve as model inputs.
df = pd.read_csv("data/heloc/heloc_dataset_cleaned.csv")
y = df['RiskPerformance']
X = df.drop(columns='RiskPerformance')

In [31]:
def get_bootstrap_model(X_train, y_train, X_test, random_seed, bootstrap_size=0.5):
    """Fit a random forest on a random subsample of the training data and score ``X_test``.

    A fraction ``bootstrap_size`` of the training rows is selected by taking
    the head of a seeded permutation (i.e. sampling *without* replacement),
    a 25-tree ``RandomForestClassifier`` is fit on those rows, and column 1
    of ``predict_proba(X_test)`` is returned.

    Parameters
    ----------
    X_train, y_train : indexable by an integer array (e.g. NumPy arrays)
        Training features and labels; rows are selected with ``X_train[idx]``
        and ``y_train[idx]``.
    X_test : array-like
        Rows to score; passed straight to ``predict_proba``.
    random_seed : int
        Seeds both the row subsample and the forest itself.
    bootstrap_size : float, default 0.5
        Fraction of training rows to fit on. ``int(len(X_train) * bootstrap_size)``
        rows are used.

    Returns
    -------
    numpy.ndarray
        Column 1 of the fitted model's ``predict_proba`` output for ``X_test``
        (assumes at least two classes are present in the subsample).

    Raises
    ------
    ValueError
        If the requested fraction selects no training rows.
    """
    n_samples = int(len(X_train) * bootstrap_size)
    if n_samples < 1:
        raise ValueError(
            f"bootstrap_size={bootstrap_size!r} selects no rows from "
            f"{len(X_train)} training samples"
        )

    # A private RandomState yields the same permutation as seeding the global
    # generator with the same value, but leaves the process-wide NumPy RNG
    # state untouched for the rest of the notebook.
    rng = np.random.RandomState(random_seed)
    idx = rng.permutation(len(X_train))[:n_samples]

    model = RandomForestClassifier(n_estimators=25, random_state=random_seed)
    model.fit(X_train[idx], y_train[idx])
    return model.predict_proba(X_test)[:, 1]

In [32]:
# For every outer seed: make a fresh 90/10 train/test split, fit
# num_bootstrap_models subsampled forests on the training part, and store
# each model's test-set scores as one column (p_1, p_2, ...) of a frame.
# The per-seed frames are stacked into a single `output` DataFrame at the end.
output = []
for random_seed in range(num_random_seeds):
    print("random seed", random_seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_seed
    )
    # Capture the original row labels of the held-out rows before the
    # pandas objects are replaced by plain NumPy arrays.
    test_idx = y_test.index.to_numpy()
    X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
    X_test, y_test = X_test.to_numpy(), y_test.to_numpy()

    predictions, columns, accuracy = [], [], []
    for i in tqdm(range(num_bootstrap_models)):
        # The inner index doubles as the model's seed, so model i draws the
        # same subsample positions under every outer split.
        scores = get_bootstrap_model(X_train, y_train, X_test, random_seed=i, bootstrap_size=0.5)
        predictions.append(scores)
        columns.append(f'p_{i+1}')
        # Fraction of test rows whose rounded score equals the label.
        # Collected per model but not written to the output frame.
        hits = np.round(scores) == y_test
        accuracy.append(float(hits.sum() / len(y_test)))

    # One row per model -> transpose to one row per test sample.
    predictions_df = pd.DataFrame(predictions).T
    predictions_df.columns = columns

    predictions_df = predictions_df.assign(y=y_test, idx=test_idx, seed=random_seed)
    output.append(predictions_df)
output = pd.concat(output)

random seed 0


100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  8.04it/s]


random seed 1


100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  8.15it/s]


random seed 2


100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  8.15it/s]


random seed 3


100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  7.96it/s]


random seed 4


100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  8.02it/s]


In [33]:
# Persist the stacked predictions. DataFrame.to_csv does not create missing
# parent directories, so make sure the target folder exists first; otherwise
# the whole (long) bootstrap run above would end in an OSError on a fresh
# checkout.
predictions_path = Path("predictions/heloc/model_predictions.csv")
predictions_path.parent.mkdir(parents=True, exist_ok=True)
output.to_csv(predictions_path, index=False)