In [34]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [35]:
# Read the HELOC table and separate the 'RiskPerformance' column (used as the
# classification target below) from the remaining feature columns.
df = pd.read_csv("data/heloc/heloc_dataset_cleaned.csv")
y = df.loc[:, 'RiskPerformance']
X = df.drop('RiskPerformance', axis=1)

In [36]:
def get_bootstrap_model(X_train, y_train, X_test, random_seed, bootstrap_size=0.5):
    """Fit a random forest on a seeded random subsample and score ``X_test``.

    A subset of the training rows is drawn *without replacement* (the prefix
    of a seeded permutation), a 25-tree ``RandomForestClassifier`` is fitted
    on that subset, and the predicted probabilities for ``X_test`` are
    returned.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels. Both are indexed with an integer index
        array, so they must support that kind of indexing (e.g. NumPy arrays).
    X_test : array-like
        Rows to score with the fitted model.
    random_seed : int
        Seeds both the row subsample and the forest itself, so a given seed
        always reproduces the same scores.
    bootstrap_size : float, default 0.5
        Fraction of the training rows to fit on; must lie in ``(0, 1]``.

    Returns
    -------
    numpy.ndarray
        Column 1 of ``predict_proba(X_test)``, i.e. the predicted probability
        of the second entry of the fitted model's ``classes_``.

    Raises
    ------
    ValueError
        If ``bootstrap_size`` is not in ``(0, 1]``.
    """
    if not 0 < bootstrap_size <= 1:
        raise ValueError(
            f"bootstrap_size must be in (0, 1], got {bootstrap_size!r}"
        )

    # A private RandomState produces the same permutation as seeding the
    # global generator with `random_seed`, but leaves NumPy's process-wide
    # RNG state untouched for the rest of the notebook.
    rng = np.random.RandomState(random_seed)
    n_samples = int(len(X_train) * bootstrap_size)
    idx = rng.permutation(len(X_train))[:n_samples]

    model = RandomForestClassifier(n_estimators=25, random_state=random_seed)
    model.fit(X_train[idx], y_train[idx])
    return model.predict_proba(X_test)[:, 1]

In [40]:
# Hold out 10% of the rows as a test set (the split is pinned by random_state)
# and convert every piece to a NumPy array, since get_bootstrap_model selects
# training rows with an integer index array.
X_train, X_test, y_train, y_test = (
    part.to_numpy()
    for part in train_test_split(X, y, test_size=0.1, random_state=24)
)

# Fit 1000 models, one per seed, collecting for each: its column label
# (p_1 ... p_1000), its test-set scores, and its accuracy after rounding the
# scores to 0/1.
predictions, columns, accuracy = [], [], []
for i in tqdm(range(1000)):
    columns.append(f'p_{i+1}')
    scores = get_bootstrap_model(X_train, y_train, X_test, random_seed=i, bootstrap_size=0.5)
    predictions.append(scores)
    accuracy.append(float(np.sum(np.round(scores) == y_test) / len(y_test)))

100%|███████████████████████████████████████████████████████| 1000/1000 [02:09<00:00,  7.75it/s]


In [41]:
# Mean and variance of the per-model test accuracies, printed on one line.
print(*(stat(accuracy) for stat in (np.mean, np.var)))

0.7245411089866156 6.807422613324364e-05


In [42]:
# One column of test-set scores per model (labelled by `columns`), one row per
# test example, plus the true test labels in column "y".
predictions_df = pd.DataFrame(predictions).transpose()
predictions_df.columns = columns
predictions_df["y"] = y_test

# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise a run on a fresh checkout fails at the very last step.
output_path = Path("predictions/heloc/model_predictions.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
predictions_df.to_csv(output_path, index=False)