In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

warnings.filterwarnings("ignore")

### Datasets

#### Real (SAheart)

In [5]:
sa_data = pd.read_csv("data/SAheart.data")
sa_data = sa_data.drop("row.names", axis=1)
sa_data["famhist"] = sa_data["famhist"].map({"Present": 1, "Absent": 0})
y_real = sa_data["chd"]
x_real = sa_data.drop("chd", axis=1)

#### Artificial

In [75]:
def generate_data(n, b, k):
    # number of features
    k = k + 5
    beta = np.zeros(k)
    beta[:5] = b
    x = np.random.normal(size=(n, k))
    probability = 1 / (1 + np.exp(-np.dot(beta, x.T)))
    y = np.random.binomial(1, probability)
    return x, y

In [79]:
generate_data(n=5, b=2, k=1)

(array([[ 0.94128458, -0.65064257, -0.48712538, -0.59239392, -0.86399077,
          0.04852163],
        [-0.83095012,  0.27045683, -0.05023811, -0.23894805, -0.90756366,
         -0.57677133],
        [ 0.75539123,  0.50091719, -0.97755524,  0.09933231,  0.75138712,
         -1.66940528],
        [ 0.54336019, -0.66262376,  0.57059867, -0.76325916, -1.8048821 ,
         -1.62754244],
        [ 0.04808495,  0.2597225 , -0.90431663,  0.63859246, -1.66152006,
         -0.0660798 ]]),
 array([0, 0, 1, 0, 0]))

In [81]:
artificial_x, artificial_y = generate_data(n=1000, b=1, k=20)

### Testing different estimations schemas 

#### Real

##### Refitting

In [134]:
# logistic regression
logistic = LogisticRegression()
logistic.fit(x_real, y_real)
print(
    "Logistic regression accuracy: ",
    accuracy_score(y_real, logistic.predict(x_real)),
)

# decision tree
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(x_real, y_real)
print("Decision tree accuracy: ", accuracy_score(y_real, tree.predict(x_real)))

Logistic regression accuracy:  0.7294372294372294
Decision tree accuracy:  0.7705627705627706


##### Train test split

In [135]:
# splot data
x_train, x_test, y_train, y_test = train_test_split(x_real, y_real, test_size=0.2)


# logistic regression
logistic = LogisticRegression()
logistic.fit(x_train, y_train)
print(
    "Logistic regression accuracy: ",
    accuracy_score(y_test, logistic.predict(x_test)),
)

# decision tree
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(x_train, y_train)
print("Decision tree accuracy: ", accuracy_score(y_test, tree.predict(x_test)))

Logistic regression accuracy:  0.7419354838709677
Decision tree accuracy:  0.7204301075268817


##### 10-fold CV

In [136]:
# cross validation
logistic = LogisticRegression()
tree = DecisionTreeClassifier(max_depth=3)
print(
    "Logistic regression cross validation: ",
    cross_val_score(logistic, x_real, y_real, cv=10).mean(),
)
print(
    "Decision tree cross validation: ",
    cross_val_score(tree, x_real, y_real, cv=10).mean(),
)

Logistic regression cross validation:  0.7100370027752081
Decision tree cross validation:  0.6925994449583719


##### Bootstrap method

In [137]:
def train_test_split_bootstrap_OOB(X, y):
    sample = X.sample(frac=1, replace=True)
    mask = X.index.difference(sample.index)
    return X.loc[sample.index], y.loc[sample.index], X.loc[mask], y.loc[mask]

In [138]:
x, y = generate_data(n=5, b=2, k=1)
x, y = pd.DataFrame(x), pd.Series(y)
train_test_split_bootstrap_OOB(x, y)

(          0         1         2         3         4         5
 2 -0.331385 -0.921409 -1.345431 -1.635730  0.448238 -0.045992
 3 -0.259416  1.430976 -0.651102 -0.498177  1.390714 -0.340949
 3 -0.259416  1.430976 -0.651102 -0.498177  1.390714 -0.340949
 1  0.190660  0.526651 -0.170145 -0.765295  0.569046  0.142201
 3 -0.259416  1.430976 -0.651102 -0.498177  1.390714 -0.340949,
 2    0
 3    0
 3    0
 1    0
 3    0
 dtype: int64,
           0         1         2         3         4         5
 0  1.047685 -0.218946 -1.307005 -0.246818 -0.509992  1.464295
 4  0.750567  0.083917 -0.246543  1.099561 -0.046713  1.741873,
 0    0
 4    1
 dtype: int64)

In [139]:
def bootstrap_OOB_accuracy(X, y, model):
    accuracies = []
    for _ in range(200):
        x_train, y_train, x_test, y_test = train_test_split_bootstrap_OOB(X, y)
        model.fit(x_train, y_train)
        accuracies.append(accuracy_score(y_test, model.predict(x_test)))
    return np.mean(accuracies)

In [140]:
# bootstrap OOB
logistic = LogisticRegression()
tree = DecisionTreeClassifier(max_depth=3)

print(
    "Logistic regression bootstrap OOB: ",
    bootstrap_OOB_accuracy(x_real, y_real, logistic),
)
print(
    "Decision tree bootstrap OOB: ",
    bootstrap_OOB_accuracy(x_real, y_real, tree),
)

Logistic regression bootstrap OOB:  0.7066927724830748
Decision tree bootstrap OOB:  0.675604665859535


##### Bootstrap 0.632

In [141]:
def bootstrap_632_accuracy(X, y, model):
    accuracies = []
    for _ in range(200):
        x_train, y_train, x_test, y_test = train_test_split_bootstrap_OOB(X, y)
        model.fit(x_train, y_train)
        accuracies.append(
            0.632 * accuracy_score(y_test, model.predict(x_test))
            + 0.368 * accuracy_score(y_train, model.predict(x_train))
        )
    return np.mean(accuracies)

In [124]:
# bootstrap .632
logistic = LogisticRegression()
tree = DecisionTreeClassifier(max_depth=3)

print(
    "Logistic regression bootstrap .632: ",
    bootstrap_632_accuracy(x_real, y_real, logistic),
)
print(
    "Decision tree bootstrap .632: ",
    bootstrap_632_accuracy(x_real, y_real, tree),
)

Logistic regression bootstrap .632:  0.7198314769755971
Decision tree bootstrap .632:  0.7111065163826615


#### Artificial

##### Refitting

In [130]:
# logistic regression
logistic = LogisticRegression()
logistic.fit(artificial_x, artificial_y)
print(
    "Logistic regression accuracy: ",
    accuracy_score(artificial_y, logistic.predict(artificial_x)),
)

# decision tree
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(artificial_x, artificial_y)
print(
    "Decision tree accuracy: ", accuracy_score(artificial_y, tree.predict(artificial_x))
)

Logistic regression accuracy:  0.785
Decision tree accuracy:  0.706


##### Train test split

In [133]:
# splot data
x_train, x_test, y_train, y_test = train_test_split(
    artificial_x, artificial_y, test_size=0.2
)


# logistic regression
logistic = LogisticRegression()
logistic.fit(x_train, y_train)
print(
    "Logistic regression accuracy: ",
    accuracy_score(y_test, logistic.predict(x_test)),
)

# decision tree
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(x_train, y_train)
print("Decision tree accuracy: ", accuracy_score(y_test, tree.predict(x_test)))

Logistic regression accuracy:  0.73
Decision tree accuracy:  0.63


##### 10-fold CV

In [148]:
# cross validation
logistic = LogisticRegression()
tree = DecisionTreeClassifier(max_depth=3)
print(
    "Logistic regression cross validation: ",
    cross_val_score(logistic, artificial_x, artificial_y, cv=10).mean(),
)
print(
    "Decision tree cross validation: ",
    cross_val_score(tree, artificial_x, artificial_y, cv=10).mean(),
)

Logistic regression cross validation:  0.7750000000000001
Decision tree cross validation:  0.64


##### Bootstrap method

In [149]:
# bootstrap OOB
artificial_x, artificial_y = pd.DataFrame(artificial_x), pd.Series(artificial_y)
logistic = LogisticRegression()
tree = DecisionTreeClassifier(max_depth=3)

print(
    "Logistic regression bootstrap OOB: ",
    bootstrap_OOB_accuracy(artificial_x, artificial_y, logistic),
)
print(
    "Decision tree bootstrap OOB: ",
    bootstrap_OOB_accuracy(artificial_x, artificial_y, tree),
)

Logistic regression bootstrap OOB:  0.7603352476625588
Decision tree bootstrap OOB:  0.6330801334890027


##### Bootstrap 0.632

In [150]:
# bootstrap .632
logistic = LogisticRegression()
tree = DecisionTreeClassifier(max_depth=3)

print(
    "Logistic regression bootstrap .632: ",
    bootstrap_632_accuracy(x_real, y_real, logistic),
)
print(
    "Decision tree bootstrap .632: ",
    bootstrap_632_accuracy(x_real, y_real, tree),
)

Logistic regression bootstrap .632:  0.7181970996959655
Decision tree bootstrap .632:  0.7091076782867579


### ROC curves

In [None]:
n = [100, 200, 1000]