In [670]:
import numpy as np
import scipy
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score , confusion_matrix

In [671]:
import math

In [672]:

# Read the data from the file
with open("german.data-numeric", "r") as file:
    data_lines = file.readlines()

# Parse each line and append to a list
structured_data = []
for line in data_lines:
    values = line.strip().split()
    structured_data.append([float(val) for val in values])

# Create a NumPy array from the list
data = np.array(structured_data, dtype=np.float64)




In [673]:
def nCp(sigma2 , estimator , X, Y):
    "Negative Cp statistic"
    n, p = X.shape
    Yhat = estimator.predict(X)
    RSS = np.sum((Y - Yhat)**2)
    return -(RSS + 2 * p * sigma2) / n

In [674]:
data.shape

(1000, 25)

In [675]:
count = np.sum(data[:, -1] == 1)
print(count)

700


In [676]:
def delete_rows_with_value(arr, value, num_rows):
    # Assuming arr is a 2D NumPy array
    if len(arr.shape) != 2 or arr.shape[1] < 1:
        raise ValueError("Input must be a 2D array with at least one column.")

    # Find the indices of rows where the final column contains the specified value
    indices_to_delete = np.where(arr[:, -1] == value)[0]

    # Delete the specified number of rows (num_rows) or all rows if there are fewer than num_rows
    rows_to_delete = min(num_rows, len(indices_to_delete))
    new_arr = np.delete(arr, indices_to_delete[:rows_to_delete], axis=0)

    return new_arr

In [677]:
X= data[:,:-1]
y= data[:,-1]

In [678]:
def kernel(x, x_prime, a, l):
    sq_norm = -0.5 * scipy.spatial.distance.cdist(x, x_prime, 'sqeuclidean')
    r=(a**2) * np.exp(sq_norm)
    return r

In [679]:
def calculate_log_loss(y_true, y_prob):
    """
    Calculate Logarithmic Loss (log loss) for binary classification.

    Parameters:
    - y_true: numpy array or list, true binary labels (0 or 1)
    - y_prob: numpy array or list, predicted probabilities for the positive class

    Returns:
    - log_loss_value: float, log loss value
    """
    n = len(y_true)
    
    # Avoid log(0) by clipping predicted probabilities
    epsilon = 1e-15
    y_prob = np.clip(y_prob, epsilon, 1 - epsilon)
    
    # Calculate log loss
    log_loss_value = -np.sum(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob)) / n
    
    return log_loss_value



In [680]:
def backward_stepwise_selection(X_train, X_test, y_train, y_test):
    n, p = X_train.shape
    models = []
    bestModel = None
    minLogLoss = 10000

    for k in range(p-1, -1, -1):
        bestModelp = None
        bestAcc = 0
        for i in range(0, k+1):
            newX_train = np.delete(X_train, i, axis=1)
            newX_test = np.delete(X_test, i, axis=1)
            predicted_probabilities = GP(newX_train, y_train, newX_test, kernel)
            predicted_labels = (predicted_probabilities > 0.5).astype(int)
            acc = accuracy_score(y_test, predicted_labels)
            if acc > bestAcc:
                bestAcc = acc
                bestModelp = predicted_probabilities
        if bestModelp is not None:
            models.append(bestModelp)

    for model in models:
        predicted_labels = (model > 0.3).astype(int)
        logLoss = calculate_log_loss(y_test, predicted_labels)
        if logLoss < minLogLoss:
            minLogLoss = logLoss
            bestModel = model

    return bestModel


In [681]:
# Gaussian process posterior
def GP(X1, y1, X2, kernel_func):
    """
    Calculate the posterior mean and covariance matrix for y2
    based on the corresponding input X2, the observations (y1, X1), 
    and the prior kernel function.
    """
    a=1
    l=0.1
    # Kernel of the observations
    Σ11 = kernel_func(X1, X1,a,l) 
    # Kernel of observations vs to-predict
    Σ12 = kernel_func(X1, X2,a,l)
    # Solve
    #solved = scipy.linalg.solve(Σ11, Σ12, assume_a='pos').T
    solved, residuals, rank, s = np.linalg.lstsq(Σ11, Σ12, rcond=None)
    # Compute posterior mean
    μ2 = solved.T @ y1
    probabilities = 1 / (1 + np.exp(-μ2))
    return probabilities
    # Compute the posterior covariance
    #Σ22 = kernel_func(X2, X2,2,0.5)
    #Σ2 = Σ22 - (solved @ Σ12)
    #return μ2, Σ2  # mean, covariance

In [682]:
X_train, X_test, y_train, y_test = train_test_split(X, (y == 1).astype(int), test_size=0.33, random_state=0)
X_train = X_train.astype(float)
y_train = y_train.astype(float)
X_test = X_test.astype(float)


In [683]:
#μ2, Σ2 =GP(X_train, y_train, X_test, kernel)
predicted_probabilities = backward_stepwise_selection(X_train, X_test, y_train, y_test)

In [684]:
def calculate_recall(y_true, y_pred):
    """
    Calculate recall.
    
    Parameters:
    - y_true: array-like, true labels
    - y_pred: array-like, predicted labels
    
    Returns:
    - recall: float, recall score
    """
    true_positives = np.sum((y_true == 1) & (y_pred == 1))
    false_negatives = np.sum((y_true == 1) & (y_pred == 0))
    
    if (true_positives + false_negatives) == 0:
        return 0.0  # Handle the case where denominator is zero
    
    recall = true_positives / (true_positives + false_negatives)
    return recall


In [685]:
# Threshold the probabilities to get binary predictions
predicted_labels = (predicted_probabilities > 0.5).astype(int)
# Compute accuracy
accuracy = np.mean(y_test == predicted_labels)
print("accurecy",accuracy)
print("recall",calculate_recall(y_test,predicted_labels))
print(confusion_matrix(y_test,predicted_labels))

accurecy 0.7212121212121212
recall 0.9786324786324786
[[  9  87]
 [  5 229]]


In [686]:
predicted_probabilities = GP(X_train, y_train, X_test, kernel)
# Threshold the probabilities to get binary predictions
predicted_labels = (predicted_probabilities > 0.5).astype(int)
# Compute accuracy
accuracy = np.mean(y_test == predicted_labels)
print("accuracy",accuracy)
print("recall",calculate_recall(y_test,predicted_labels))
print(confusion_matrix(y_test,predicted_labels))

accuracy 0.7
recall 0.9017094017094017
[[ 20  76]
 [ 23 211]]


In [687]:
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Define the Gaussian Process kernel (Radial Basis Function kernel here)
kernel = 1.0 * RBF(length_scale=1.0)

# Create the Gaussian Process Classifier
gp_classifier = GaussianProcessClassifier(kernel=kernel, random_state=42)

# Fit the classifier to the training data
gp_classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = gp_classifier.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(confusion_matrix(y_pred,y_test))

Accuracy: 0.7727272727272727
[[ 52  31]
 [ 44 203]]
