In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import somersd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset
dat = pd.read_csv("Final_Data.csv",low_memory=False)

In [3]:
# Define categorical and numerical columns
cat_cols = ['STRUCTURE_KIND_043A', 'STRUCTURE_TYPE_043B', 'DECK_STRUCTURE_TYPE_107', 'LOWEST_RATING']
num_cols = ['ADT_029', 'MAX_SPAN_LEN_MT_048', 'IMP_LEN_MT_076', 'DECK_AREA', 'TIC', 'Installation_Year']

# One-hot encode categorical columns
cat_encoded = pd.get_dummies(dat, columns=cat_cols, dtype=float)

# Initialize the MinMaxScaler for numerical columns
scaler = MinMaxScaler()

# Apply MinMax scaling to the numerical features
cat_encoded[num_cols] = scaler.fit_transform(cat_encoded[num_cols])

test_data = cat_encoded[(cat_encoded['CollectionYear'].isin([2021, 2022]))]

# Filter the training dataset
train_data = cat_encoded[(cat_encoded['CollectionYear'] > 2010) &
                        (cat_encoded['CollectionYear'] <= 2020)]

# Display summary
print("Training Data:")
print(train_data.shape)
print("\nTesting Data (Non-Identical Ratings):")
print(test_data.shape)


Training Data:
(99975, 55)

Testing Data (Non-Identical Ratings):
(20941, 55)


In [4]:
cluster_counts = train_data.groupby('hierarchical_cluster')['STRUCTURE_NUMBER_008'].nunique()

print(cluster_counts)
dat.columns

hierarchical_cluster
1      51
2    1928
3    1285
4    8058
5      25
6       5
7     414
Name: STRUCTURE_NUMBER_008, dtype: int64


Index(['STRUCTURE_NUMBER_008', 'ADT_029', 'STRUCTURE_KIND_043A',
       'STRUCTURE_TYPE_043B', 'MAX_SPAN_LEN_MT_048', 'DECK_STRUCTURE_TYPE_107',
       'IMP_LEN_MT_076', 'DECK_AREA', 'LOWEST_RATING', 'NEXT_LOWEST_RATING',
       'CollectionYear', 'Installation_Year', 'TIC', 'hierarchical_cluster'],
      dtype='object')

In [5]:
# Set the total number of samples you want to draw
total_samples = 10

In [6]:
import pandas as pd

# Assuming 'train_data' and 'total_samples' are predefined
total_bridges_in_train_data = train_data['STRUCTURE_NUMBER_008'].nunique()
cluster_counts = train_data.groupby('hierarchical_cluster')['STRUCTURE_NUMBER_008'].nunique()
proportional_weights = cluster_counts / total_bridges_in_train_data

# Calculate the number of bridges to sample from each cluster based on proportional weights
bridges_per_cluster = (proportional_weights * total_samples).round().astype(int)
print("Initial bridges per cluster:", bridges_per_cluster)

# Sample bridges from each cluster
def sample_bridges(group):
    n = bridges_per_cluster.get(group.name, 0)
    return group.sample(n=n) if n <= len(group) else group.sample(n=len(group))

sampled_bridges = train_data.groupby('hierarchical_cluster').apply(sample_bridges).reset_index(drop=True)

# Extract Structure Number of sampled bridges
sampled_structure_numbers = sampled_bridges['STRUCTURE_NUMBER_008'].unique()

# Create initial_set with records of sampled bridges
initial_set = train_data[train_data['STRUCTURE_NUMBER_008'].isin(sampled_structure_numbers)]

# Create pool_set with the rest of the records
pool_set = train_data[~train_data['STRUCTURE_NUMBER_008'].isin(sampled_structure_numbers)]

# Print shapes of the datasets
print("Shape of initial_set:", initial_set.shape)
print("Shape of pool_set:", pool_set.shape)



Initial bridges per cluster: hierarchical_cluster
1    0
2    2
3    1
4    7
5    0
6    0
7    0
Name: STRUCTURE_NUMBER_008, dtype: int64
Shape of initial_set: (81, 55)
Shape of pool_set: (99894, 55)


In [7]:
n_structures_per_query = 10
n_queries = 10

## data preprocessing

In [8]:
print(type(proportional_weights))


<class 'pandas.core.series.Series'>


In [9]:
pool_set.shape

(99894, 55)

In [10]:
initial_set = initial_set.drop(columns=['CollectionYear'])
pool_set = pool_set.drop(columns=['CollectionYear'])
test_data = test_data.drop(columns=['CollectionYear'])

In [11]:
initial_set.loc[:, 'NEXT_LOWEST_RATING'] = initial_set['NEXT_LOWEST_RATING'] - 3
pool_set.loc[:, 'NEXT_LOWEST_RATING'] = pool_set['NEXT_LOWEST_RATING'] - 3
test_data.loc[:, 'NEXT_LOWEST_RATING'] = test_data['NEXT_LOWEST_RATING'] - 3

In [12]:
print(initial_set.shape)
print(pool_set.shape)
print(test_data.shape)

(81, 54)
(99894, 54)
(20941, 54)


## Fast KELMOR

In [13]:
def incomplete_cholesky(g_row,g_diag,K, S, N, epsilon = 1e-5):

  pi = list(range(N))
  P = np.zeros([S,N])
  D = np.copy(g_diag())
  err = np.sum(np.abs(D))

  s = 0

  while(s < S) and (err > epsilon):
    i = s + np.argmax([D[pi[j]] for j in range(s,N)])

    # line 6 : swap pi[s] and pi[i]

    tmp = pi[s]
    pi[s] = pi[i]
    pi[i] = tmp

    # line 7 :
    P[s,pi[s]] = np.sqrt(D[pi[s]])
    KX = g_row(pi[s])
    for i in range(s+1, N):
      if s > 0:
        inner_p = np.inner(P[:s,pi[s]], P[:s,pi[i]])
      else:
        inner_p = 0

      P[s,pi[i]] = (KX[pi[i]] - inner_p) / P[s,pi[s]]
      D[pi[i]] -=  pow(P[s,pi[i]],2)
    err = np.sum([D[pi[i]] for i in range(s+1,N)])
    s = s + 1

  P = P[:s,:]

  return P

In [14]:
import numpy as np
from scipy.stats import kendalltau, somersd
from sklearn.metrics.pairwise import pairwise_kernels


class kelmor():
    def __init__(self, kernel, C):
        self.kernel = kernel
        self.C = C

    def fit(self, X, y):
        self.X = X
        self.y = y
        N, F = X.shape
        self.t = np.array([[(j-q)**2 for j in range(7)] for q in range(7)])
        T = self.t[y, :]
        K = pairwise_kernels(X, metric=self.kernel)
        g_row = lambda i: K[i, :]
        g_diag = lambda: np.diag(K).copy()
        N = K.shape[0]
        S = 500
        P = incomplete_cholesky(g_row, g_diag, K, S, N, epsilon=1e-5)
        P = np.transpose(P)
        z = self.C * T
        u = np.matmul(np.transpose(P), T)
        s = np.matmul(np.linalg.inv(np.eye(P.shape[1]) + self.C * np.matmul(np.transpose(P), P)), u)
        self.beta = z - self.C**2 * np.matmul(P, s)
        return self

    def inference(self, X):
        K = pairwise_kernels(X, self.X, metric=self.kernel)
        fx = np.dot(K, self.beta)
        self.y_hat = np.argmin(np.linalg.norm(fx[:, None] - self.t, ord=1, axis=2), axis=1)
        NR = -np.linalg.norm(fx[:, None] - self.t, ord=1, axis=2)
        self.probs = self.soft_max(NR)
        return self.y_hat, self.probs

    def soft_max(self, NR):
        P = np.exp(NR) / (np.sum(np.exp(NR), axis=1)[:, np.newaxis])
        return P

In [15]:
#kel = kelmor(kernel = "rbf",C = 0.5)
kel = kelmor(kernel = "linear",C = 5) # best by defined grid search

In [16]:
# Define the rps_value function (correct version dont touch)
def rps_value(y_pred_prob, y_true):
    """
    Calculate Ranked Probability Score (RPS) for ordinal predictions.

    Parameters
    ----------
    y_pred_prob : numpy array
        Array of shape (num_samples, num_categories) containing predicted probabilities for each ordinal category.
    y_true : numpy array
        Array of shape (num_samples,) containing true ordinal category values.

    Returns
    -------
    float
        Mean RPS value across all samples.
    """
    # Convert y_true to numpy array for positional indexing
    y_true = np.array(y_true)
    num_samples, num_cat = y_pred_prob.shape
    # Ensure the true labels match the number of samples
    if num_samples != len(y_true):
        raise ValueError(f"Number of samples does not match: {num_samples}, {len(y_true)}")


    # Ensure the true labels match the number of samples
    if num_samples != len(y_true):
        raise ValueError(f"Number of samples does not match: {num_samples}, {len(y_true)}")

    # Compute CDFs for predicted probabilities
    y_pred_prob_cdf = np.cumsum(y_pred_prob, axis=1)

    # Compute CDFs for true labels
    y_true_cdf = np.zeros_like(y_pred_prob_cdf)
    for k in range(num_samples):
        y_true_cdf[k, :] = [(j >= y_true[k]) * 1 for j in range(num_cat)]

    # Compute RPS for each sample
    rps_per_sample = np.sum((y_pred_prob_cdf - y_true_cdf) ** 2, axis=1) / (num_cat - 1)

    # Return the mean RPS value across all samples
    return np.round(np.mean(rps_per_sample),4)


## training

In [17]:
# Define your feature columns (excluding target, cluster, and structure number)
feature_columns = [col for col in initial_set.columns if col not in ['NEXT_LOWEST_RATING', 'hierarchical_cluster', 'STRUCTURE_NUMBER_008']]

# Prepare training and testing data
X_train = initial_set[feature_columns]
y_train = initial_set['NEXT_LOWEST_RATING'].values

X_test = test_data[feature_columns]
y_test = test_data['NEXT_LOWEST_RATING'].values

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

# Train the initial model
kel.fit(X_train.values, y_train)

# Generate predicted probabilities for the initial training set
_, y_pred_prob_train = kel.inference(X_train.values)


X_train shape: (81, 51)
y_train shape: (81,)
X_test shape: (20941, 51)
y_test shape: (20941,)


In [18]:
def entropy_sampling_for_structure(kel, pool_set, feature_columns, n_samples, cluster_proportional):
    cluster_entropy = {}
    for cluster in pool_set['hierarchical_cluster'].unique():
        subset = pool_set[pool_set['hierarchical_cluster'] == cluster]
        entropies = []
        for structure_number in subset['STRUCTURE_NUMBER_008'].unique():
            structure_subset = subset[subset['STRUCTURE_NUMBER_008'] == structure_number]
            _, probabilities = kel.inference(structure_subset[feature_columns].values)
            entropy = -np.sum(probabilities * np.log(probabilities + 1e-5), axis=1).mean()
            entropies.append((structure_number, entropy))

        # Sort structures in this cluster by descending entropy
        entropies.sort(key=lambda x: x[1], reverse=True)
        cluster_entropy[cluster] = entropies

    # Select structures from each cluster based on their entropy and proportional needs
    selected_structures = []
    for cluster, entropies in cluster_entropy.items():
        n_to_select = max(1, int((n_samples * cluster_proportional.get(cluster, 0))+.5))
        selected_structures.extend([structure[0] for structure in entropies[:n_to_select]])

    return selected_structures[:n_samples]


In [19]:
cluster_counts = pool_set['hierarchical_cluster'].value_counts()
total_bridges = cluster_counts.sum()
cluster_proportional = (cluster_counts / total_bridges).to_dict()

In [20]:
from sklearn.metrics import confusion_matrix

def run_sampling_method(
    kel,
    X_train, y_train,
    pool_set,
    X_test, y_test,
    n_queries,
    feature_columns,
    n_structures_per_query
):
    # Initialize lists to store metrics at each step
    RPS_entropy_sampling = []
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    somers_d_list = []
    confusion_matrices = []  # List to store confusion matrices

    # Initial model training and evaluation
    kel.fit(X_train.values, y_train)
    _, y_pred_probs = kel.inference(X_test.values)
    
    # Convert probabilities to class labels
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Compute metrics
    initial_rps = rps_value(y_pred_probs, y_test)
    initial_accuracy = accuracy_score(y_test, y_pred)
    initial_precision = precision_score(y_test, y_pred, average="weighted")
    initial_recall = recall_score(y_test, y_pred, average="weighted")
    initial_f1 = f1_score(y_test, y_pred, average="weighted")
    initial_somers_d = somersd(y_test, y_pred).correlation
    initial_confusion_matrix = confusion_matrix(y_test, y_pred)

    # Store initial metrics
    RPS_entropy_sampling.append(initial_rps)
    accuracy_list.append(initial_accuracy)
    precision_list.append(initial_precision)
    recall_list.append(initial_recall)
    f1_list.append(initial_f1)
    somers_d_list.append(initial_somers_d)
    confusion_matrices.append(initial_confusion_matrix)

    print(f"Initial Metrics - RPS: {initial_rps:.4f}, Accuracy: {initial_accuracy:.4f}, "
          f"Precision: {initial_precision:.4f}, Recall: {initial_recall:.4f}, "
          f"F1-score: {initial_f1:.4f}, Somers' D: {initial_somers_d:.4f}")
    print(f"Initial Confusion Matrix:\n{initial_confusion_matrix}")

    for query_index in range(1, n_queries + 1):
        print(f"\n--- Query {query_index} ---")

        # Calculate cluster proportions for entropy sampling
        cluster_counts = pool_set['hierarchical_cluster'].value_counts()
        total_bridges = cluster_counts.sum()
        cluster_proportional = (cluster_counts / total_bridges).to_dict()

        high_entropy_structure_numbers = entropy_sampling_for_structure(kel, pool_set, feature_columns, n_structures_per_query, cluster_proportional)

        if not high_entropy_structure_numbers:
            print("No new samples selected based on entropy. Stopping active learning.")
            break

        # Select new samples based on high entropy structure numbers
        new_samples = pool_set[pool_set['STRUCTURE_NUMBER_008'].isin(high_entropy_structure_numbers)]
        count = new_samples.shape[0]
        print(f"Selected {len(high_entropy_structure_numbers)} structures with total {count} samples")

        # Update pool set by removing selected samples
        pool_set = pool_set[~pool_set['STRUCTURE_NUMBER_008'].isin(high_entropy_structure_numbers)].reset_index(drop=True)

        # Extract features and labels from new_samples
        X_new = new_samples[feature_columns]
        y_new = new_samples['NEXT_LOWEST_RATING'].values

        # Append new samples to the training set
        X_train = pd.concat([X_train, X_new], ignore_index=True)
        y_train = np.concatenate([y_train, y_new])

        # Retrain the model with the updated training set
        kel.fit(X_train.values, y_train)

        # Evaluate the model on the test set
        _, y_pred_probs = kel.inference(X_test.values)

        # Convert probabilities to class labels
        y_pred = np.argmax(y_pred_probs, axis=1)

        # Compute metrics
        rps = rps_value(y_pred_probs, y_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted")
        recall = recall_score(y_test, y_pred, average="weighted")
        f1 = f1_score(y_test, y_pred, average="weighted")
        somers_d_value = somersd(y_test, y_pred).correlation
        confusion_mat = confusion_matrix(y_test, y_pred)

        # Store results
        RPS_entropy_sampling.append(rps)
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        somers_d_list.append(somers_d_value)
        confusion_matrices.append(confusion_mat)

        # Print results for the query
        print(f"Query {query_index}: RPS = {rps:.4f}, Accuracy = {accuracy:.4f}, "
              f"Precision = {precision:.4f}, Recall = {recall:.4f}, "
              f"F1-score = {f1:.4f}, Somers' D: {somers_d_value:.4f}")
        print(f"Confusion Matrix for Query {query_index}:\n{confusion_mat}")

    return {
        "RPS": RPS_entropy_sampling,
        "Accuracy": accuracy_list,
        "Precision": precision_list,
        "Recall": recall_list,
        "F1-score": f1_list,
        "Somers' D": somers_d_list,
        "Confusion Matrices": confusion_matrices  # Add confusion matrices to the return
    }

# Run the function and store results in a dictionary
metrics_entropy_sampling = run_sampling_method(
    kel=kel,  # Model instance
    X_train=X_train,
    y_train=y_train,
    pool_set=pool_set,
    X_test=X_test,
    y_test=y_test,
    n_queries=n_queries,
    feature_columns=feature_columns,  # List of columns to be used for feature extraction
    n_structures_per_query=n_structures_per_query  # Number of unique structures to select per query
)

# Extract individual lists
rps_list = metrics_entropy_sampling["RPS"]
accuracy_list = metrics_entropy_sampling["Accuracy"]
precision_list = metrics_entropy_sampling["Precision"]
recall_list = metrics_entropy_sampling["Recall"]
f1_list = metrics_entropy_sampling["F1-score"]
somers_d_list = metrics_entropy_sampling["Somers' D"]
confusion_matrices = metrics_entropy_sampling["Confusion Matrices"]


  _warn_prf(average, modifier, msg_start, len(result))


Initial Metrics - RPS: 0.1095, Accuracy: 0.5485, Precision: 0.5231, Recall: 0.5485, F1-score: 0.5223, Somers' D: 0.4480
Initial Confusion Matrix:
[[   0    0   14  229   34    8    1]
 [   0    0  111 1306  250   76    0]
 [   0    0 3660  981  749   43    2]
 [   0    0   60 4342 1069  332    6]
 [   0    0  512 2428 1646  145   29]
 [   0    0    0  249  678  944  142]
 [   0    0    0    0    0    0  895]]

--- Query 1 ---
Selected 10 structures with total 59 samples
Query 1: RPS = 0.0421, Accuracy = 0.7704, Precision = 0.7392, Recall = 0.7704, F1-score = 0.7447, Somers' D: 0.8233
Confusion Matrix for Query 1:
[[ 121  109   31   23    1    0    1]
 [   0   52 1187  467   37    0    0]
 [   0  156 4341  915   19    2    2]
 [   0    0  279 5256  256   12    6]
 [   0    0   38  743 3753  197   29]
 [   0    0    0    1  154 1716  142]
 [   0    0    0    0    0    2  893]]

--- Query 2 ---
Selected 10 structures with total 80 samples
Query 2: RPS = 0.0156, Accuracy = 0.9090, Precisio

In [28]:
excel = pd.read_excel('entropy_sampling_metrics.xlsx')
excel.head()

Unnamed: 0,Query,RPS,Accuracy,Precision,Recall,F1-score,Somers' D
0,0,0.1095,0.548541,0.523076,0.548541,0.522272,0.447975
1,1,0.0421,0.770355,0.739178,0.770355,0.744685,0.823265
2,2,0.0156,0.908982,0.910421,0.908982,0.908894,0.944848
3,3,0.0123,0.931379,0.932658,0.931379,0.93145,0.961144
4,4,0.0106,0.943985,0.94572,0.943985,0.944152,0.967761
