In [1]:
from discovery_utils.getters.horizon_scout import get_training_data

  from .autonotebook import tqdm as notebook_tqdm


In [285]:
import pandas as pd
from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [81]:
def make_train_val_datasets(
    dataset: pd.DataFrame,
    random_state: int,
    training_frac: float,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the input DataFrame into non-overlapping training and validation datasets.

    First shuffle the dataset using the specified random state.
    Then partition the shuffled rows: the first `int(len(dataset) * training_frac)`
    rows become the training set and the remaining rows become the validation set.

    Args:
        dataset (pd.DataFrame): The complete dataset to be split.
        random_state (int): A seed used by the random number generator for shuffling the data.
        training_frac (float): The fraction of the dataset to allocate to the training set.
            Must be between 0 and 1 inclusive.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames,
        (training_dataset, val_dataset), each with a fresh 0-based index.

    Raises:
        ValueError: If `training_frac` is not between 0 and 1 inclusive.
    """
    # Fractions outside [0, 1] would produce a silently wrong partition
    # (e.g. a negative slice bound), so reject them up front.
    if not 0 <= training_frac <= 1:
        raise ValueError(
            f"training_frac must be between 0 and 1 inclusive, got {training_frac!r}"
        )

    # frac=1 returns every row exactly once, in a seeded random order.
    shuffled_dataset = dataset.sample(frac=1, random_state=random_state)
    train_size = int(len(shuffled_dataset) * training_frac)

    # Positional slicing: the shuffled frame keeps its original (now unordered) index labels.
    train_dataset = shuffled_dataset.iloc[:train_size]
    val_dataset = shuffled_dataset.iloc[train_size:]
    return (train_dataset.reset_index(drop=True), val_dataset.reset_index(drop=True))


def make_train_X_train_y_val_X_val_y(
    train_dataset: pd.DataFrame,
    val_dataset: pd.DataFrame
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Convert training and validation DataFrames into feature matrices and label arrays.

    Both DataFrames must provide an `embedding` column and a `relevant` column.
    The per-row `embedding` values are stacked row-wise into a single 2D matrix,
    so every embedding in a DataFrame must have the same length. The `relevant`
    column is returned as a 1D array of labels.

    Args:
        train_dataset (pd.DataFrame): Training rows with `embedding` and `relevant` columns.
        val_dataset (pd.DataFrame): Validation rows with `embedding` and `relevant` columns.

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        (train_X, train_y, val_X, val_y).

    Raises:
        ValueError: Propagated from `np.vstack` if a DataFrame is empty or its
            embeddings have mismatched lengths.
    """
    # Bracket indexing avoids attribute-style access clashing with DataFrame attributes.
    train_X = np.vstack(train_dataset["embedding"].to_numpy())
    train_y = train_dataset["relevant"].to_numpy()
    val_X = np.vstack(val_dataset["embedding"].to_numpy())
    val_y = val_dataset["relevant"].to_numpy()
    return (train_X, train_y, val_X, val_y)

    
def print_confusion_matrix(actual, predictions):
    """
    Print a labelled confusion matrix followed by precision, recall and accuracy.

    The matrix rows are printed as actual classes and the columns as predicted
    classes, labelled 'Neg' then 'Pos' by position.
    NOTE(review): this assumes a two-class problem in which the negative label
    sorts before the positive label - confirm against the label encoding.

    Args:
        actual: Ground-truth labels.
        predictions: Predicted labels, aligned element-wise with `actual`.
    """
    matrix = confusion_matrix(actual, predictions)
    class_names = ['Neg', 'Pos']

    # Table header: a caption line, the predicted-class column headings, and a rule.
    header_lines = [
        '            Predicted:',
        '            | {:<4} | {:<4} |'.format(*class_names),
        '-----------------------------',
    ]
    print('\n'.join(header_lines))

    # One table row per actual class.
    for row_idx, counts in enumerate(matrix):
        print('Actual {:<4} | {:<4} | {:<4} |'.format(class_names[row_idx], *counts))
    print()  # Blank line separating the table from the metrics

    # Summary metrics; precision and recall are computed for the positive class only.
    precision = precision_score(actual, predictions, average='binary')
    recall = recall_score(actual, predictions, average='binary')
    accuracy = accuracy_score(actual, predictions)

    for metric_line in (
        f"Precision: {precision:.3f}",
        f"Recall: {recall:.3f}",
        f"Accuracy: {accuracy:.3f}",
    ):
        print(metric_line)
    

In [82]:
# AHL: fetch the training data, shuffle with seed 13, keep 85% for training and the rest for validation
ahl_training_data = get_training_data("AHL")
ahl_train, ahl_val = make_train_val_datasets(
    dataset=ahl_training_data,
    random_state=13,
    training_frac=0.85,
)
(ahl_train_X, ahl_train_y, ahl_val_X, ahl_val_y) = make_train_X_train_y_val_X_val_y(
    train_dataset=ahl_train,
    val_dataset=ahl_val,
)

In [83]:
# AHL - linear-kernel SVM (C=2): fit on the training split, report metrics on the validation split
linear_svm_ahl_classifier = SVC(C=2, kernel='linear')
linear_svm_ahl_classifier.fit(X=ahl_train_X, y=ahl_train_y)
linear_svm_ahl_preds = linear_svm_ahl_classifier.predict(X=ahl_val_X)
print_confusion_matrix(actual=ahl_val_y, predictions=linear_svm_ahl_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 229  | 22   |
Actual Pos  | 13   | 252  |

Precision: 0.920
Recall: 0.951
Accuracy: 0.932


In [125]:
# AHL - degree-4 polynomial-kernel SVM: fit on the training split, report metrics on the validation split
poly_svm_ahl_classifier = SVC(
    kernel='poly',
    degree=4,
    C=0.5,
    gamma='scale',
    coef0=0.5,
)
poly_svm_ahl_classifier.fit(X=ahl_train_X, y=ahl_train_y)
poly_svm_ahl_preds = poly_svm_ahl_classifier.predict(X=ahl_val_X)
print_confusion_matrix(actual=ahl_val_y, predictions=poly_svm_ahl_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 243  | 8    |
Actual Pos  | 13   | 252  |

Precision: 0.969
Recall: 0.951
Accuracy: 0.959


In [205]:
# AHL - logistic regression (lbfgs, C=1, at most 50 iterations): fit on train, report metrics on validation
lr_ahl_classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=50,
    C=1,
)
lr_ahl_classifier.fit(X=ahl_train_X, y=ahl_train_y)
lr_ahl_preds = lr_ahl_classifier.predict(X=ahl_val_X)
print_confusion_matrix(actual=ahl_val_y, predictions=lr_ahl_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 236  | 15   |
Actual Pos  | 19   | 246  |

Precision: 0.943
Recall: 0.928
Accuracy: 0.934


In [276]:
# AHL - gradient boosting (200 depth-3 trees, learning rate 1.0, seeded): fit on train, report metrics on validation
gb_ahl_classifier = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=1.0,
    max_depth=3,
    random_state=1,
)
gb_ahl_classifier.fit(X=ahl_train_X, y=ahl_train_y)
gb_ahl_preds = gb_ahl_classifier.predict(X=ahl_val_X)
print_confusion_matrix(actual=ahl_val_y, predictions=gb_ahl_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 232  | 19   |
Actual Pos  | 17   | 248  |

Precision: 0.929
Recall: 0.936
Accuracy: 0.930


In [300]:
# AHL - k-nearest neighbours (k=15): fit on the training split, report metrics on the validation split
knn_ahl_classifier = KNeighborsClassifier(
    n_neighbors=15,
)
knn_ahl_classifier.fit(X=ahl_train_X, y=ahl_train_y)
knn_ahl_preds = knn_ahl_classifier.predict(X=ahl_val_X)
print_confusion_matrix(actual=ahl_val_y, predictions=knn_ahl_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 229  | 22   |
Actual Pos  | 7    | 258  |

Precision: 0.921
Recall: 0.974
Accuracy: 0.944


In [84]:
# ASF: fetch the training data, shuffle with seed 13, keep 85% for training and the rest for validation
asf_training_data = get_training_data("ASF")
asf_train, asf_val = make_train_val_datasets(
    dataset=asf_training_data,
    random_state=13,
    training_frac=0.85,
)
(asf_train_X, asf_train_y, asf_val_X, asf_val_y) = make_train_X_train_y_val_X_val_y(
    train_dataset=asf_train,
    val_dataset=asf_val,
)

In [126]:
# ASF - linear-kernel SVM (C=0.5): fit on the training split, report metrics on the validation split
linear_svm_asf_classifier = SVC(C=0.5, kernel='linear')
linear_svm_asf_classifier.fit(X=asf_train_X, y=asf_train_y)
linear_svm_asf_preds = linear_svm_asf_classifier.predict(X=asf_val_X)
print_confusion_matrix(actual=asf_val_y, predictions=linear_svm_asf_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 218  | 15   |
Actual Pos  | 16   | 225  |

Precision: 0.938
Recall: 0.934
Accuracy: 0.935


In [153]:
# ASF - degree-4 polynomial-kernel SVM: fit on the training split, report metrics on the validation split
poly_svm_asf_classifier = SVC(
    kernel='poly',
    degree=4,
    C=0.6,
    gamma='scale',
    coef0=0.5,
)
poly_svm_asf_classifier.fit(X=asf_train_X, y=asf_train_y)
poly_svm_asf_preds = poly_svm_asf_classifier.predict(X=asf_val_X)
print_confusion_matrix(actual=asf_val_y, predictions=poly_svm_asf_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 221  | 12   |
Actual Pos  | 17   | 224  |

Precision: 0.949
Recall: 0.929
Accuracy: 0.939


In [230]:
# ASF - L2-penalised logistic regression (newton-cg, C=1.5): fit on train, report metrics on validation
lr_asf_classifier = LogisticRegression(
    solver='newton-cg',
    penalty="l2",
    C=1.5,
)
lr_asf_classifier.fit(X=asf_train_X, y=asf_train_y)
lr_asf_preds = lr_asf_classifier.predict(X=asf_val_X)
print_confusion_matrix(actual=asf_val_y, predictions=lr_asf_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 219  | 14   |
Actual Pos  | 15   | 226  |

Precision: 0.942
Recall: 0.938
Accuracy: 0.939


In [278]:
# ASF - gradient boosting (300 depth-3 trees, learning rate 1.0, seeded): fit on train, report metrics on validation
gb_asf_classifier = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=1.0,
    max_depth=3,
    random_state=1,
)
gb_asf_classifier.fit(X=asf_train_X, y=asf_train_y)
gb_asf_preds = gb_asf_classifier.predict(X=asf_val_X)
print_confusion_matrix(actual=asf_val_y, predictions=gb_asf_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 213  | 20   |
Actual Pos  | 18   | 223  |

Precision: 0.918
Recall: 0.925
Accuracy: 0.920


In [318]:
# ASF - k-nearest neighbours (k=14): fit on the training split, report metrics on the validation split
knn_asf_classifier = KNeighborsClassifier(
    n_neighbors=14,
)
knn_asf_classifier.fit(X=asf_train_X, y=asf_train_y)
knn_asf_preds = knn_asf_classifier.predict(X=asf_val_X)
print_confusion_matrix(actual=asf_val_y, predictions=knn_asf_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 197  | 36   |
Actual Pos  | 4    | 237  |

Precision: 0.868
Recall: 0.983
Accuracy: 0.916


In [314]:
# AFS: fetch the training data, shuffle with seed 13, keep 85% for training and the rest for validation
afs_training_data = get_training_data("AFS")
afs_train, afs_val = make_train_val_datasets(
    dataset=afs_training_data,
    random_state=13,
    training_frac=0.85,
)
(afs_train_X, afs_train_y, afs_val_X, afs_val_y) = make_train_X_train_y_val_X_val_y(
    train_dataset=afs_train,
    val_dataset=afs_val,
)

In [127]:
# AFS - linear-kernel SVM (C=0.2): fit on the training split, report metrics on the validation split
linear_svm_afs_classifier = SVC(C=0.2, kernel='linear')
linear_svm_afs_classifier.fit(X=afs_train_X, y=afs_train_y)
linear_svm_afs_preds = linear_svm_afs_classifier.predict(X=afs_val_X)
print_confusion_matrix(actual=afs_val_y, predictions=linear_svm_afs_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 209  | 8    |
Actual Pos  | 5    | 224  |

Precision: 0.966
Recall: 0.978
Accuracy: 0.971


In [166]:
# AFS - degree-4 polynomial-kernel SVM: fit on the training split, report metrics on the validation split
poly_svm_afs_classifier = SVC(
    kernel='poly',
    degree=4,
    C=0.5,
    gamma='scale',
    coef0=0.5,
)
poly_svm_afs_classifier.fit(X=afs_train_X, y=afs_train_y)
poly_svm_afs_preds = poly_svm_afs_classifier.predict(X=afs_val_X)
print_confusion_matrix(actual=afs_val_y, predictions=poly_svm_afs_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 211  | 6    |
Actual Pos  | 6    | 223  |

Precision: 0.974
Recall: 0.974
Accuracy: 0.973


In [266]:
# AFS - L2-penalised logistic regression (saga, C=8, at most 100 iterations): fit on train, report metrics on validation
lr_afs_classifier = LogisticRegression(
    solver='saga',
    penalty="l2",
    max_iter=100,
    C=8,
)
lr_afs_classifier.fit(X=afs_train_X, y=afs_train_y)
lr_afs_preds = lr_afs_classifier.predict(X=afs_val_X)
print_confusion_matrix(actual=afs_val_y, predictions=lr_afs_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 211  | 6    |
Actual Pos  | 8    | 221  |

Precision: 0.974
Recall: 0.965
Accuracy: 0.969


In [284]:
# AFS - gradient boosting (300 depth-4 trees, learning rate 1.0, seeded): fit on train, report metrics on validation
gb_afs_classifier = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=1.0,
    max_depth=4,
    random_state=1,
)
gb_afs_classifier.fit(X=afs_train_X, y=afs_train_y)
gb_afs_preds = gb_afs_classifier.predict(X=afs_val_X)
print_confusion_matrix(actual=afs_val_y, predictions=gb_afs_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 207  | 10   |
Actual Pos  | 11   | 218  |

Precision: 0.956
Recall: 0.952
Accuracy: 0.953


In [328]:
# AFS - k-nearest neighbours (k=10): fit on the training split, report metrics on the validation split
knn_afs_classifier = KNeighborsClassifier(
    n_neighbors=10,
)
knn_afs_classifier.fit(X=afs_train_X, y=afs_train_y)
knn_afs_preds = knn_afs_classifier.predict(X=afs_val_X)
print_confusion_matrix(actual=afs_val_y, predictions=knn_afs_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 196  | 21   |
Actual Pos  | 4    | 225  |

Precision: 0.915
Recall: 0.983
Accuracy: 0.944
