In [2]:
import numpy as np

In [3]:
# Load the four persisted arrays in a fixed order; the paths are relative,
# so they resolve against the notebook's working directory.
_ARRAY_FILES = ("Y_tilda.npy", "P.npy", "train_labels.npy", "train_dataset.npy")
y_tilda, P, train_labels, train_dataset = (np.load(path) for path in _ARRAY_FILES)

In [8]:
# Project the raw features through P, then split by position: the leading 30%
# of rows become the fitting split and the remaining 70% the held-out split.
X = train_dataset @ P
feature_cut = int(0.3 * X.shape[0])
label_cut = int(0.3 * train_labels.shape[0])

train, test = X[:feature_cut], X[feature_cut:]
train_labels_, test_labels_ = train_labels[:label_cut], train_labels[label_cut:]

In [5]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer

In [9]:
# Hyper-parameter grids handed to the grid searches, one dict per model family.
knn_params = dict(n_neighbors=[5, 10])
xgb_params = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
)
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
)

In [10]:
# Tune each model family on the multi-label fitting split and keep the refit
# best estimator of every search in `best_models`, keyed by family name.
#
# Each search is fit exactly once: the targets are the full label matrix, so
# repeating the fit once per label column would only retrain identical models
# at a multiple of the cost.
best_models = {}

# One scorer shared by all searches: micro-averaged F1.
micro_f1 = make_scorer(f1_score, average='micro')

# Run KNN with grid search
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, knn_params, scoring=micro_f1)
knn_grid.fit(train, train_labels_)
best_models['KNN'] = knn_grid.best_estimator_

# Run XGBoost with grid search
xgb = XGBClassifier()
xgb_grid = GridSearchCV(xgb, xgb_params, scoring=micro_f1)
xgb_grid.fit(train, train_labels_)
best_models['XGBoost'] = xgb_grid.best_estimator_

# Run Random Forest with grid search (seeded so a rerun rebuilds the same forest)
rf = RandomForestClassifier(random_state=0)
rf_grid = GridSearchCV(rf, rf_params, scoring=micro_f1)
rf_grid.fit(train, train_labels_)
best_models['RandomForest'] = rf_grid.best_estimator_

In [14]:
# Pick the candidate with the highest average per-label F1 on the held-out rows.
best_model, best_score = None, 0.0

for model_name, model in best_models.items():
    predictions = model.predict(test)

    # One binary F1 per label column, then an unweighted mean over the columns.
    scores = [
        f1_score(test_labels_[:, col], predictions[:, col])
        for col in range(test_labels_.shape[1])
    ]
    score = np.mean(scores)

    # Strict '>' keeps the earliest candidate on ties and leaves best_model as
    # None when no candidate scores above 0.0.
    if score > best_score:
        best_model, best_score = model_name, score

print(f"The best model is {best_model} with an average F1 score of {best_score}")

The best model is XGBoost_0 with an average F1 score of 0.7277486761864922


In [19]:
from evaluation.eval import evaluate_metrics
# Report the multi-label metrics for the model picked by the selection step
# instead of always using the XGBoost search. The XGBoost search is only the
# fallback when no model was selected (best_model is None).
selected_model = best_models[best_model] if best_model is not None else xgb_grid
# NOTE(review): class predictions from .predict() are passed here; if
# evaluate_metrics expects scores for its ranking metrics (rl, ap, oe, cov),
# confirm and pass probabilities instead.
hl, maf1, mif1, rl, ap, oe, cov = evaluate_metrics(test_labels_, selected_model.predict(test))
print("Hamming Loss (HL'):", 1 - hl)  # the printed value is the complement, 1 - hl
print("Macro F1-score (MaF1):", maf1)
print("Micro F1 (MiF1):", mif1)
print("Average Precision (AP):", ap)
print("One-error (OE):", oe)

Hamming Loss (HL'): 0.7751674107142857
Macro F1-score (MaF1): 0.7277486761864922
Micro F1 (MiF1): 0.7375757181006969
Average Precision (AP): 0.6704206326291327
One-error (OE): 0.8888392857142857


In [5]:
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA

# Run PCA
# Reduce the raw features to 20 principal components.
# NOTE(review): the projection is fit on every row of train_dataset, including
# the rows a later cell holds out as the test split — confirm this is intended.
pca = PCA(n_components=20)
train_dataset_pca = pca.fit_transform(train_dataset)

# Run CCA
# Fit a 4-component CCA between the raw features and the label matrix.
# NOTE(review): when a second argument is given, scikit-learn's
# CCA.fit_transform presumably returns a pair (x_scores, y_scores) rather than
# a single array — verify before using train_dataset_cca as a feature matrix.
# train_dataset_cca is not referenced by the cells visible in this notebook.
cca = CCA(n_components=4)
train_dataset_cca = cca.fit_transform(train_dataset, train_labels)

In [6]:
# Use the PCA features and split by position: the leading 30% of rows become
# the fitting split and the remaining 70% the held-out split.
X = train_dataset_pca
feature_cut = int(0.3 * X.shape[0])
label_cut = int(0.3 * train_labels.shape[0])

train, test = X[:feature_cut], X[feature_cut:]
train_labels_, test_labels_ = train_labels[:label_cut], train_labels[label_cut:]

In [7]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer

# Hyper-parameter grids handed to the grid searches, one dict per model family.
knn_params = dict(n_neighbors=[5, 10])
xgb_params = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
)
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
)
# Tune each model family on the multi-label fitting split and keep the refit
# best estimator of every search in `best_models`, keyed by family name.
#
# Each search is fit exactly once: the targets are the full label matrix, so
# repeating the fit once per label column would only retrain identical models
# at a multiple of the cost.
best_models = {}

# One scorer shared by all searches: micro-averaged F1.
micro_f1 = make_scorer(f1_score, average='micro')

# Run KNN with grid search
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, knn_params, scoring=micro_f1)
knn_grid.fit(train, train_labels_)
best_models['KNN'] = knn_grid.best_estimator_

# Run XGBoost with grid search
xgb = XGBClassifier()
xgb_grid = GridSearchCV(xgb, xgb_params, scoring=micro_f1)
xgb_grid.fit(train, train_labels_)
best_models['XGBoost'] = xgb_grid.best_estimator_

# Run Random Forest with grid search (seeded so a rerun rebuilds the same forest)
rf = RandomForestClassifier(random_state=0)
rf_grid = GridSearchCV(rf, rf_params, scoring=micro_f1)
rf_grid.fit(train, train_labels_)
best_models['RandomForest'] = rf_grid.best_estimator_
# Pick the candidate with the highest average per-label F1 on the held-out rows.
best_model, best_score = None, 0.0

for model_name, model in best_models.items():
    predictions = model.predict(test)

    # One binary F1 per label column, then an unweighted mean over the columns.
    scores = [
        f1_score(test_labels_[:, col], predictions[:, col])
        for col in range(test_labels_.shape[1])
    ]
    score = np.mean(scores)

    # Strict '>' keeps the earliest candidate on ties and leaves best_model as
    # None when no candidate scores above 0.0.
    if score > best_score:
        best_model, best_score = model_name, score

print(f"The best model is {best_model} with an average F1 score of {best_score}")
from evaluation.eval import evaluate_metrics

# Report the multi-label metrics for the model picked by the selection step
# instead of always using the XGBoost search. The XGBoost search is only the
# fallback when no model was selected (best_model is None).
selected_model = best_models[best_model] if best_model is not None else xgb_grid
# NOTE(review): class predictions from .predict() are passed here; if
# evaluate_metrics expects scores for its ranking metrics (rl, ap, oe, cov),
# confirm and pass probabilities instead.
hl, maf1, mif1, rl, ap, oe, cov = evaluate_metrics(test_labels_, selected_model.predict(test))
print("Hamming Loss (HL'):", 1 - hl)  # the printed value is the complement, 1 - hl
print("Macro F1-score (MaF1):", maf1)
print("Micro F1 (MiF1):", mif1)
print("Average Precision (AP):", ap)
print("One-error (OE):", oe)

The best model is XGBoost_0 with an average F1 score of 0.7319227831352384
Hamming Loss (HL'): 0.7798549107142857
Macro F1-score (MaF1): 0.7319227831352384
Micro F1 (MiF1): 0.7444782693179609
Average Precision (AP): 0.6730030609957874
One-error (OE): 0.91875


In [11]:
from sklearn.decomposition import KernelPCA
# Embed the raw features with a 20-component RBF kernel PCA, then split by
# position: the leading 30% of rows become the fitting split and the remaining
# 70% the held-out split.
kpca = KernelPCA(n_components=20, kernel='rbf')
X = train_dataset_kpca = kpca.fit_transform(train_dataset)

feature_cut = int(0.3 * X.shape[0])
label_cut = int(0.3 * train_labels.shape[0])

train, test = X[:feature_cut], X[feature_cut:]
train_labels_, test_labels_ = train_labels[:label_cut], train_labels[label_cut:]

In [12]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer

# Hyper-parameter grids handed to the grid searches, one dict per model family.
knn_params = dict(n_neighbors=[5, 10])
xgb_params = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
)
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
)
# Tune each model family on the multi-label fitting split and keep the refit
# best estimator of every search in `best_models`, keyed by family name.
#
# Each search is fit exactly once: the targets are the full label matrix, so
# repeating the fit once per label column would only retrain identical models
# at a multiple of the cost.
best_models = {}

# One scorer shared by all searches: micro-averaged F1.
micro_f1 = make_scorer(f1_score, average='micro')

# Run KNN with grid search
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, knn_params, scoring=micro_f1)
knn_grid.fit(train, train_labels_)
best_models['KNN'] = knn_grid.best_estimator_

# Run XGBoost with grid search
xgb = XGBClassifier()
xgb_grid = GridSearchCV(xgb, xgb_params, scoring=micro_f1)
xgb_grid.fit(train, train_labels_)
best_models['XGBoost'] = xgb_grid.best_estimator_

# Run Random Forest with grid search (seeded so a rerun rebuilds the same forest)
rf = RandomForestClassifier(random_state=0)
rf_grid = GridSearchCV(rf, rf_params, scoring=micro_f1)
rf_grid.fit(train, train_labels_)
best_models['RandomForest'] = rf_grid.best_estimator_
# Pick the candidate with the highest average per-label F1 on the held-out rows.
best_model, best_score = None, 0.0

for model_name, model in best_models.items():
    predictions = model.predict(test)

    # One binary F1 per label column, then an unweighted mean over the columns.
    scores = [
        f1_score(test_labels_[:, col], predictions[:, col])
        for col in range(test_labels_.shape[1])
    ]
    score = np.mean(scores)

    # Strict '>' keeps the earliest candidate on ties and leaves best_model as
    # None when no candidate scores above 0.0.
    if score > best_score:
        best_model, best_score = model_name, score

print(f"The best model is {best_model} with an average F1 score of {best_score}")
from evaluation.eval import evaluate_metrics

# Report the multi-label metrics for the model picked by the selection step
# instead of always using the XGBoost search. The XGBoost search is only the
# fallback when no model was selected (best_model is None).
selected_model = best_models[best_model] if best_model is not None else xgb_grid
# NOTE(review): class predictions from .predict() are passed here; if
# evaluate_metrics expects scores for its ranking metrics (rl, ap, oe, cov),
# confirm and pass probabilities instead.
hl, maf1, mif1, rl, ap, oe, cov = evaluate_metrics(test_labels_, selected_model.predict(test))
print("Hamming Loss (HL'):", 1 - hl)  # the printed value is the complement, 1 - hl
print("Macro F1-score (MaF1):", maf1)
print("Micro F1 (MiF1):", mif1)
print("Average Precision (AP):", ap)
print("One-error (OE):", oe)

The best model is XGBoost_0 with an average F1 score of 0.7237206721858545
Hamming Loss (HL'): 0.7727678571428571
Macro F1-score (MaF1): 0.7237206721858545
Micro F1 (MiF1): 0.7367127893443683
Average Precision (AP): 0.6643653647367982
One-error (OE): 0.9457589285714286
