### Steps of the Project:
- Import libraries
- Extract the data and form dataframes
- Train and compare three models (XGBoost, LightGBM, and a multi-layer perceptron)
- Tune hyperparameters with Optuna for the best-performing model


In [3]:
import random as r

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Seed Python's and NumPy's global random generators.
r.seed(1)
np.random.seed(1)

# Load train_feats.npy and train_labels.csv
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# feature files that come from a trusted source.
train_feats = np.load('train_feats.npy', allow_pickle=True).item()
train_labels = pd.read_csv('train_labels.csv')

# Prepare the training DataFrame: one row per sample, each feature column
# holding that sample's vector, followed by the ground-truth label.
# NOTE(review): assumes the rows of train_labels.csv are in the same order as
# train_feats['idx'] -- TODO confirm.
final_train_df = pd.DataFrame({
    'idx': train_feats['idx'],
    **{
        feature_name: list(train_feats[feature_name])
        for feature_name in (
            'resnet_feature',
            'vit_feature',
            'clip_feature',
            'dino_feature',
        )
    },
    'label': train_labels['label'],
})

# Check the prepared training DataFrame
print("Final Training DataFrame Shape:", final_train_df.shape)
print(final_train_df.head())

Final Training DataFrame Shape: (40000, 6)
   idx                                     resnet_feature  \
0    0  [0.882933, 1.7576884, 0.037083052, 0.080605745...   
1    1  [1.856296, 0.08484737, 1.9896222, 0.37220904, ...   
2    2  [1.0885917, 0.9853252, 1.2479072, 0.85630983, ...   
3    3  [0.46679923, 0.59049946, 1.0448841, 0.43161052...   
4    4  [1.6842757, 4.21578, 0.45899037, 0.30567297, 0...   

                                         vit_feature  \
0  [0.39251244, -0.7359995, -0.40713686, 0.384909...   
1  [0.536756, 0.20046183, 0.53705513, -1.4450091,...   
2  [0.18865782, 0.40380907, 0.08730512, 1.3314861...   
3  [0.9024208, 1.004203, 0.62072945, 0.40194172, ...   
4  [-1.0811086, 0.018160842, 0.40104544, -0.41978...   

                                        clip_feature  \
0  [0.36834767, -0.0048528686, -0.54004794, -0.42...   
1  [0.21078074, -0.23845242, -0.23498438, -0.0415...   
2  [0.23849042, -0.17168012, -0.33907497, -0.1153...   
3  [0.27193326, 0.05075098, -

In [4]:
# Load valtest_feats.npy
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# feature files that come from a trusted source.
valtest_feats = np.load('valtest_feats.npy', allow_pickle=True).item()

# Prepare the validation-test DataFrame: same layout as the training frame,
# but without a label column.
valtest_df = pd.DataFrame({
    'idx': valtest_feats['idx'],
    **{
        feature_name: list(valtest_feats[feature_name])
        for feature_name in (
            'resnet_feature',
            'vit_feature',
            'clip_feature',
            'dino_feature',
        )
    },
})

# Check the validation-test DataFrame
print("Validation-Test DataFrame Shape:", valtest_df.shape)
print(valtest_df.head())

Validation-Test DataFrame Shape: (20000, 5)
   idx                                     resnet_feature  \
0    0  [1.8273036, 0.63109094, 0.23980471, 1.7095062,...   
1    1  [0.90152156, 0.46560124, 0.4799108, 1.1381094,...   
2    2  [1.0099123, 1.7757976, 0.07629461, 0.5221749, ...   
3    3  [0.010587645, 3.5005102, 0.01937968, 0.7267904...   
4    4  [0.47045803, 0.45250612, 0.115028925, 0.298091...   

                                         vit_feature  \
0  [1.3817077, 0.87259007, 0.04867079, 1.398396, ...   
1  [-0.35124308, -0.19813143, -0.54856735, 0.1956...   
2  [1.0760875, 0.682714, 0.40147147, 0.58281916, ...   
3  [0.3766103, 0.37389246, 0.256509, 1.0673461, -...   
4  [0.9422639, 0.74399835, -0.21973425, 0.1927124...   

                                        clip_feature  \
0  [0.29609913, 0.20632589, -0.30062598, -0.34170...   
1  [-0.020611845, -0.19302031, 0.122885466, -0.40...   
2  [0.47820017, -0.047153465, 0.107344255, 0.1043...   
3  [0.18480419, -0.14510047,

In [7]:
# Count missing values per column in both frames.
print(final_train_df.isnull().sum())  # Check for missing values
print(valtest_df.isnull().sum())

# Report the array shape of every feature family in the training set.
for caption, feature_key in (
    ("ResNet Shape:", 'resnet_feature'),
    ("ViT Shape:", 'vit_feature'),
    ("CLIP Shape:", 'clip_feature'),
    ("DiNO Shape:", 'dino_feature'),
):
    print(caption, np.shape(train_feats[feature_key]))

idx               0
resnet_feature    0
vit_feature       0
clip_feature      0
dino_feature      0
label             0
dtype: int64
idx               0
resnet_feature    0
vit_feature       0
clip_feature      0
dino_feature      0
dtype: int64
ResNet Shape: (40000, 512)
ViT Shape: (40000, 768)
CLIP Shape: (40000, 512)
DiNO Shape: (40000, 768)


### MODEL 1: XGBoost Classifier

In [46]:
# Function to perform 5-fold cross-validation for XGBoost
def cross_validate_xgboost(train_data, model_params):
    """
    Perform 5-fold stratified cross-validation using XGBoost.

    Parameters:
    - train_data: pandas DataFrame with the columns 'resnet_feature',
      'vit_feature', 'clip_feature', 'dino_feature' (one vector per row)
      and 'label'
    - model_params: dict, keyword arguments forwarded to xgb.XGBClassifier

    Returns:
    - average_f1: float, average macro F1 score across folds
    - predictions: list, validation predictions of all 5 folds concatenated
      in fold order (this is NOT the row order of train_data)
    """
    # Concatenate the four feature families into one wide matrix per sample.
    X = np.hstack([
        np.stack(train_data[column])
        for column in (
            'resnet_feature',
            'vit_feature',
            'clip_feature',
            'dino_feature',
        )
    ])
    # StratifiedKFold yields positional row indices. Indexing a pandas Series
    # with them is label-based, which only matches when the frame has a default
    # 0..n-1 index, so work on a plain NumPy array instead.
    y = train_data['label'].to_numpy()

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    f1_scores = []
    all_predictions = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Train a fresh XGBoost model on this fold's training part
        model = xgb.XGBClassifier(
            **model_params,
            objective='multi:softmax',
            num_class=len(np.unique(y)),
            random_state=42,
        )
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)

        # Score the held-out part with macro-averaged F1
        f1_scores.append(f1_score(y_val, predictions, average='macro'))
        all_predictions.extend(predictions)

    average_f1 = np.mean(f1_scores)
    return average_f1, all_predictions

In [48]:
 # XGBoost parameters
# Baseline (untuned) XGBoost configuration passed to the classifier.
xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Perform 5-fold CV for XGBoost
avg_f1_xgb, preds_xgb = cross_validate_xgboost(final_train_df, xgb_params)
print(f"Average F1 Score for XGBoost: {avg_f1_xgb:.4f}")

Average F1 Score for XGBoost: 0.9814


### MODEL 2: LightGBM

In [57]:
# Function to perform 5-fold cross-validation for LightGBM
def cross_validate_lightgbm(train_data, model_params):
    """
    Perform 5-fold stratified cross-validation using LightGBM.

    Parameters:
    - train_data: pandas DataFrame with the columns 'resnet_feature',
      'vit_feature', 'clip_feature', 'dino_feature' (one vector per row)
      and 'label'
    - model_params: dict, parameters passed to lgb.train; the predictions are
      reduced with argmax over axis 1, so the parameters must make the model
      output one score per class

    Returns:
    - average_f1: float, average macro F1 score across folds
    - predictions: list, validation predictions of all 5 folds concatenated
      in fold order (this is NOT the row order of train_data)
    """
    # Concatenate the four feature families into one wide matrix per sample.
    X = np.hstack([
        np.stack(train_data[column])
        for column in (
            'resnet_feature',
            'vit_feature',
            'clip_feature',
            'dino_feature',
        )
    ])
    # StratifiedKFold yields positional row indices. Indexing a pandas Series
    # with them is label-based, which only matches when the frame has a default
    # 0..n-1 index, so work on a plain NumPy array instead.
    y = train_data['label'].to_numpy()

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    f1_scores = []
    all_predictions = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Wrap this fold's data in LightGBM datasets; the validation set
        # references the training set.
        train_data_lgb = lgb.Dataset(X_train, label=y_train)
        val_data_lgb = lgb.Dataset(X_val, label=y_val, reference=train_data_lgb)

        model = lgb.train(
            model_params,
            train_data_lgb,
            valid_sets=[val_data_lgb],
        )

        # Predict on validation set
        class_scores = model.predict(X_val)
        predictions = np.argmax(class_scores, axis=1)  # Convert probabilities to class labels

        # Score the held-out part with macro-averaged F1
        f1_scores.append(f1_score(y_val, predictions, average='macro'))
        all_predictions.extend(predictions)

    average_f1 = np.mean(f1_scores)
    return average_f1, all_predictions

In [59]:
 # Load prepared training data
# Re-read the raw training features and labels from disk.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# feature files that come from a trusted source.
train_feats = np.load('train_feats.npy', allow_pickle=True).item()
train_labels = pd.read_csv('train_labels.csv')

# Rebuild the training frame: sample index, the four feature columns, then the
# ground-truth label.
final_train_df = pd.DataFrame({
    'idx': train_feats['idx'],
    **{
        feature_name: list(train_feats[feature_name])
        for feature_name in (
            'resnet_feature',
            'vit_feature',
            'clip_feature',
            'dino_feature',
        )
    },
    'label': train_labels['label'],
})

# LightGBM parameters
lgb_params = dict(
    objective='multiclass',
    num_class=len(np.unique(final_train_df['label'])),
    metric='multi_logloss',
    learning_rate=0.1,
    max_depth=-1,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=1,
)

# Perform 5-fold CV for LightGBM
avg_f1_lgb, preds_lgb = cross_validate_lightgbm(final_train_df, lgb_params)
print(f"Average F1 Score for LightGBM: {avg_f1_lgb:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.272470 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 652800
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 2560
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.280833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[Ligh

### MODEL 3: Multi-Layer Perceptron

In [11]:
# Build the model inputs: for every sample, concatenate its four feature
# vectors into a single row; labels come out as a NumPy array.
X_train = np.hstack([
    np.stack(final_train_df[column])
    for column in ('resnet_feature', 'vit_feature', 'clip_feature', 'dino_feature')
])
y_train = final_train_df['label'].values

X_test = np.hstack([
    np.stack(valtest_df[column])
    for column in ('resnet_feature', 'vit_feature', 'clip_feature', 'dino_feature')
])

# Standardize features: statistics are learned on the training matrix and the
# same transform is then applied to the test matrix.
# NOTE(review): the scaler is fit on all training rows before the hold-out
# split below, so the held-out rows also contribute to the scaling statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Stratified 80/20 hold-out split used by the hyperparameter search.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=1,
    stratify=y_train,
)

def objective(trial):
    """
    Optuna objective: train an MLP on the hold-out training split and score it.

    Parameters:
    - trial: Optuna trial used to sample 'num_layers', 'hidden_size',
      'learning_rate' and 'batch_size'

    Returns:
    - macro F1 score of the fitted MLP on the hold-out validation split
    """
    # Every hidden layer shares one width: 'hidden_size' is a single parameter,
    # repeated 'num_layers' times.
    num_layers = trial.suggest_int('num_layers', 1, 3)
    hidden_size = trial.suggest_int('hidden_size', 64, 512)
    layer_widths = (hidden_size,) * num_layers

    step_size = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    minibatch = trial.suggest_categorical('batch_size', [16, 32, 64, 128])

    candidate = MLPClassifier(
        hidden_layer_sizes=layer_widths,
        learning_rate_init=step_size,
        batch_size=minibatch,
        random_state=1,
    )
    candidate.fit(X_train_split, y_train_split)

    return f1_score(y_val_split, candidate.predict(X_val_split), average='macro')
# Perform Optuna optimization
# Seed the sampler explicitly: the seeds set for `random` and NumPy at the top
# of the notebook do not control Optuna's sampler, so without this every run
# explores a different sequence of hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1),
)
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)

[I 2024-12-25 14:13:38,611] A new study created in memory with name: no-name-92874662-d1b9-4c4e-85ee-7e0ac24bbb13
[I 2024-12-25 14:14:34,057] Trial 0 finished with value: 0.9876362879143386 and parameters: {'num_layers': 3, 'hidden_size': 214, 'learning_rate': 0.00043366667467870074, 'batch_size': 128}. Best is trial 0 with value: 0.9876362879143386.
[I 2024-12-25 14:25:26,613] Trial 1 finished with value: 0.9798421563461792 and parameters: {'num_layers': 3, 'hidden_size': 445, 'learning_rate': 0.001706066207505558, 'batch_size': 16}. Best is trial 0 with value: 0.9876362879143386.
[I 2024-12-25 14:26:29,155] Trial 2 finished with value: 0.9842531764130037 and parameters: {'num_layers': 2, 'hidden_size': 479, 'learning_rate': 0.0006210374819609473, 'batch_size': 128}. Best is trial 0 with value: 0.9876362879143386.
[I 2024-12-25 14:27:45,561] Trial 3 finished with value: 0.9694701865591651 and parameters: {'num_layers': 2, 'hidden_size': 219, 'learning_rate': 0.005630892954221425, 'bat

Best hyperparameters: {'num_layers': 1, 'hidden_size': 349, 'learning_rate': 0.00013466727321237925, 'batch_size': 64}


In [19]:
# Rebuild the winning architecture from the Optuna study: one shared width,
# repeated once per hidden layer.
best_params = study.best_params
hidden_layer_sizes = (best_params['hidden_size'],) * best_params['num_layers']

# Unfitted MLP configured with the best hyperparameters found.
mlp = MLPClassifier(
    random_state=1,
    batch_size=best_params['batch_size'],
    learning_rate_init=best_params['learning_rate'],
    hidden_layer_sizes=hidden_layer_sizes,
)

In [21]:
# 5-fold stratified cross-validation of the MLP with the tuned hyperparameters
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f1_scores = []

for train_idx, val_idx in kf.split(X_train, y_train):
    X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
    X_val_fold, y_val_fold = X_train[val_idx], y_train[val_idx]

    # A new, identically configured estimator is created and fitted per fold;
    # `mlp` stays bound to the last fold's estimator after the loop.
    mlp = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        learning_rate_init=best_params['learning_rate'],
        batch_size=best_params['batch_size'],
        random_state=1,
    ).fit(X_train_fold, y_train_fold)

    # Macro F1 on the held-out fold
    y_val_pred = mlp.predict(X_val_fold)
    f1 = f1_score(y_val_fold, y_val_pred, average='macro')
    f1_scores.append(f1)

print(f"Average F1 Score across 5 folds: {np.mean(f1_scores):.4f}")

Average F1 Score across 5 folds: 0.9891


### The Multi-Layer Perceptron is the best-performing model

In [22]:
# Fit on the complete (scaled) training matrix, then label the test matrix.
# NOTE(review): `mlp` is whichever estimator an earlier cell last bound to that
# name; this cell does not construct it.
y_test_pred = mlp.fit(X_train, y_train).predict(X_test)