# BLG 454E - Learning From Data - Fall 2024-25
## BYD Team

| Student No |          Name         |         e-mail         |
|:----------:|:---------------------:|:----------------------:|
|  150200037 | Murat Biberoğlu       | biberoglu20@itu.edu.tr |
|  150200060 | Ömer Yıldız           | yildizom20@itu.edu.tr  |
|  150210079 | Süleyman Ceyhun Demir |  demirsu21@itu.edu.tr  |


## Import Libraries

In [35]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

## Read Data

In [2]:
# Load both feature archives. Each .npy file holds one Python object that is
# unwrapped with .item() and indexed by string keys below, which is why
# allow_pickle=True is required.
train_features = np.load('train_feats.npy', allow_pickle=True)
validation_features = np.load('valtest_feats.npy', allow_pickle=True)

# Train split: one feature matrix per backbone
train_data_dict = train_features.item()
del train_features
resnet_features, clip_features, dino_features, vit_features = [
    train_data_dict[key]
    for key in ("resnet_feature", "clip_feature", "dino_feature", "vit_feature")
]

# Validation/test split: same keys, same order
data_dict = validation_features.item()
del validation_features
(resnet_validation_features,
 clip_validation_features,
 dino_validation_features,
 vit_validation_features) = [
    data_dict[key]
    for key in ("resnet_feature", "clip_feature", "dino_feature", "vit_feature")
]
del data_dict

# Labels: align the CSV rows with the row order of the training features by
# left-joining on the sample id ('ID' in the CSV, 'idx' in the feature dict).
labels = pd.merge(
    pd.DataFrame(train_data_dict['idx'], columns=['idx']),
    pd.read_csv('train_labels.csv').rename(columns={"ID": "idx"}),
    on='idx',
    how='left',
)['label'].values
del train_data_dict

# Sanity check: print the shape of everything that was loaded
print("[TRAIN] ResNet:", resnet_features.shape)
print("[TRAIN] CLIP  :", clip_features.shape)
print("[TRAIN] DINO  :", dino_features.shape)
print("[TRAIN] ViT   :", vit_features.shape)
print("-"*32)
print("[TEST ] ResNet:", resnet_validation_features.shape)
print("[TEST ] CLIP  :", clip_validation_features.shape)
print("[TEST ] DINO  :", dino_validation_features.shape)
print("[TEST ] ViT   :", vit_validation_features.shape)
print("-"*32)
print("[TRAIN] Labels:", labels.shape)

[TRAIN] ResNet: (40000, 512)
[TRAIN] CLIP  : (40000, 512)
[TRAIN] DINO  : (40000, 768)
[TRAIN] ViT   : (40000, 768)
--------------------------------
[TEST ] ResNet: (20000, 512)
[TEST ] CLIP  : (20000, 512)
[TEST ] DINO  : (20000, 768)
[TEST ] ViT   : (20000, 768)
--------------------------------
[TRAIN] Labels: (40000,)


## Standardization

In [3]:
# Standardize each backbone with the mean/variance learned from its TRAINING
# features, then apply that same fitted transform to the matching validation
# features. Re-fitting the scaler on the validation set would put it on a
# different scale than the data the downstream LDA / Nystroem transformers and
# classifiers are fitted on.
scaler = StandardScaler()

resnet_features = scaler.fit_transform(resnet_features)
resnet_validation_features = scaler.transform(resnet_validation_features)

clip_features = scaler.fit_transform(clip_features)
clip_validation_features = scaler.transform(clip_validation_features)

dino_features = scaler.fit_transform(dino_features)
dino_validation_features = scaler.transform(dino_validation_features)

vit_features = scaler.fit_transform(vit_features)
vit_validation_features = scaler.transform(vit_validation_features)

del scaler

## LDA

In [4]:
def perform_lda(features, labels, n_components=None):
    """Fit a LinearDiscriminantAnalysis model and project the features with it.

    Parameters:
        features (array-like): Feature matrix to fit on and transform.
        labels (array-like): Class labels aligned with ``features``.
        n_components (int | None): Passed to LinearDiscriminantAnalysis.

    Returns:
        tuple: ``(reduced_features, lda)`` - the projected features and the
        fitted model, so the same projection can be reused on other data.
    """
    model = LinearDiscriminantAnalysis(n_components=n_components)
    model.fit(features, labels)
    return model.transform(features), model

In [5]:
# Number of LDA components kept for each backbone
n_components = 9

# One LDA per backbone; the fitted models are kept for the validation data
resnet_lda_features, lda_resnet = perform_lda(
    features=resnet_features, labels=labels, n_components=n_components)
clip_lda_features, lda_clip = perform_lda(
    features=clip_features, labels=labels, n_components=n_components)
dino_lda_features, lda_dino = perform_lda(
    features=dino_features, labels=labels, n_components=n_components)
vit_lda_features, lda_vit = perform_lda(
    features=vit_features, labels=labels, n_components=n_components)

## Nystroem

In [6]:
def perform_nystroem(features, n_components=250):
    """Fit an RBF-kernel Nystroem approximation and map the features with it.

    Parameters:
        features (array-like): Feature matrix to fit on and transform.
        n_components (int): Passed to Nystroem (default 250).

    Returns:
        tuple: ``(reduced_features, nystroem)`` - the mapped features and the
        fitted transformer (seeded with ``random_state=42``).
    """
    approximator = Nystroem(kernel='rbf', n_components=n_components, random_state=42)
    approximator.fit(features)
    return approximator.transform(features), approximator

In [7]:
# Number of Nystroem components kept for each backbone
n_components = 27

# One Nystroem map per backbone; the fitted transformers are kept for the validation data
resnet_nystroem_features, nystroem_resnet = perform_nystroem(
    features=resnet_features, n_components=n_components)
clip_nystroem_features, nystroem_clip = perform_nystroem(
    features=clip_features, n_components=n_components)
dino_nystroem_features, nystroem_dino = perform_nystroem(
    features=dino_features, n_components=n_components)
vit_nystroem_features, nystroem_vit = perform_nystroem(
    features=vit_features, n_components=n_components)

## Merge Extracted Features From Nystroem & LDA

In [8]:
# Column-wise concatenation of the four per-backbone LDA projections
lda_features_combined = np.concatenate(
    [
        resnet_lda_features,
        clip_lda_features,
        dino_lda_features,
        vit_lda_features,
    ],
    axis=1,
)
lda_features_combined.shape

(40000, 36)

In [9]:
# Column-wise concatenation of the four per-backbone Nystroem feature maps
nystroem_features_combined = np.hstack([
    resnet_nystroem_features,
    clip_nystroem_features,
    dino_nystroem_features,
    vit_nystroem_features,
])
# Report the shape with the label shown in this cell's recorded output
print(f"Shape of combined features: {nystroem_features_combined.shape}")

Shape of combined features: (40000, 108)


In [10]:
# Final training matrix: LDA columns first, Nystroem columns after
combined_features = np.concatenate(
    [lda_features_combined, nystroem_features_combined],
    axis=1,
)
combined_features.shape

(40000, 144)

## Training & Evaluating

In [20]:
def train_and_evaluate(model_constructor, features, labels, **kwargs):
    """Fit a classifier on an 80/20 hold-out split and print its scores.

    Parameters:
        model_constructor (callable): Called as ``model_constructor(**kwargs)``;
            must return an estimator exposing ``fit`` and ``predict``.
        features (array-like): Feature matrix.
        labels (array-like): Target labels aligned with ``features``.
        **kwargs: Forwarded unchanged to ``model_constructor``.

    Returns:
        None. Accuracy and macro F1 on the held-out 20% are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    classifier = model_constructor(**kwargs)
    classifier.fit(X_train, y_train)

    # functools.partial objects and callable instances have no __name__;
    # fall back to the estimator's class name instead of raising AttributeError.
    raw_name = getattr(model_constructor, "__name__", type(classifier).__name__)
    # Pad/truncate to a fixed 24 characters so the printed columns line up
    model_name = raw_name.ljust(24)[:24]

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} F1-Score: {f1:.4f}")

### Just RandomForestClassifier

In [21]:
# Random forest baseline. The forest is seeded so the reported hold-out
# scores can be reproduced from run to run.
train_and_evaluate(
    RandomForestClassifier,
    combined_features,
    labels,
    n_estimators=300,
    max_depth=30,
    n_jobs=-1,
    random_state=42,
)

RandomForestClassifier   Accuracy: 0.9880
RandomForestClassifier   F1-Score: 0.9880


### Just KNeighborsClassifier

In [22]:
# k-nearest-neighbours baseline (k = 19) on the combined feature set
train_and_evaluate(
    KNeighborsClassifier,
    features=combined_features,
    labels=labels,
    n_neighbors=19,
)

KNeighborsClassifier     Accuracy: 0.9894
KNeighborsClassifier     F1-Score: 0.9894


### Just LinearSVC

In [23]:
# Linear SVM baseline with the library's default hyper-parameters
train_and_evaluate(
    model_constructor=LinearSVC,
    features=combined_features,
    labels=labels,
)

LinearSVC                Accuracy: 0.9890
LinearSVC                F1-Score: 0.9890


### Just SGDClassifier

In [24]:
# SGD baseline. SGD shuffles the training data, so it is seeded to make the
# reported hold-out scores reproducible from run to run.
train_and_evaluate(
    SGDClassifier,
    combined_features,
    labels,
    max_iter=25_000,
    tol=1e-8,
    n_jobs=-1,
    random_state=42,
)

SGDClassifier            Accuracy: 0.9880
SGDClassifier            F1-Score: 0.9880


### Just LogisticRegression

In [25]:
# Logistic-regression baseline with a raised iteration cap
train_and_evaluate(
    LogisticRegression,
    features=combined_features,
    labels=labels,
    max_iter=5_000,
)

LogisticRegression       Accuracy: 0.9879
LogisticRegression       F1-Score: 0.9879


### StackingClassifier
Stack **RandomForestClassifier**, **KNeighborsClassifier**, **LinearSVC** & **SGDClassifier** as base estimators, with **LogisticRegression** as the meta-classifier.

In [30]:
def CustomStackingClassifier(random_state=42):
    """Build the unfitted stacking ensemble used throughout this notebook.

    Base estimators are a random forest, k-nearest neighbours, a linear SVM
    and an SGD classifier; their outputs are combined by a logistic-regression
    meta-classifier.

    Parameters:
        random_state (int | None): Seed forwarded to the base estimators that
            accept one (random forest, LinearSVC, SGD) so repeated fits give
            the same model. Pass ``None`` to leave them unseeded.

    Returns:
        StackingClassifier: A new, unfitted ensemble.
    """
    base_estimators = [
        ('RF', RandomForestClassifier(n_estimators=300, max_depth=30, n_jobs=-1,
                                      random_state=random_state)),
        ('KNN', KNeighborsClassifier(n_neighbors=19)),
        ('LSVC', LinearSVC(random_state=random_state)),
        ('SGD', SGDClassifier(max_iter=5_000, tol=1e-8, n_jobs=-1,
                              random_state=random_state)),
    ]
    meta_classifier = LogisticRegression(max_iter=25_000)
    return StackingClassifier(estimators=base_estimators, final_estimator=meta_classifier)

In [29]:
# Hold-out evaluation of the stacked ensemble
train_and_evaluate(
    CustomStackingClassifier,
    features=combined_features,
    labels=labels,
)

CustomStackingClassifier Accuracy: 0.9884
CustomStackingClassifier F1-Score: 0.9884


### Train Final Model

In [31]:
# Final model: the stacked ensemble fitted on ALL training rows (no hold-out)
classifier = CustomStackingClassifier()
classifier.fit(X=combined_features, y=labels)

## Apply Dimensionality Reduction (Nystroem & LDA) To Validation Data

In [33]:
# Push each backbone's validation features through the LDA and Nystroem
# transformers that were fitted on the corresponding training features.
resnet_lda_validation_features = lda_resnet.transform(resnet_validation_features)
resnet_nystroem_validation_features = nystroem_resnet.transform(resnet_validation_features)

clip_lda_validation_features = lda_clip.transform(clip_validation_features)
clip_nystroem_validation_features = nystroem_clip.transform(clip_validation_features)

dino_lda_validation_features = lda_dino.transform(dino_validation_features)
dino_nystroem_validation_features = nystroem_dino.transform(dino_validation_features)

vit_lda_validation_features = lda_vit.transform(vit_validation_features)
vit_nystroem_validation_features = nystroem_vit.transform(vit_validation_features)

# Same column layout as the training matrix: all LDA blocks, then all Nystroem blocks
lda_validation_features_combined = np.concatenate(
    [
        resnet_lda_validation_features,
        clip_lda_validation_features,
        dino_lda_validation_features,
        vit_lda_validation_features,
    ],
    axis=1,
)
nystroem_validation_features_combined = np.concatenate(
    [
        resnet_nystroem_validation_features,
        clip_nystroem_validation_features,
        dino_nystroem_validation_features,
        vit_nystroem_validation_features,
    ],
    axis=1,
)

combined_validation_features = np.concatenate(
    [lda_validation_features_combined, nystroem_validation_features_combined],
    axis=1,
)
combined_validation_features.shape

(20000, 144)

## Predict Validation Data & Save Predictions To `submission.csv`

In [34]:
# Predict a label for every validation row
predicted = classifier.predict(combined_validation_features)
print("Prediction Shape:", predicted.shape)

# Two-column submission table: ID is the row position 0..N-1 of the
# prediction vector, Predicted is the predicted label.
submission = pd.DataFrame({'Predicted': predicted})
submission.insert(0, 'ID', np.arange(len(predicted)))

submission.to_csv('submission.csv', index=False)
print("Submission file saved.")

Prediction Shape: (20000,)
Submission file saved.


## 5-Fold Cross Validation

In [36]:
import random as r

# Seed both generators. scikit-learn does not use the stdlib `random` module:
# when an estimator or splitter is given random_state=None it draws from
# NumPy's global RNG, so that one must be seeded as well.
r.seed(1)
np.random.seed(1)

In [44]:
def five_fold_cross_validation(model_constructor, X, y, random_state=42):
    """
    Perform stratified 5-fold cross-validation, save the out-of-fold
    predictions to predictions.csv, and score them.

    Every sample is predicted exactly once, by the model trained on the four
    folds that do not contain it. The returned value is a single macro F1
    computed over all of those pooled predictions (not the mean of five
    per-fold scores).

    Parameters:
        model_constructor (callable): Zero-argument callable returning a new,
            unfitted model (e.g., lambda: SomeModel(params)).
        X (array-like): Feature matrix.
        y (array-like): Target array. Predictions are stored as int64, so
            integer class labels are expected.
        random_state (int | None): Seed for the shuffled fold assignment so
            repeated runs use the same folds. Pass None for unseeded shuffling.

    Returns:
        float: Macro F1 score of the pooled out-of-fold predictions.
    """
    # Positional fancy-indexing below needs real arrays, not lists/DataFrames
    X = np.asarray(X)
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # Out-of-fold predictions, stored at each sample's original position
    predictions = np.zeros_like(y, dtype=np.int64)

    for train_index, test_index in skf.split(X, y):
        # A fresh model per fold so nothing carries over between folds
        model = model_constructor()
        model.fit(X[train_index], y[train_index])
        predictions[test_index] = model.predict(X[test_index])

    # Save predictions to a CSV file (ID is the row position in X)
    predictions_df = pd.DataFrame({
        'ID': np.arange(len(predictions)),
        'Predicted': predictions
    })
    predictions_df.to_csv("predictions.csv", index=False)

    return f1_score(y, predictions, average="macro")


In [46]:
# Cross-validate the stacked ensemble on the full training matrix
five_fold_cross_validation(
    model_constructor=CustomStackingClassifier,
    X=combined_features,
    y=labels,
)

0.988599887324974