In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
#from sklearn.model_selection import GridSearchCV !! muss auskommentiert sein für gpu grid search
from sklearn.metrics import classification_report, confusion_matrix
# für GPU ausführung:
import cudf
import cupy as cp
import gc
from cuml.neighbors import KNeighborsClassifier as cuKNN
from cuml.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from cuml.preprocessing import StandardScaler as cuStandardScaler
from sklearn.preprocessing import LabelEncoder
import itertools
from cuml import metrics

In [None]:
# Single place to change when the dataset moves (Google Drive mount in Colab).
DATA_DIR = "/content/drive/MyDrive/bv-ss25-data"

# Per-split metadata tables; the "country" column is used as the label later on.
meta_train = pd.read_csv(f"{DATA_DIR}/y_train.csv")
meta_val = pd.read_csv(f"{DATA_DIR}/y_val.csv")
meta_test = pd.read_csv(f"{DATA_DIR}/y_test.csv")

# Precomputed embeddings, one row per sample; expected to be 2-D (N, d).
embeddings_train = np.load(f"{DATA_DIR}/emb_train.npy")
embeddings_val = np.load(f"{DATA_DIR}/emb_val.npy")
embeddings_test = np.load(f"{DATA_DIR}/emb_test.npy")

In [None]:
def _with_embedding_columns(meta, embeddings):
    """Return `meta` with one `feat_<i>` column per embedding dimension.

    All feature columns are attached in a single `pd.concat` instead of being
    inserted one at a time, which avoids the "DataFrame is highly fragmented"
    PerformanceWarning pandas emits for repeated column insertion.

    Parameters
    ----------
    meta : pandas.DataFrame
        Metadata table with one row per sample.
    embeddings : numpy.ndarray
        2-D array whose row count matches `len(meta)`.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of `meta` followed by `feat_0 .. feat_{d-1}`.
    """
    feat_cols = [f"feat_{i}" for i in range(embeddings.shape[1])]
    feats = pd.DataFrame(embeddings, columns=feat_cols, index=meta.index)
    # Drop stale feat_* columns first so re-running the cell does not duplicate them.
    return pd.concat([meta.drop(columns=feat_cols, errors="ignore"), feats], axis=1)


# NOTE(review): in the visible notebook the feat_* columns are never read before
# the metadata frames are deleted — confirm whether this cell is still needed.
meta_train = _with_embedding_columns(meta_train, embeddings_train)
meta_val = _with_embedding_columns(meta_val, embeddings_val)
meta_test = _with_embedding_columns(meta_test, embeddings_test)

In [None]:
# Use the embeddings directly as features, cast to float32 copies
# (this halves the footprint if the stored arrays are float64 — TODO confirm dtype).
X_train, X_val, X_test = (
    emb.astype(np.float32)
    for emb in (embeddings_train, embeddings_val, embeddings_test)
)

# The "country" column of each metadata table is the classification target.
y_train, y_val, y_test = (
    meta["country"].values
    for meta in (meta_train, meta_val, meta_test)
)

# Drop the references that are no longer needed so the memory can be reclaimed.
del (
    meta_train, meta_val, meta_test,
    embeddings_train, embeddings_val, embeddings_test,
)


In [None]:

# Map the country labels to integer codes. The encoder is fitted on the
# training labels only; transform() raises if val/test contain an unseen label.
# The codes are cast to int32 right away (the original author notes this suits
# the GPU better).
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train).astype(np.int32)
y_val_enc = le.transform(y_val).astype(np.int32)
y_test_enc = le.transform(y_test).astype(np.int32)

In [None]:
# Standardise the features on the CPU.
# The scaler is fitted exactly once, on the training split only
# (fit_transform already fits, so a separate .fit() beforehand would just
# repeat the full pass over X_train).
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

# Per the original author's note, scaling on the GPU did not fit into memory,
# so it is done on the CPU and only the scaled arrays are moved to the GPU.
# Release the unscaled arrays to free host RAM.
del X_train, X_val, X_test

# Host (NumPy) -> device (CuPy); cp.asarray keeps the dtype of its input.
X_train_gpu = cp.asarray(X_train_s)
y_train_gpu = cp.asarray(y_train_enc)

X_val_gpu = cp.asarray(X_val_s)
y_val_gpu = cp.asarray(y_val_enc)

X_test_gpu = cp.asarray(X_test_s)
y_test_gpu = cp.asarray(y_test_enc)

In [None]:
# 1. Install RAPIDS/cuML in Colab
# -------------------------------------
# (Only needs to run once per session.)
# NOTE(review): the import cell at the top of this notebook already imports
# cudf/cupy/cuml, so on a fresh runtime this cell presumably has to run before
# it — confirm and move it to the top if so.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/install_rapids.sh stable

In [None]:
# Merge the train and validation splits into one training set
# (features and labels are stacked along the sample axis).
X_full_train = cp.concatenate([X_train_gpu, X_val_gpu], axis=0)
y_full_train = cp.concatenate([y_train_gpu, y_val_gpu], axis=0)

# Drop the per-split device arrays and hand the now-unused blocks of CuPy's
# default memory pool back to the device. The pool is obtained via the public
# accessor rather than the private `cp._default_memory_pool` attribute.
del X_train_gpu, y_train_gpu, X_val_gpu, y_val_gpu
cp.get_default_memory_pool().free_all_blocks()


In [None]:
# Baseline model: fit on train+val, evaluate on the held-out test set.
knn_base = cuKNN(
    n_neighbors=5,
    weights="uniform",           # every neighbour gets the same weight
    metric="euclidean"           # Euclidean distance
)

knn_base.fit(X_full_train, y_full_train)
y_pred_base = knn_base.predict(X_test_gpu)

# Copy the predictions from the device to a host NumPy array.
# (The original read an undefined `val_pred_base` here, which raised NameError.)
test_pred = y_pred_base.get().astype(int)

# Free the device-side predictions.
del y_pred_base

# Map the integer codes back to the original country labels.
test_pred = le.inverse_transform(test_pred)

# The predictions are for X_test_gpu, so they are scored against y_test
# (not y_val) and the report headers say "Test" accordingly.
print("=== Test Classification Report ===")
print(classification_report(y_test, test_pred, zero_division=0))

print("=== Test Confusion Matrix ===")
print(confusion_matrix(y_test, test_pred))

del test_pred

In [None]:
# Tuned model: same setup as the baseline but with n_neighbors=9;
# fit on train+val, evaluate on the held-out test set.
final_knn = cuKNN(
    n_neighbors=9,
    weights="uniform",           # every neighbour gets the same weight
    metric="euclidean"           # Euclidean distance
)

final_knn.fit(X_full_train, y_full_train)
y_pred_final = final_knn.predict(X_test_gpu)

# Copy the predictions from the device to a host NumPy array.
# (The original read an undefined `val_pred_final` here, which raised NameError.)
test_pred = y_pred_final.get().astype(int)

# Free the device-side predictions.
del y_pred_final

# Map the integer codes back to the original country labels.
test_pred = le.inverse_transform(test_pred)

# The predictions are for X_test_gpu, so they are scored against y_test
# (not y_val) and the report headers say "Test" accordingly.
print("=== Test Classification Report ===")
print(classification_report(y_test, test_pred, zero_division=0))

print("=== Test Confusion Matrix ===")
print(confusion_matrix(y_test, test_pred))

In [None]:
# Additional evaluation inputs for the baseline model.
print("=== Baseline Report ===")

# Class probabilities for the test set, computed on the GPU.
proba_gpu_base = knn_base.predict_proba(X_test_gpu)

# Copy them back to a host NumPy array of shape (n_samples, n_classes).
# This must read `proba_gpu_base`: `proba_gpu` is only created in the
# tuned-model cell further down, so using it here either raised NameError on a
# fresh kernel or silently scored the tuned model's probabilities.
proba = proba_gpu_base.get()

# Integer-encoded true test labels, used by the metric cell that follows.
y_test_base = le.transform(y_test)



In [None]:
from sklearn.metrics import top_k_accuracy_score
from sklearn.metrics import log_loss

# Column j of `proba` is taken to correspond to encoded label j. Passing the
# full label set explicitly keeps both metrics working when some class does not
# occur in the test split; otherwise sklearn infers the classes from y_true and
# raises on the column-count mismatch.
all_labels = np.arange(len(le.classes_))

# Top-5 accuracy: the true label must be among the 5 most probable classes.
top5 = top_k_accuracy_score(y_test_base, proba, k=5, labels=all_labels)
print(f"Top-5 Accuracy on Test-Set: {top5:.4f}")

# log_loss takes the class probabilities and the integer-encoded labels.
ll = log_loss(y_test_base, proba, labels=all_labels)
print(f"Log-Loss on Test-Set: {ll:.4f}")

In [None]:

# Additional evaluation inputs for the tuned model (final_knn).
print("=== Tuned Model Report ===")

# Integer-encoded true test labels for the metric cell that follows.
y_test_enc = le.transform(y_test)

# Class probabilities on the GPU, then copied to a host NumPy array
# of shape (n_samples, n_classes).
proba_gpu = final_knn.predict_proba(X_test_gpu)
proba = proba_gpu.get()



In [None]:
# Column j of `proba` is taken to correspond to encoded label j. Passing the
# full label set explicitly keeps both metrics working when some class does not
# occur in the test split; otherwise sklearn infers the classes from y_true and
# raises on the column-count mismatch.
all_labels = np.arange(len(le.classes_))

# Top-5 accuracy: the true label must be among the 5 most probable classes.
top5 = top_k_accuracy_score(y_test_enc, proba, k=5, labels=all_labels)
print(f"Top-5 Accuracy on Test-Set: {top5:.4f}")

# log_loss takes the class probabilities and the integer-encoded labels.
ll = log_loss(y_test_enc, proba, labels=all_labels)
print(f"Log-Loss on Test-Set: {ll:.4f}")