In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
import os
import copy

class BroadLearningSystem:
    """Fixed random feature expansion used as a preprocessing step.

    The input matrix is projected through a random linear map followed by
    ReLU ("mapped features"), and those mapped features are projected through
    a second random linear map followed by ReLU ("enhancement nodes").  The
    output is the horizontal concatenation ``[X, mapped, enhancement]``.

    The two weight matrices are drawn once, on the first call to
    :meth:`fit_transform`, and reused by every later call; nothing is learned
    from the data beyond its number of columns.

    Args:
        n_mapped_features: Number of columns of the first random projection.
        n_enhancement_nodes: Number of columns of the second random projection.
        random_state: Optional integer seed.  When given, the weights are drawn
            from a private ``numpy.random.RandomState`` so results are
            reproducible.  When ``None`` (the default) the weights come from
            NumPy's global random generator.
    """

    def __init__(self, n_mapped_features=100, n_enhancement_nodes=100, random_state=None):
        self.n_mapped_features = n_mapped_features
        self.n_enhancement_nodes = n_enhancement_nodes
        self.random_state = random_state
        self.W_mapped = None
        self.W_enhance = None

    def relu(self, x):
        """Return ``x`` with every negative entry replaced by zero."""
        return np.maximum(0, x)

    def _random_source(self):
        """Return the object whose ``randn`` is used to draw the weights."""
        if self.random_state is None:
            # Module-level functions use the global generator, so callers who
            # seed via ``np.random.seed`` keep getting the same draws.
            return np.random
        return np.random.RandomState(self.random_state)

    def fit_transform(self, X):
        """Draw any missing weight matrix, then return the expanded features.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.

        Returns:
            2-D array with ``n_features + n_mapped_features +
            n_enhancement_nodes`` columns.
        """
        _, n_features = X.shape
        if self.W_mapped is None or self.W_enhance is None:
            source = self._random_source()
            # W_mapped is drawn before W_enhance so the random stream is
            # consumed in a fixed order.
            if self.W_mapped is None:
                self.W_mapped = source.randn(n_features, self.n_mapped_features) * 0.05
            if self.W_enhance is None:
                self.W_enhance = source.randn(self.n_mapped_features, self.n_enhancement_nodes) * 0.05
        return self.transform(X)

    def transform(self, X):
        """Expand ``X`` with the already-drawn weights.

        Raises:
            ValueError: If the weights have not been initialised yet.
        """
        if self.W_mapped is None or self.W_enhance is None:
            raise ValueError("Bobot BLS belum diinisialisasi. Panggil fit_transform terlebih dahulu.")
        Z = self.relu(np.dot(X, self.W_mapped))
        H = self.relu(np.dot(Z, self.W_enhance))
        return np.hstack([X, Z, H])

def load_datasets(dataset_ids=(0, 1, 2, 3, 4)):
    """Read ``dataset_<id>.csv`` from the working directory for each id.

    Args:
        dataset_ids: Iterable of identifiers substituted into the file name.
            The default is an immutable tuple so it cannot be altered between
            calls.

    Returns:
        A list of DataFrames, one per id, in the order the ids were given.

    Raises:
        FileNotFoundError: If any of the expected CSV files does not exist.
    """
    datasets = []
    for i in dataset_ids:
        file_path = f'dataset_{i}.csv'
        # Checked explicitly so the error carries the project's own message.
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File dataset {file_path} tidak ditemukan.")
        datasets.append(pd.read_csv(file_path))
    return datasets

def clean_data(df):
    """Return a filtered copy of ``df`` with implausible rows removed.

    Rows whose ``ipk_sekarang`` lies outside ``[0, 4]`` are dropped first.
    Then, one high-school score column at a time, rows outside the
    1.5 * IQR fences of that column are dropped.  The fences for each column
    are computed on the rows that survived the previous columns, so the
    filtering is sequential.  The input frame is not modified.
    """
    score_columns = (
        'nilai_mtk_sma',
        'nilai_ipa_sma',
        'nilai_fisika_sma',
        'nilai_bahasa_indonesia_sma',
        'nilai_bahasa_inggris_sma',
    )
    cleaned = df.copy()
    cleaned = cleaned[cleaned['ipk_sekarang'].between(0, 4)]
    for column in score_columns:
        lower_quartile = cleaned[column].quantile(0.25)
        upper_quartile = cleaned[column].quantile(0.75)
        spread = upper_quartile - lower_quartile
        lower_fence = lower_quartile - 1.5 * spread
        upper_fence = upper_quartile + 1.5 * spread
        cleaned = cleaned[cleaned[column].between(lower_fence, upper_fence)]
    return cleaned

class FederatedLearning:
    def __init__(self, datasets):
        """Clean the client datasets, fit shared preprocessing, train the first global model.

        Args:
            datasets: Sequence of client DataFrames.  Each is passed through
                ``clean_data`` before use.

        The scaler and label encoder are fitted on the features/labels of all
        cleaned datasets combined, the BLS weights are initialised from the
        scaled combined features, and the global model is then fitted on the
        first dataset only.
        """
        def build_classifier(n_estimators):
            # One-vs-rest wrapper around an XGBoost classifier; only the number
            # of trees differs between the global and the local models.
            return OneVsRestClassifier(XGBClassifier(
                n_estimators=n_estimators, max_depth=5, learning_rate=0.1, random_state=42,
                eval_metric='mlogloss', reg_alpha=0.1, reg_lambda=1.0
            ))

        self.datasets = [clean_data(frame) for frame in datasets]
        self.global_model = build_classifier(200)
        self.local_models = [build_classifier(100) for _ in self.datasets]
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.bls = BroadLearningSystem(n_mapped_features=100, n_enhancement_nodes=100)

        prepared = [self._prepare_features(frame) for frame in self.datasets]
        X_combined = pd.concat([features for features, _ in prepared], ignore_index=True)
        y_combined = np.hstack([labels for _, labels in prepared])

        self.scaler.fit(X_combined)
        # Called only for its side effect of drawing the BLS weights.
        self.bls.fit_transform(self.scaler.transform(X_combined))
        self.label_encoder.fit(y_combined)

        X_first, y_first = self.preprocess_data(self.datasets[0])
        self.global_model.fit(X_first, y_first)

    def _prepare_features(self, df):
        df = df.copy()
        df['avg_nilai_sma'] = df[['nilai_mtk_sma', 'nilai_ipa_sma', 'nilai_fisika_sma',
                                  'nilai_bahasa_indonesia_sma', 'nilai_bahasa_inggris_sma']].mean(axis=1)
        df['mtk_ipa_interaction'] = df['nilai_mtk_sma'] * df['nilai_ipa_sma']
        df['science_score'] = df[['nilai_ipa_sma', 'nilai_fisika_sma']].mean(axis=1)
        df['language_score'] = df[['nilai_bahasa_indonesia_sma', 'nilai_bahasa_inggris_sma']].mean(axis=1)
        df['math_science_diff'] = df['nilai_mtk_sma'] - df['science_score']
        df['mtk_fisika_ratio'] = df['nilai_mtk_sma'] / (df['nilai_fisika_sma'] + 1e-6)
        df['language_science_diff'] = df['language_score'] - df['science_score']
        df['ipk_semester_interaction'] = df['ipk_sekarang'] * df['semester_sekarang']
        df['language_ratio'] = df['nilai_bahasa_inggris_sma'] / (df['nilai_bahasa_indonesia_sma'] + 1e-6)
        df['mtk_quadratic'] = df['nilai_mtk_sma'] ** 2
        df['science_ipk_interaction'] = df['science_score'] * df['ipk_sekarang']
        X = df[['nilai_mtk_sma', 'nilai_ipa_sma', 'nilai_fisika_sma',
                'nilai_bahasa_indonesia_sma', 'nilai_bahasa_inggris_sma',
                'semester_sekarang', 'ipk_sekarang', 'avg_nilai_sma',
                'mtk_ipa_interaction', 'science_score', 'language_score',
                'math_science_diff', 'mtk_fisika_ratio', 'language_science_diff',
                'ipk_semester_interaction', 'language_ratio', 'mtk_quadratic',
                'science_ipk_interaction']]
        y = df['jurusan_kuliah_sekarang']
        return X, y

    def preprocess_data(self, df):
        X, y = self._prepare_features(df)
        X_scaled = self.scaler.transform(X)
        X_enhanced = self.bls.transform(X_scaled)
        y_encoded = self.label_encoder.transform(y)
        return X_enhanced, y_encoded

    def train_local_model(self, X, y, client_id):
        """Tune and fit a client's local model with a grid search.

        A deep copy of ``self.local_models[client_id]`` is tuned over
        ``n_estimators``, ``max_depth`` and ``learning_rate`` using 3-fold
        cross-validation scored by weighted F1.

        The search is fitted on the training part of the *last* split of a
        shuffled 5-fold partition (seed 42), i.e. roughly 80% of the rows.
        Earlier code refitted the search on each of the five splits in turn
        and kept only the final fit; since every ``GridSearchCV.fit`` starts
        from scratch, fitting once on that last split yields the same model
        without the four discarded searches.

        Args:
            X: Preprocessed feature array for this client.
            y: Encoded label array aligned with ``X``.
            client_id: Index into ``self.local_models``.

        Returns:
            The best estimator found by the grid search, refitted on the
            selected training rows.
        """
        estimator = copy.deepcopy(self.local_models[client_id])
        param_grid = {
            'estimator__n_estimators': [100, 200],
            'estimator__max_depth': [3, 5, 7],
            'estimator__learning_rate': [0.01, 0.1]
        }
        grid_search = GridSearchCV(estimator, param_grid, cv=3, scoring='f1_weighted', n_jobs=-1)

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        train_idx, _ = list(kf.split(X))[-1]
        grid_search.fit(X[train_idx], y[train_idx])
        return grid_search.best_estimator_

    def get_top_k_predictions(self, model, X, k):
        probas = model.predict_proba(X)
        top_k_indices = np.argsort(probas, axis=1)[:, -k:]
        return top_k_indices

    def evaluate_model(self, model, X, y, dataset_id, model_type, client_id=None, round_num=None):
        """Print and return Top-1/2/3 scores of ``model`` over a 5-fold partition.

        ``X``/``y`` are split with a shuffled 5-fold ``KFold`` (seed 42).  The
        model is *not* refitted here: it is only asked to predict on the two
        parts of each split, which the printed report labels as training and
        test data.

        For every fold and every ``k`` in 1..3 a sample counts as a hit when
        its label is among the model's ``k`` most probable classes.  Accuracy
        is the hit rate.  Precision, recall and F1 come from
        ``precision_recall_fscore_support`` with the hit flags as ``y_true``
        and an all-ones vector as ``y_pred``; with that input precision equals
        the hit rate and recall is 1 whenever at least one hit exists.

        Args:
            model: Fitted classifier with ``predict`` and ``predict_proba``.
            X: Preprocessed feature array.
            y: Encoded label array aligned with ``X``.
            dataset_id: Identifier shown in the printed headers.
            model_type: Model description shown in the printed headers.
            client_id: Optional client identifier for the headers.
            round_num: Optional round number for the headers.

        Returns:
            A tuple of eight lists ``(train_acc, test_acc, train_prec,
            test_prec, train_rec, test_rec, train_f1, test_f1)``.  Each list
            holds the mean over folds for Top-1, Top-2 and Top-3, in that
            order.
        """
        top_ks = (1, 2, 3)
        metric_names = (('acc', 'Akurasi'), ('prec', 'Presisi'), ('rec', 'Recall'), ('f1', 'F1-Score'))
        split_names = (('train', 'Pelatihan'), ('test', 'Pengujian'))
        metrics = {
            k: {f'{split}_{metric}': [] for metric, _ in metric_names for split, _ in split_names}
            for k in top_ks
        }

        class_ids = np.arange(len(self.label_encoder.classes_))
        client_part = f" dari Klien {client_id}" if client_id is not None else ""
        round_part = f"Ronde {round_num}, " if round_num is not None else ""

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            cm_train = confusion_matrix(y_train, model.predict(X_train), labels=class_ids)
            cm_test = confusion_matrix(y_test, model.predict(X_test), labels=class_ids)

            print(f"\nEvaluasi Model {model_type}{client_part} pada Dataset {dataset_id} "
                  f"({round_part}Fold {fold_idx + 1}):")
            print("\nConfusion Matrix (Data Pelatihan):")
            print(cm_train)
            print("\nConfusion Matrix (Data Pengujian):")
            print(cm_test)

            # Rank once per split with the largest k; the last k columns of that
            # ranking are exactly the Top-k result for every smaller k.
            ranked = {
                'train': (self.get_top_k_predictions(model, X_train, max(top_ks)), y_train),
                'test': (self.get_top_k_predictions(model, X_test, max(top_ks)), y_test),
            }
            for k in top_ks:
                scores = {
                    split: self._top_k_scores(top_labels[:, -k:], y_true)
                    for split, (top_labels, y_true) in ranked.items()
                }
                print(f"\nEvaluasi Top-{k}:")
                for metric, metric_label in metric_names:
                    for split, split_label in split_names:
                        value = scores[split][metric]
                        metrics[k][f'{split}_{metric}'].append(value)
                        print(f"Evaluasi {metric_label} ke Data {split_label} (Top-{k}): {value:.4f}")

        for k in top_ks:
            print(f"\nRata-rata Evaluasi Model {model_type} pada Dataset {dataset_id} (Semua Fold, Top-{k}):")
            for metric, metric_label in metric_names:
                for split, split_label in split_names:
                    average = np.mean(metrics[k][f'{split}_{metric}'])
                    print(f"Rata-rata Evaluasi {metric_label} ke Data {split_label} (Top-{k}): {average:.4f}")

        # Metric-major, split-minor: acc(train, test), prec(...), rec(...), f1(...).
        return tuple(
            [np.mean(metrics[k][f'{split}_{metric}']) for k in top_ks]
            for metric, _ in metric_names
            for split, _ in split_names
        )

    def _top_k_scores(self, top_labels, y_true):
        """Score one split: hit rate plus precision/recall/F1 of the hit flags."""
        hits = (top_labels == np.asarray(y_true)[:, None]).any(axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            hits, [1] * len(hits), average='binary', zero_division=0
        )
        return {'acc': np.mean(hits), 'prec': precision, 'rec': recall, 'f1': f1}

    def aggregate_models(self, local_models, dataset_sizes, local_f1s):
        """Refresh the global model after a round and return it.

        NOTE(review): this step does not combine the local models.  The global
        model is refitted from scratch on the training part of the first split
        of a shuffled 5-fold partition (seed 42) of the first dataset.

        Earlier code in this method also (a) copied a share of each local
        model's per-class estimators into ``self.global_model.estimators_`` and
        (b) tried to blend ndarray-valued hyper-parameters between local and
        global estimators.  Neither had any effect on the returned model: the
        subsequent ``fit`` replaces ``estimators_`` wholesale, and the
        estimator hyper-parameters are presumably never NumPy arrays (confirm
        against the installed xgboost version), so the blending branch was
        never entered.  That work also raised an error when every F1 score was
        zero, because the F1-based weights became NaN.  It has been removed so
        the code states what actually happens.

        TODO: replace with a real aggregation scheme (for example an
        F1-weighted soft-voting ensemble of the local models).

        Args:
            local_models: Fitted local models of the round (currently unused).
            dataset_sizes: Row count per client (currently unused).
            local_f1s: Top-3 F1 score per client (currently unused).

        Returns:
            ``self.global_model`` after refitting.
        """
        X_first, y_first = self.preprocess_data(self.datasets[0])
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        train_idx, _ = next(iter(kf.split(X_first)))
        self.global_model.fit(X_first[train_idx], y_first[train_idx])
        return self.global_model

    def federated_training(self, n_rounds=5):
        """Run the local-train / evaluate / aggregate loop and print a summary.

        In every round each client trains a local model on its own data, every
        local model is evaluated on every dataset, the global model is
        refreshed through ``aggregate_models`` and evaluated on every dataset,
        and the local model with the best mean Top-3 F1 replaces the global
        model when it beats the global model's mean Top-3 F1.

        Args:
            n_rounds: Number of rounds to run.
        """
        # The scaler, BLS weights and label encoder are fixed after __init__,
        # so each dataset only needs to be preprocessed once.
        prepared = [self.preprocess_data(df) for df in self.datasets]
        n_clients = len(self.datasets)

        round_summaries = []
        for round_idx in range(n_rounds):
            round_num = round_idx + 1
            print(f"\n=== Ronde {round_num} ===")
            local_models = []
            local_f1s = []
            dataset_sizes = []
            # Top-3 F1 of each client's model: own dataset first, then the others.
            f1s_by_client = {i: [] for i in range(n_clients)}

            for i, (X, y) in enumerate(prepared):
                local_model = self.train_local_model(X, y, i)
                local_models.append(local_model)
                dataset_sizes.append(len(self.datasets[i]))
                test_f1 = self.evaluate_model(
                    local_model, X, y, i, "Lokal", client_id=i, round_num=round_num
                )[-1]
                f1s_by_client[i].append(test_f1[2])
                local_f1s.append(test_f1[2])

            # Cross-evaluate every local model on the other clients' data.
            for i, local_model in enumerate(local_models):
                for j, (X, y) in enumerate(prepared):
                    if i == j:
                        continue
                    test_f1 = self.evaluate_model(
                        local_model, X, y, j, "Lokal", client_id=i, round_num=round_num
                    )[-1]
                    f1s_by_client[i].append(test_f1[2])

            self.global_model = self.aggregate_models(local_models, dataset_sizes, local_f1s)
            global_test_f1s = []
            for i, (X, y) in enumerate(prepared):
                test_f1 = self.evaluate_model(
                    self.global_model, X, y, i, "Global", round_num=round_num
                )[-1]
                global_test_f1s.append(test_f1[2])

            avg_local_f1s = []
            for i in range(n_clients):
                avg_f1 = np.mean(f1s_by_client[i])
                avg_local_f1s.append((i, avg_f1))
                print(f"\nRata-rata Evaluasi F1-Score Model Lokal Klien {i} di Semua Dataset (Top-3): {avg_f1:.4f}")
            avg_global_f1 = np.mean(global_test_f1s)
            print(f"\nRata-rata Evaluasi F1-Score Model Global di Semua Dataset (Top-3): {avg_global_f1:.4f}")

            best_client, best_f1 = max(avg_local_f1s, key=lambda item: item[1], default=(None, 0))
            replaced = best_f1 > avg_global_f1
            if replaced:
                print(f"\nModel Lokal Klien {best_client} (F1: {best_f1:.4f}) lebih baik dari Model Global (F1: {avg_global_f1:.4f}). Mengganti Model Global.")
                self.global_model = copy.deepcopy(local_models[best_client])
                self.local_models[best_client] = copy.deepcopy(local_models[best_client])
            else:
                print(f"\nModel Global tetap digunakan (F1: {avg_global_f1:.4f}).")

            round_summaries.append({
                'round': round_num,
                'avg_local_f1s': avg_local_f1s,
                'avg_global_f1': avg_global_f1,
                'best_client': best_client if replaced else None
            })

        print("\n=== Ringkasan Akhir ===")
        for summary in round_summaries:
            print(f"\nRonde {summary['round']}:")
            for client_id, avg_f1 in summary['avg_local_f1s']:
                print(f"  Rata-rata Evaluasi F1-Score Model Lokal Klien {client_id} (Top-3): {avg_f1:.4f}")
            print(f"  Rata-rata Evaluasi F1-Score Model Global (Top-3): {summary['avg_global_f1']:.4f}")
            if summary['best_client'] is not None:
                print(f"  Model Global diganti dengan Model Lokal Klien {summary['best_client']}")

    def evaluate_final_models(self):
        """Evaluate the global model on every dataset and export its predictions.

        For each client dataset the global model is scored with
        ``evaluate_model``, and the dataset is written to
        ``predictions_client_<i>.csv`` with three extra columns holding the
        first, second and third most probable decoded labels.  Afterwards the
        per-dataset scores are averaged and printed for Top-1, Top-2 and Top-3.

        Returns:
            List with the Top-3 test accuracy of each dataset.
        """
        top_ks = (1, 2, 3)
        # Same order as the tuple returned by evaluate_model.
        result_keys = ('train_acc', 'test_acc', 'train_prec', 'test_prec',
                       'train_rec', 'test_rec', 'train_f1', 'test_f1')
        metrics = {k: {key: [] for key in result_keys} for k in top_ks}

        print("\n=== Evaluasi Akhir Model Global ===")
        for client_idx, frame in enumerate(self.datasets):
            X, y = self.preprocess_data(frame)
            results = self.evaluate_model(self.global_model, X, y, client_idx, "Global")
            for key, per_k_values in zip(result_keys, results):
                for k in top_ks:
                    metrics[k][key].append(per_k_values[k - 1])

            exported = frame.copy()
            ranked = self.get_top_k_predictions(self.global_model, X, 3)
            decoded = [self.label_encoder.inverse_transform(row) for row in ranked]
            # Rows are ordered by ascending probability, so the best guess sits
            # in the last position.
            for rank, position in ((1, 2), (2, 1), (3, 0)):
                exported[f'predicted_jurusan_top{rank}'] = [labels[position] for labels in decoded]
            exported.to_csv(f'predictions_client_{client_idx}.csv', index=False)

        metric_labels = (('acc', 'Akurasi'), ('prec', 'Presisi'), ('rec', 'Recall'), ('f1', 'F1-Score'))
        split_labels = (('train', 'Pelatihan'), ('test', 'Pengujian'))
        print("\nRingkasan Akhir untuk Dataset:")
        for k in top_ks:
            print(f"\nTop-{k}:")
            for metric, metric_label in metric_labels:
                for split, split_label in split_labels:
                    average = np.mean(metrics[k][f'{split}_{metric}'])
                    print(f"Rata-rata Evaluasi {metric_label} ke Data {split_label} (Top-{k}): {average:.4f}")
        return metrics[3]['test_acc']

# Driver: load the client CSVs, run one federated round, then evaluate.

# NOTE(review): presumably the row counts of the five client datasets; this
# module-level list is not read by any code visible here (the training loop
# builds its own local list of the same name) — confirm before relying on it.
dataset_sizes = [180, 186, 192, 198, 204]
# Reads dataset_0.csv ... dataset_4.csv from the working directory.
datasets = load_datasets()
# The constructor cleans the data, fits the shared preprocessing and trains
# the initial global model.
fl = FederatedLearning(datasets)
fl.federated_training(n_rounds=1)
# Also writes predictions_client_<i>.csv; the return value is the list of
# per-dataset Top-3 test accuracies.
accuracies = fl.evaluate_final_models()


=== Ronde 1 ===

Evaluasi Model Lokal dari Klien 0 pada Dataset 0 (Ronde 1, Fold 1):

Confusion Matrix (Data Pelatihan):
[[ 9  0  0  0  2  0  0  0  0  1]
 [ 0 13  2  0  0  0  0  2  0  0]
 [ 0  0 12  0  1  0  0  0  0  1]
 [ 0  0  0 10  0  0  0  0  1  1]
 [ 0  0  0  0 14  0  2  0  0  1]
 [ 0  0  1  0  0 12  0  0  1  0]
 [ 0  0  1  2  0  0  9  0  0  1]
 [ 0  0  0  0  0  0  0 10  2  0]
 [ 0  0  0  0  0  3  0  0 11  0]
 [ 0  0  0  0  1  1  0  0  0 13]]

Confusion Matrix (Data Pengujian):
[[1 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]
 [0 0 4 0 0 0 0 0 0 0]
 [0 0 0 6 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 4 0 0 0 0]
 [0 0 0 0 0 0 5 0 0 0]
 [0 0 0 0 0 0 0 6 0 0]
 [0 0 0 0 0 0 0 0 4 0]
 [0 0 0 0 0 0 0 0 0 3]]

Evaluasi Top-1:
Evaluasi Akurasi ke Data Pelatihan (Top-1): 0.8071
Evaluasi Akurasi ke Data Pengujian (Top-1): 1.0000
Evaluasi Presisi ke Data Pelatihan (Top-1): 0.8071
Evaluasi Presisi ke Data Pengujian (Top-1): 1.0000
Evaluasi Recall ke Data Pelatihan (Top-1): 1.0000
Evaluasi 