## 1ST TRY WITH SOFT LABELLING and Active Learning

In [6]:
c = pd.read_parquet(r'C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet')

In [17]:
type(c.dense_embedding)

pandas.core.series.Series

In [12]:
c = c.rename({'embedding': 'dense_embedding'}, axis=1)

In [18]:
c.head()

Unnamed: 0,author_id,text,cleaned_text,dense_embedding
0,22,Sea levels aren't rising or anything. Amazing ...,Sea levels rising Amazing photos Venice underw...,"[-0.07484601, 0.025625845, 0.09780532, 0.05052..."
1,224,I just dumped a bucket of room temperature wat...,I dumped bucket room temperature water head cl...,"[-0.01502552, 0.026903296, 0.10912616, -0.0106..."
2,509,interested in taking action on #ClimateChange?...,interested taking action ClimateChange help fu...,"[-0.023086004, 0.025529342, 0.10543756, 0.0440..."
3,785,"""But I'm already using #5G on my AT&amp;T phon...",But I AT amp T phone Every customer duped AT a...,"[-0.045471866, 0.009822818, 0.058821313, -0.05..."
4,985,"In the latest @taprootyeg story, @stephencooke...",In latest story looks access means Edmonton re...,"[0.028260324, 0.02326092, -0.0033834244, 0.022..."


In [14]:
c.to_parquet(r'C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet')

In [1]:
import pandas as pd
import numpy as np
import faiss
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling
import logging
import time
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq
from tqdm.notebook import tqdm

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class CatBoostActiveLearningPipeline:
    def __init__(self, labeled_data_file: str, augmented_data_file: str, unlabeled_data_file: str, 
                 embedding_column: str, label_column: str, test_size: float = 0.2, 
                 initial_labeled_ratio: float = 0.3, random_state: int = 42, batch_size: int = 1000):
        self.labeled_data_file = labeled_data_file
        self.augmented_data_file = augmented_data_file
        self.unlabeled_data_file = unlabeled_data_file
        self.embedding_column = embedding_column
        self.label_column = label_column
        self.test_size = test_size
        self.initial_labeled_ratio = initial_labeled_ratio
        self.random_state = random_state
        self.batch_size = batch_size
        self.le = LabelEncoder()
        
        self._log_init_info()
        self._load_and_preprocess_data()
        
    def _log_init_info(self) -> None:
        logger.info("Initializing CatBoostActiveLearningPipeline")
        logger.info(f"Labeled data file: {self.labeled_data_file}")
        logger.info(f"Augmented data file: {self.augmented_data_file}")
        logger.info(f"Unlabeled data file: {self.unlabeled_data_file}")
        logger.info(f"Embedding column: {self.embedding_column}")
        logger.info(f"Label column: {self.label_column}")
        logger.info(f"Test size: {self.test_size}")
        logger.info(f"Initial labeled ratio: {self.initial_labeled_ratio}")
        logger.info(f"Random state: {self.random_state}")
        logger.info(f"Batch size: {self.batch_size}")

    def _process_embeddings(self, df):
        def process_embedding(x):
            if isinstance(x, str):
                x = x.strip('[]')
                try:
                    return np.array([float(i) for i in x.split(',')])
                except ValueError:
                    return np.array([float(i) for i in x.split()])
            elif isinstance(x, np.ndarray):
                return x
            else:
                raise ValueError(f"Unexpected embedding format: {type(x)}")

        df[self.embedding_column] = df[self.embedding_column].apply(process_embedding)
        return df

    def _load_and_preprocess_data(self) -> None:
        start_time = time.time()
        logger.info("Loading and preprocessing data...")
        
        # Load labeled data
        self.labeled_data = pd.read_csv(self.labeled_data_file)
        self.labeled_data = self._process_embeddings(self.labeled_data)
        logger.info(f"Labeled data shape: {self.labeled_data.shape}")
        
        # Load augmented data
        self.augmented_data = pd.read_csv(self.augmented_data_file)
        self.augmented_data = self._process_embeddings(self.augmented_data)
        logger.info(f"Augmented data shape: {self.augmented_data.shape}")
        
        # Combine labeled and augmented data for label encoding
        all_data = pd.concat([self.labeled_data, self.augmented_data])
        
        # Encode labels
        self.le.fit(all_data[self.label_column])
        self.labeled_data['encoded_label'] = self.le.transform(self.labeled_data[self.label_column])
        self.augmented_data['encoded_label'] = self.le.transform(self.augmented_data[self.label_column])
        
        # Initialize FAISS index
        self.dimension = len(self.labeled_data[self.embedding_column].iloc[0])
        self.index = faiss.IndexFlatL2(self.dimension)
        
        # Load unlabeled data and add to FAISS index
        self.unlabeled_data = []
        parquet_file = pq.ParquetFile(self.unlabeled_data_file)
        
        for batch in parquet_file.iter_batches(batch_size=self.batch_size, columns=[self.embedding_column]):
            df_chunk = batch.to_pandas()
            df_chunk = self._process_embeddings(df_chunk)
            embeddings = np.stack(df_chunk[self.embedding_column].values)
            self.index.add(embeddings)
            self.unlabeled_data.append(embeddings)
        
        self.unlabeled_data = np.vstack(self.unlabeled_data)
        logger.info(f"Unlabeled data shape: {self.unlabeled_data.shape}")
        logger.info(f"FAISS index size: {self.index.ntotal}")
        
        end_time = time.time()
        logger.info(f"Data loaded and preprocessed in {end_time - start_time:.2f} seconds")

    def prepare_data(self):
        logger.info("Preparing data for active learning")
        
        # Combine labeled and augmented data
        X = np.vstack([
            np.stack(self.labeled_data[self.embedding_column].values),
            np.stack(self.augmented_data[self.embedding_column].values)
        ])
        y = np.concatenate([
            self.labeled_data['encoded_label'].values,
            self.augmented_data['encoded_label'].values
        ])
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state, stratify=y)
        
        X_initial, X_pool, y_initial, y_pool = train_test_split(
            X_train, y_train, train_size=self.initial_labeled_ratio, 
            random_state=self.random_state, stratify=y_train
        )
        
        logger.info(f"Initial labeled set shape: {X_initial.shape}")
        logger.info(f"Pool set shape: {X_pool.shape}")
        logger.info(f"Test set shape: {X_test.shape}")
        
        return X_initial, y_initial, X_pool, y_pool, X_test, y_test

    def train_and_evaluate(self, learner, X_test, y_test):
        y_pred = learner.predict(X_test)
        y_pred_proba = learner.predict_proba(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        auc_roc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
        
        return accuracy, f1, auc_roc

    def find_similar_samples(self, query_embeddings, k=5):
        distances, indices = self.index.search(query_embeddings, k)
        return distances, indices

    def soft_labeling(self, uncertain_samples, uncertain_labels, n_similar=5):
        distances, similar_indices = self.find_similar_samples(uncertain_samples, k=n_similar)
        
        soft_labels = np.zeros((len(similar_indices.flatten()), len(self.le.classes_)))
        for i, (dist, indices) in enumerate(zip(distances, similar_indices)):
            weights = np.exp(-dist)
            soft_labels[i*n_similar:(i+1)*n_similar] = weights[:, np.newaxis] * (uncertain_labels[i] == np.arange(len(self.le.classes_)))
        soft_labels /= soft_labels.sum(axis=1, keepdims=True)
        
        return self.unlabeled_data[similar_indices.flatten()], soft_labels

    def train_with_active_learning(self, X_initial, y_initial, X_pool, y_pool, X_test, y_test, 
                                   n_queries=50, n_instances_per_query=100, n_similar=5):
        logger.info("Starting active learning with CatBoost")
        start_time = time.time()

        catboost_model = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.1,
            depth=6,
            loss_function='MultiClass',
            random_seed=self.random_state,
            eval_metric='MultiClass',
            auto_class_weights='Balanced',
            verbose=100
        )

        learner = ActiveLearner(
            estimator=catboost_model,
            X_training=X_initial,
            y_training=y_initial,
            query_strategy=entropy_sampling
        )

        performance_history = []
        best_accuracy = 0
        iterations_without_improvement = 0
        max_iterations_without_improvement = 10

        for iteration in tqdm(range(n_queries), desc="Active Learning Progress"):
            query_idx, _ = learner.query(X_pool, n_instances=n_instances_per_query)
            
            uncertain_samples = X_pool[query_idx]
            uncertain_labels = y_pool[query_idx]
            
            # Get similar samples and their soft labels
            similar_samples, soft_labels = self.soft_labeling(uncertain_samples, uncertain_labels, n_similar)
            
            # Combine uncertain samples and similar samples
            X_to_teach = np.vstack([uncertain_samples, similar_samples])
            y_to_teach = np.concatenate([uncertain_labels, np.argmax(soft_labels, axis=1)])
            
            # Teach the model
            learner.teach(X_to_teach, y_to_teach)
            
            # Remove queried samples from the pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)
            
            # Evaluate the model
            accuracy, f1, auc_roc = self.train_and_evaluate(learner, X_test, y_test)
            performance_history.append((accuracy, f1, auc_roc))
            
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                iterations_without_improvement = 0
                self.best_model = learner
            else:
                iterations_without_improvement += 1

            if (iteration + 1) % 5 == 0:
                logger.info(f"Iteration {iteration + 1}/{n_queries}: Accuracy = {accuracy:.4f}, F1 = {f1:.4f}, AUC-ROC = {auc_roc:.4f}")

            if iterations_without_improvement >= max_iterations_without_improvement:
                logger.info(f"Early stopping at iteration {iteration + 1} due to no improvement")
                break

        end_time = time.time()
        logger.info(f"Active learning completed in {end_time - start_time:.2f} seconds")
        logger.info(f"Final performance: Accuracy = {accuracy:.4f}, F1 = {f1:.4f}, AUC-ROC = {auc_roc:.4f}")

        return performance_history, learner

    def run_pipeline(self):
        logger.info("Starting CatBoost active learning pipeline")
        start_time = time.time()

        X_initial, y_initial, X_pool, y_pool, X_test, y_test = self.prepare_data()
        performance_history, final_model = self.train_with_active_learning(
            X_initial, y_initial, X_pool, y_pool, X_test, y_test
        )

        y_pred = final_model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred, target_names=self.le.classes_)
        
        end_time = time.time()
        logger.info(f"CatBoost Active Learning Pipeline completed in {end_time - start_time:.2f} seconds")
        logger.info("Classification Report:\n" + class_report)

        self.plot_results(performance_history)
        self.plot_confusion_matrix(cm)

        return performance_history, cm, class_report, final_model

    def plot_results(self, performance_history):
        accuracies, f1_scores, auc_rocs = zip(*performance_history)
        plt.figure(figsize=(12, 6))
        plt.plot(accuracies, label='Accuracy')
        plt.plot(f1_scores, label='F1 Score')
        plt.plot(auc_rocs, label='AUC-ROC')
        plt.xlabel('Iteration')
        plt.ylabel('Score')
        plt.title('Model Performance over Active Learning Iterations')
        plt.legend()
        plt.tight_layout()
        plt.savefig('catboost_active_learning_results.png')
        logger.info("Results plot saved as 'catboost_active_learning_results.png'")
        plt.close()

    def plot_confusion_matrix(self, cm):
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        plt.title('Confusion Matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig('catboost_confusion_matrix.png')
        logger.info("Confusion matrix saved as 'catboost_confusion_matrix.png'")
        plt.close()

# Usage
labeled_data_file = 'sbert_data_with_distance_features.csv'
augmented_data_file = 'cleaned_proscience_da.csv'
unlabeled_data_file = r'C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet'

active_learning_pipeline = CatBoostActiveLearningPipeline(labeled_data_file, augmented_data_file, unlabeled_data_file, "dense_embedding", "cleaned_classification")
performance_history, cm, class_report, final_model = active_learning_pipeline.run_pipeline()

print(class_report)

2024-09-25 12:40:35,014 - INFO - Initializing CatBoostActiveLearningPipeline
2024-09-25 12:40:35,015 - INFO - Labeled data file: sbert_data_with_distance_features.csv
2024-09-25 12:40:35,015 - INFO - Augmented data file: cleaned_proscience_da.csv
2024-09-25 12:40:35,016 - INFO - Unlabeled data file: C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet
2024-09-25 12:40:35,017 - INFO - Embedding column: dense_embedding
2024-09-25 12:40:35,018 - INFO - Label column: cleaned_classification
2024-09-25 12:40:35,019 - INFO - Test size: 0.2
2024-09-25 12:40:35,020 - INFO - Initial labeled ratio: 0.3
2024-09-25 12:40:35,021 - INFO - Random state: 42
2024-09-25 12:40:35,021 - INFO - Batch size: 1000
2024-09-25 12:40:35,022 - INFO - Loading and preprocessing data...
2024-09-25 12:40:47,626 - INFO - Labeled data shape: (79763, 13)
2024-09-25 12:40:48,662 - INFO - Augmented data shape: (5378, 4)
2024-09-25 12:40:52,661 - INFO - Unlabeled data shape: (412879, 384)
2024-09-25 12

0:	learn: 1.0694043	total: 287ms	remaining: 4m 47s
100:	learn: 0.6936732	total: 9.32s	remaining: 1m 22s
200:	learn: 0.5843712	total: 20.8s	remaining: 1m 22s
300:	learn: 0.5123744	total: 34.9s	remaining: 1m 20s
400:	learn: 0.4579681	total: 48.8s	remaining: 1m 12s
500:	learn: 0.4133818	total: 1m 1s	remaining: 1m 1s
600:	learn: 0.3757239	total: 1m 14s	remaining: 49.8s
700:	learn: 0.3440428	total: 1m 28s	remaining: 37.8s
800:	learn: 0.3190920	total: 1m 43s	remaining: 25.7s
900:	learn: 0.2952294	total: 1m 57s	remaining: 12.9s
999:	learn: 0.2750112	total: 2m 10s	remaining: 0us


Active Learning Progress:   0%|          | 0/50 [00:00<?, ?it/s]

0:	learn: 1.0706507	total: 135ms	remaining: 2m 15s
100:	learn: 0.7081786	total: 15.2s	remaining: 2m 15s
200:	learn: 0.6038808	total: 29.1s	remaining: 1m 55s
300:	learn: 0.5342437	total: 42.9s	remaining: 1m 39s
400:	learn: 0.4794415	total: 56.7s	remaining: 1m 24s
500:	learn: 0.4335197	total: 1m 10s	remaining: 1m 10s
600:	learn: 0.3960181	total: 1m 24s	remaining: 55.9s
700:	learn: 0.3645134	total: 1m 38s	remaining: 41.9s
800:	learn: 0.3364766	total: 1m 52s	remaining: 28s
900:	learn: 0.3118957	total: 2m 10s	remaining: 14.3s
999:	learn: 0.2910103	total: 2m 28s	remaining: 0us
0:	learn: 1.0702972	total: 223ms	remaining: 3m 43s
100:	learn: 0.7226446	total: 23.8s	remaining: 3m 32s
200:	learn: 0.6202382	total: 46.2s	remaining: 3m 3s
300:	learn: 0.5499214	total: 1m 5s	remaining: 2m 32s
400:	learn: 0.4957315	total: 1m 25s	remaining: 2m 7s
500:	learn: 0.4507657	total: 1m 43s	remaining: 1m 43s
600:	learn: 0.4133899	total: 2m	remaining: 1m 20s
700:	learn: 0.3804636	total: 2m 17s	remaining: 58.7s
800

2024-09-25 12:58:22,017 - INFO - Iteration 5/50: Accuracy = 0.7524, F1 = 0.6785, AUC-ROC = 0.8860


0:	learn: 1.0730762	total: 198ms	remaining: 3m 17s
100:	learn: 0.7681983	total: 19.4s	remaining: 2m 52s
200:	learn: 0.6741008	total: 34.8s	remaining: 2m 18s
300:	learn: 0.6078540	total: 53.1s	remaining: 2m 3s
400:	learn: 0.5543302	total: 1m 11s	remaining: 1m 46s
500:	learn: 0.5091633	total: 1m 29s	remaining: 1m 28s
600:	learn: 0.4709927	total: 1m 45s	remaining: 1m 10s
700:	learn: 0.4365022	total: 2m 3s	remaining: 52.6s
800:	learn: 0.4067177	total: 2m 20s	remaining: 35s
900:	learn: 0.3806453	total: 2m 38s	remaining: 17.4s
999:	learn: 0.3573030	total: 2m 58s	remaining: 0us
0:	learn: 1.0748881	total: 212ms	remaining: 3m 31s
100:	learn: 0.7774178	total: 24.1s	remaining: 3m 34s
200:	learn: 0.6856289	total: 42.3s	remaining: 2m 48s
300:	learn: 0.6188772	total: 1m 1s	remaining: 2m 21s
400:	learn: 0.5659025	total: 1m 22s	remaining: 2m 3s
500:	learn: 0.5209002	total: 1m 43s	remaining: 1m 43s
600:	learn: 0.4816701	total: 2m 2s	remaining: 1m 21s
700:	learn: 0.4480347	total: 2m 20s	remaining: 59.8s

2024-09-25 13:14:30,214 - INFO - Iteration 10/50: Accuracy = 0.7520, F1 = 0.6781, AUC-ROC = 0.8848


0:	learn: 1.0771502	total: 174ms	remaining: 2m 53s
100:	learn: 0.8064328	total: 18.2s	remaining: 2m 42s
200:	learn: 0.7235872	total: 36.2s	remaining: 2m 23s
300:	learn: 0.6603970	total: 55.8s	remaining: 2m 9s
400:	learn: 0.6078438	total: 1m 13s	remaining: 1m 50s
500:	learn: 0.5642396	total: 1m 31s	remaining: 1m 30s
600:	learn: 0.5267717	total: 1m 50s	remaining: 1m 13s
700:	learn: 0.4931105	total: 2m 9s	remaining: 55.4s
800:	learn: 0.4634636	total: 2m 28s	remaining: 37s
900:	learn: 0.4365160	total: 2m 47s	remaining: 18.4s
999:	learn: 0.4115526	total: 3m 3s	remaining: 0us
0:	learn: 1.0783407	total: 166ms	remaining: 2m 45s
100:	learn: 0.8139096	total: 19.6s	remaining: 2m 54s
200:	learn: 0.7323787	total: 37.9s	remaining: 2m 30s
300:	learn: 0.6710381	total: 54.4s	remaining: 2m 6s
400:	learn: 0.6207600	total: 1m 13s	remaining: 1m 49s
500:	learn: 0.5778823	total: 1m 35s	remaining: 1m 34s
600:	learn: 0.5411553	total: 1m 55s	remaining: 1m 16s
700:	learn: 0.5058850	total: 2m 14s	remaining: 57.2s

2024-09-25 13:31:13,702 - INFO - Iteration 15/50: Accuracy = 0.7553, F1 = 0.6818, AUC-ROC = 0.8846


0:	learn: 1.0816008	total: 147ms	remaining: 2m 27s
100:	learn: 0.8393204	total: 18.6s	remaining: 2m 45s
200:	learn: 0.7587596	total: 37.3s	remaining: 2m 28s
300:	learn: 0.7011721	total: 57.9s	remaining: 2m 14s
400:	learn: 0.6537182	total: 1m 18s	remaining: 1m 57s
500:	learn: 0.6121701	total: 1m 36s	remaining: 1m 36s
600:	learn: 0.5757628	total: 1m 55s	remaining: 1m 16s
700:	learn: 0.5425076	total: 2m 12s	remaining: 56.4s
800:	learn: 0.5133795	total: 2m 29s	remaining: 37.1s
900:	learn: 0.4876971	total: 2m 47s	remaining: 18.4s
999:	learn: 0.4637570	total: 3m 4s	remaining: 0us
0:	learn: 1.0807246	total: 157ms	remaining: 2m 36s
100:	learn: 0.8440908	total: 15.9s	remaining: 2m 21s
200:	learn: 0.7669508	total: 33.5s	remaining: 2m 13s
300:	learn: 0.7100499	total: 50.8s	remaining: 1m 58s
400:	learn: 0.6627244	total: 1m 8s	remaining: 1m 41s
500:	learn: 0.6214245	total: 1m 27s	remaining: 1m 27s
600:	learn: 0.5857818	total: 1m 48s	remaining: 1m 12s
700:	learn: 0.5534521	total: 2m 7s	remaining: 54

2024-09-25 13:47:35,276 - INFO - Iteration 20/50: Accuracy = 0.7493, F1 = 0.6751, AUC-ROC = 0.8843


0:	learn: 1.0819914	total: 164ms	remaining: 2m 43s
100:	learn: 0.8653197	total: 19.8s	remaining: 2m 55s
200:	learn: 0.7917901	total: 38.7s	remaining: 2m 33s
300:	learn: 0.7365635	total: 57.9s	remaining: 2m 14s
400:	learn: 0.6900355	total: 1m 16s	remaining: 1m 54s
500:	learn: 0.6511686	total: 1m 34s	remaining: 1m 34s
600:	learn: 0.6162716	total: 1m 53s	remaining: 1m 15s
700:	learn: 0.5846882	total: 2m 13s	remaining: 56.8s
800:	learn: 0.5570097	total: 2m 34s	remaining: 38.4s
900:	learn: 0.5312911	total: 2m 54s	remaining: 19.2s
999:	learn: 0.5082043	total: 3m 14s	remaining: 0us
0:	learn: 1.0822802	total: 333ms	remaining: 5m 32s
100:	learn: 0.8705503	total: 26.5s	remaining: 3m 55s
200:	learn: 0.7974928	total: 46.7s	remaining: 3m 5s
300:	learn: 0.7414811	total: 1m 7s	remaining: 2m 37s
400:	learn: 0.6955372	total: 1m 27s	remaining: 2m 10s
500:	learn: 0.6563021	total: 1m 46s	remaining: 1m 45s
600:	learn: 0.6218835	total: 2m 6s	remaining: 1m 23s
700:	learn: 0.5918484	total: 2m 26s	remaining: 1

2024-09-25 13:54:46,098 - INFO - Early stopping at iteration 22 due to no improvement
2024-09-25 13:54:46,102 - INFO - Active learning completed in 4432.88 seconds
2024-09-25 13:54:46,103 - INFO - Final performance: Accuracy = 0.7524, F1 = 0.6772, AUC-ROC = 0.8853
2024-09-25 13:54:47,769 - INFO - CatBoost Active Learning Pipeline completed in 4435.10 seconds
2024-09-25 13:54:47,770 - INFO - Classification Report:
              precision    recall  f1-score   support

 antiscience       0.67      0.73      0.70      5337
     neutral       0.87      0.82      0.85      9541
  proscience       0.47      0.50      0.48      2151

    accuracy                           0.75     17029
   macro avg       0.67      0.68      0.68     17029
weighted avg       0.76      0.75      0.76     17029

2024-09-25 13:54:48,269 - INFO - Results plot saved as 'catboost_active_learning_results.png'
2024-09-25 13:54:48,491 - INFO - Confusion matrix saved as 'catboost_confusion_matrix.png'


              precision    recall  f1-score   support

 antiscience       0.67      0.73      0.70      5337
     neutral       0.87      0.82      0.85      9541
  proscience       0.47      0.50      0.48      2151

    accuracy                           0.75     17029
   macro avg       0.67      0.68      0.68     17029
weighted avg       0.76      0.75      0.76     17029



# 2. NN with consitency loss (with mse)

In [2]:
import pandas as pd
import numpy as np
import faiss
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import logging
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import pyarrow.parquet as pq

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class PyTorchActiveLearningPipeline:
    def __init__(self, labeled_data_file: str, augmented_data_file: str, unlabeled_data_file: str, 
                 embedding_column: str, label_column: str, test_size: float = 0.2, 
                 initial_labeled_ratio: float = 0.3, random_state: int = 42, batch_size: int = 64):
        self.labeled_data_file = labeled_data_file
        self.augmented_data_file = augmented_data_file
        self.unlabeled_data_file = unlabeled_data_file
        self.embedding_column = embedding_column
        self.label_column = label_column
        self.test_size = test_size
        self.initial_labeled_ratio = initial_labeled_ratio
        self.random_state = random_state
        self.batch_size = batch_size
        self.le = LabelEncoder()
        
        self._log_init_info()
        self._load_and_preprocess_data()
        
    def _log_init_info(self) -> None:
        logger.info("Initializing PyTorchActiveLearningPipeline")
        logger.info(f"Labeled data file: {self.labeled_data_file}")
        logger.info(f"Augmented data file: {self.augmented_data_file}")
        logger.info(f"Unlabeled data file: {self.unlabeled_data_file}")
        logger.info(f"Embedding column: {self.embedding_column}")
        logger.info(f"Label column: {self.label_column}")
        logger.info(f"Test size: {self.test_size}")
        logger.info(f"Initial labeled ratio: {self.initial_labeled_ratio}")
        logger.info(f"Random state: {self.random_state}")
        logger.info(f"Batch size: {self.batch_size}")

    def _process_embeddings(self, df):
        def process_embedding(x):
            if isinstance(x, str):
                x = x.strip('[]')
                try:
                    return np.array([float(i) for i in x.split(',')])
                except ValueError:
                    return np.array([float(i) for i in x.split()])
            elif isinstance(x, np.ndarray):
                return x
            else:
                raise ValueError(f"Unexpected embedding format: {type(x)}")

        df[self.embedding_column] = df[self.embedding_column].apply(process_embedding)
        return df

    def _load_and_preprocess_data(self) -> None:
        start_time = time.time()
        logger.info("Loading and preprocessing data...")
        
        self.labeled_data = pd.read_csv(self.labeled_data_file)
        self.labeled_data = self._process_embeddings(self.labeled_data)
        logger.info(f"Labeled data shape: {self.labeled_data.shape}")
        
        self.augmented_data = pd.read_csv(self.augmented_data_file)
        self.augmented_data = self._process_embeddings(self.augmented_data)
        logger.info(f"Augmented data shape: {self.augmented_data.shape}")
        
        all_data = pd.concat([self.labeled_data, self.augmented_data])
        
        self.le.fit(all_data[self.label_column])
        self.labeled_data['encoded_label'] = self.le.transform(self.labeled_data[self.label_column])
        self.augmented_data['encoded_label'] = self.le.transform(self.augmented_data[self.label_column])
        
        # Load unlabeled data
        self.dimension = len(self.labeled_data[self.embedding_column].iloc[0])
        self.unlabeled_data = []
        parquet_file = pq.ParquetFile(self.unlabeled_data_file)
        
        for batch in parquet_file.iter_batches(batch_size=self.batch_size, columns=[self.embedding_column]):
            df_chunk = batch.to_pandas()
            df_chunk = self._process_embeddings(df_chunk)
            embeddings = np.stack(df_chunk[self.embedding_column].values)
            self.unlabeled_data.append(embeddings)
        
        self.unlabeled_data = np.vstack(self.unlabeled_data)
        logger.info(f"Unlabeled data shape: {self.unlabeled_data.shape}")
        
        # Build FAISS index
        self.index = faiss.IndexFlatL2(self.dimension)
        self.index.add(self.unlabeled_data.astype('float32'))
        logger.info(f"FAISS index size: {self.index.ntotal}")
        
        end_time = time.time()
        logger.info(f"Data loaded and preprocessed in {end_time - start_time:.2f} seconds")

    def prepare_data(self):
        logger.info("Preparing data for active learning")
        
        X = np.vstack([
            np.stack(self.labeled_data[self.embedding_column].values),
            np.stack(self.augmented_data[self.embedding_column].values)
        ])
        y = np.concatenate([
            self.labeled_data['encoded_label'].values,
            self.augmented_data['encoded_label'].values
        ])
        
        # Ensure that the initial labeled set contains samples from all classes
        classes = np.unique(y)
        initial_indices = []
        for cls in classes:
            idx = np.where(y == cls)[0]
            initial_indices.append(idx[0])  # Take the first instance of each class

        remaining_indices = [i for i in range(len(y)) if i not in initial_indices]

        X_initial = X[initial_indices]
        y_initial = y[initial_indices]

        # Now sample the rest of the initial data
        additional_initial_size = int(self.initial_labeled_ratio * len(X)) - len(initial_indices)
        X_remaining = X[remaining_indices]
        y_remaining = y[remaining_indices]

        X_additional_initial, X_pool, y_additional_initial, y_pool = train_test_split(
            X_remaining, y_remaining, train_size=additional_initial_size,
            random_state=self.random_state, stratify=y_remaining
        )

        X_initial = np.vstack([X_initial, X_additional_initial])
        y_initial = np.concatenate([y_initial, y_additional_initial])

        # Split the pool into pool and test sets
        X_pool, X_test, y_pool, y_test = train_test_split(
            X_pool, y_pool, test_size=self.test_size, random_state=self.random_state, stratify=y_pool
        )

        logger.info(f"Initial labeled set shape: {X_initial.shape}")
        logger.info(f"Pool set shape: {X_pool.shape}")
        logger.info(f"Test set shape: {X_test.shape}")
        
        # Convert to PyTorch tensors
        self.X_initial = torch.tensor(X_initial, dtype=torch.float32)
        self.y_initial = torch.tensor(y_initial, dtype=torch.long)
        self.X_pool = torch.tensor(X_pool, dtype=torch.float32)
        self.y_pool = torch.tensor(y_pool, dtype=torch.long)  # For simulation
        self.X_test = torch.tensor(X_test, dtype=torch.float32)
        self.y_test = torch.tensor(y_test, dtype=torch.long)

    def train_and_evaluate(self, model, X_test, y_test):
        model.eval()
        with torch.no_grad():
            outputs = model(X_test)
            _, y_pred = torch.max(outputs, 1)
            y_pred_proba = F.softmax(outputs, dim=1)

            y_test_np = y_test.numpy()
            y_pred_np = y_pred.numpy()
            y_pred_proba_np = y_pred_proba.numpy()

            accuracy = accuracy_score(y_test_np, y_pred_np)
            f1 = f1_score(y_test_np, y_pred_np, average='macro')
            auc_roc = roc_auc_score(y_test_np, y_pred_proba_np, multi_class='ovr')

        return accuracy, f1, auc_roc

    def get_uncertainty_scores(self, model, X_pool):
        model.eval()
        with torch.no_grad():
            outputs = model(X_pool)
            probabilities = F.softmax(outputs, dim=1)
            entropy = -torch.sum(probabilities * torch.log(probabilities + 1e-5), dim=1)
        return entropy

    def query_samples(self, model, X_pool, n_instances):
        uncertainties = self.get_uncertainty_scores(model, X_pool)
        _, query_indices = torch.topk(uncertainties, n_instances)
        return query_indices

    def find_similar_samples(self, sample, k=5):
        sample_np = sample.numpy().reshape(1, -1).astype('float32')
        distances, indices = self.index.search(sample_np, k)
        return indices[0]  # Return indices of similar samples

    def train_with_active_learning(self, n_queries=50, n_instances_per_query=100, k_similar=5):
        logger.info("Starting active learning with PyTorch and consistency regularization")
        start_time = time.time()

        input_dim = self.X_initial.shape[1]
        num_classes = len(self.le.classes_)
        model = ClassificationModel(input_dim, num_classes)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        num_epochs = 5
        lambda_c = 1.0  # Weight for consistency loss

        # Initialize labeled dataset and dataloader
        X_labeled = self.X_initial.clone()
        y_labeled = self.y_initial.clone()

        labeled_dataset = LabeledDataset(X_labeled, y_labeled)
        labeled_loader = DataLoader(labeled_dataset, batch_size=self.batch_size, shuffle=True)

        X_pool = self.X_pool.clone()
        y_pool = self.y_pool.clone()
        X_unlabeled = torch.tensor(self.unlabeled_data, dtype=torch.float32)

        performance_history = []
        best_accuracy = 0
        iterations_without_improvement = 0
        max_iterations_without_improvement = 10

        for query_round in tqdm(range(n_queries), desc="Active Learning Progress"):
            logger.info(f"Query Round {query_round + 1}/{n_queries}")

            # Training the model
            for epoch in range(num_epochs):
                model.train()
                total_loss = 0.0

                for X_batch, y_batch in labeled_loader:
                    optimizer.zero_grad()
                    outputs = model(X_batch)
                    supervised_loss = F.cross_entropy(outputs, y_batch)

                    # No consistency loss in initial training
                    consistency_loss = 0.0

                    loss = supervised_loss + lambda_c * consistency_loss
                    loss.backward()
                    optimizer.step()

                    total_loss += loss.item()

                logger.info(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(labeled_loader):.4f}")

            # Query new samples
            query_indices = self.query_samples(model, X_pool, n_instances_per_query)
            X_query = X_pool[query_indices]
            y_query = y_pool[query_indices]  # For simulation; in practice, get labels from oracle

            # Find similar samples using FAISS
            similar_indices_list = []
            for sample in X_query:
                indices = self.find_similar_samples(sample, k=k_similar)
                similar_indices_list.extend(indices)

            similar_indices = list(set(similar_indices_list))
            X_similar = X_unlabeled[similar_indices]

            # Remove similar samples from X_unlabeled and update FAISS index
            mask_unlabeled = torch.ones(len(X_unlabeled), dtype=torch.bool)
            mask_unlabeled[similar_indices] = False
            X_unlabeled = X_unlabeled[mask_unlabeled]

            # Rebuild FAISS index
            self.index = faiss.IndexFlatL2(self.dimension)
            self.index.add(X_unlabeled.numpy().astype('float32'))

            # Update labeled dataset
            X_labeled = torch.cat([X_labeled, X_query], dim=0)
            y_labeled = torch.cat([y_labeled, y_query], dim=0)

            # Remove queried samples from X_pool and y_pool
            mask_pool = torch.ones(len(X_pool), dtype=torch.bool)
            mask_pool[query_indices] = False
            X_pool = X_pool[mask_pool]
            y_pool = y_pool[mask_pool]

            # Update the labeled dataset and dataloader
            labeled_dataset = LabeledDataset(X_labeled, y_labeled)
            labeled_loader = DataLoader(labeled_dataset, batch_size=self.batch_size, shuffle=True)

            # Retrain the model with consistency regularization
            for epoch in range(num_epochs):
                model.train()
                total_loss = 0.0

                for X_batch, y_batch in labeled_loader:
                    optimizer.zero_grad()
                    outputs = model(X_batch)
                    supervised_loss = F.cross_entropy(outputs, y_batch)

                    # Consistency loss between queried samples and similar samples
                    consistency_loss = 0.0
                    if X_query.size(0) > 0 and X_similar.size(0) > 0:
                        outputs_query = model(X_query)
                        probs_query = F.softmax(outputs_query, dim=1)

                        outputs_similar = model(X_similar)
                        probs_similar = F.softmax(outputs_similar, dim=1)

                        # For each queried sample, compute consistency loss with its similar samples
                        consistency_loss = F.mse_loss(probs_query.mean(dim=0), probs_similar.mean(dim=0))

                    # Total loss
                    loss = supervised_loss + lambda_c * consistency_loss
                    loss.backward()
                    optimizer.step()

                    total_loss += loss.item()

                logger.info(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(labeled_loader):.4f}")

            # Evaluate the model
            accuracy, f1, auc_roc = self.train_and_evaluate(model, self.X_test, self.y_test)
            performance_history.append((accuracy, f1, auc_roc))

            # Early stopping logic
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                iterations_without_improvement = 0
                best_model = model
            else:
                iterations_without_improvement += 1

            if (query_round + 1) % 5 == 0:
                logger.info(f"Iteration {query_round + 1}/{n_queries}: Accuracy = {accuracy:.4f}, F1 = {f1:.4f}, AUC-ROC = {auc_roc:.4f}")

            if iterations_without_improvement >= max_iterations_without_improvement:
                logger.info(f"Early stopping at iteration {query_round + 1} due to no improvement")
                break

        end_time = time.time()
        logger.info(f"Active learning completed in {end_time - start_time:.2f} seconds")
        logger.info(f"Final performance: Accuracy = {accuracy:.4f}, F1 = {f1:.4f}, AUC-ROC = {auc_roc:.4f}")

        return performance_history, best_model

    def run_pipeline(self):
        logger.info("Starting PyTorch active learning pipeline")
        start_time = time.time()

        self.prepare_data()
        performance_history, final_model = self.train_with_active_learning()

        # Evaluate final model
        final_model.eval()
        with torch.no_grad():
            outputs = final_model(self.X_test)
            _, y_pred = torch.max(outputs, 1)
            y_pred_np = y_pred.numpy()
            y_test_np = self.y_test.numpy()

            cm = confusion_matrix(y_test_np, y_pred_np)
            class_report = classification_report(y_test_np, y_pred_np, target_names=self.le.classes_)

        end_time = time.time()
        logger.info(f"PyTorch Active Learning Pipeline completed in {end_time - start_time:.2f} seconds")
        logger.info("Classification Report:\n" + class_report)

        self.plot_performance_history(performance_history)
        self.plot_confusion_matrix(cm)

        return performance_history, cm, class_report, final_model

    def plot_performance_history(self, performance_history):
        queries = range(1, len(performance_history) + 1)
        accuracies, f1_scores, auc_rocs = zip(*performance_history)

        # Plot accuracy
        plt.figure(figsize=(10, 6))
        plt.plot(queries, accuracies, label='Accuracy', marker='o')
        plt.xlabel('Iteration Number')
        plt.ylabel('Test Accuracy')
        plt.title('Performance (Accuracy) Over Active Learning Iterations')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('accuracy_history.png')
        plt.close()

        # Plot F1 Score
        plt.figure(figsize=(10, 6))
        plt.plot(queries, f1_scores, label='F1 Score', marker='o')
        plt.xlabel('Iteration Number')
        plt.ylabel('F1 Score')
        plt.title('Performance (F1 Score) Over Active Learning Iterations')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('f1_score_history.png')
        plt.close()

        # Plot AUC-ROC
        plt.figure(figsize=(10, 6))
        plt.plot(queries, auc_rocs, label='AUC-ROC', marker='o')
        plt.xlabel('Iteration Number')
        plt.ylabel('AUC-ROC Score')
        plt.title('Performance (AUC-ROC) Over Active Learning Iterations')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('auc_roc_history.png')
        plt.close()

    def plot_confusion_matrix(self, cm):
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        plt.title('FAISS and Consistency Regularization Confusion Matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig('pytorch_confusion_matrix.png')
        logger.info("Confusion matrix saved as 'pytorch_confusion_matrix.png'")
        plt.close()

class LabeledDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class ClassificationModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(ClassificationModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Usage
labeled_data_file = 'sbert_data_with_distance_features.csv'
augmented_data_file = 'cleaned_proscience_da.csv'
unlabeled_data_file = r'C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet'

active_learning_pipeline = PyTorchActiveLearningPipeline(
    labeled_data_file=labeled_data_file,
    augmented_data_file=augmented_data_file,
    unlabeled_data_file=unlabeled_data_file,
    embedding_column="dense_embedding",
    label_column="cleaned_classification"
)

performance_history, cm, class_report, final_model = active_learning_pipeline.run_pipeline()

print("Classification Report:")
print(class_report)

print("\nConfusion Matrix:")
print(cm)

print("\nAccuracy history saved as 'nn_accuracy_history.png'")
print("AUC-ROC history saved as 'nn_auc_roc_history.png'")

print("\nFinal Model Performance:")
print(f"Accuracy: {performance_history[-1][0]:.4f}")
print(f"F1 Score: {performance_history[-1][1]:.4f}")
print(f"AUC-ROC: {performance_history[-1][2]:.4f}")

2024-09-26 15:50:05,405 - INFO - Initializing PyTorchActiveLearningPipeline
2024-09-26 15:50:05,406 - INFO - Labeled data file: sbert_data_with_distance_features.csv
2024-09-26 15:50:05,407 - INFO - Augmented data file: cleaned_proscience_da.csv
2024-09-26 15:50:05,408 - INFO - Unlabeled data file: C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet
2024-09-26 15:50:05,408 - INFO - Embedding column: dense_embedding
2024-09-26 15:50:05,409 - INFO - Label column: cleaned_classification
2024-09-26 15:50:05,409 - INFO - Test size: 0.2
2024-09-26 15:50:05,410 - INFO - Initial labeled ratio: 0.3
2024-09-26 15:50:05,411 - INFO - Random state: 42
2024-09-26 15:50:05,411 - INFO - Batch size: 64
2024-09-26 15:50:05,412 - INFO - Loading and preprocessing data...
2024-09-26 15:50:18,163 - INFO - Labeled data shape: (79763, 13)
2024-09-26 15:50:19,238 - INFO - Augmented data shape: (5378, 4)
2024-09-26 15:50:26,313 - INFO - Unlabeled data shape: (412879, 384)
2024-09-26 15:50

Active Learning Progress:   0%|          | 0/50 [00:00<?, ?it/s]

2024-09-26 15:50:28,289 - INFO - Query Round 1/50
2024-09-26 15:50:29,964 - INFO - Epoch [1/5], Loss: 0.6735
2024-09-26 15:50:31,470 - INFO - Epoch [2/5], Loss: 0.6060
2024-09-26 15:50:33,003 - INFO - Epoch [3/5], Loss: 0.5865
2024-09-26 15:50:34,540 - INFO - Epoch [4/5], Loss: 0.5744
2024-09-26 15:50:36,063 - INFO - Epoch [5/5], Loss: 0.5623
2024-09-26 15:50:44,336 - INFO - Epoch [1/5], Loss: 0.5548
2024-09-26 15:50:47,958 - INFO - Epoch [2/5], Loss: 0.5448
2024-09-26 15:50:51,429 - INFO - Epoch [3/5], Loss: 0.5333
2024-09-26 15:50:55,015 - INFO - Epoch [4/5], Loss: 0.5215
2024-09-26 15:50:58,629 - INFO - Epoch [5/5], Loss: 0.5126
2024-09-26 15:50:58,737 - INFO - Query Round 2/50
2024-09-26 15:51:00,502 - INFO - Epoch [1/5], Loss: 0.5005
2024-09-26 15:51:02,724 - INFO - Epoch [2/5], Loss: 0.4899
2024-09-26 15:51:04,826 - INFO - Epoch [3/5], Loss: 0.4780
2024-09-26 15:51:07,177 - INFO - Epoch [4/5], Loss: 0.4682
2024-09-26 15:51:09,435 - INFO - Epoch [5/5], Loss: 0.4581
2024-09-26 15:5

Classification Report:
              precision    recall  f1-score   support

 antiscience       0.71      0.73      0.72      3736
     neutral       0.84      0.88      0.86      6678
  proscience       0.66      0.48      0.56      1506

    accuracy                           0.78     11920
   macro avg       0.74      0.69      0.71     11920
weighted avg       0.78      0.78      0.78     11920


Confusion Matrix:
[[2713  805  218]
 [ 663 5869  146]
 [ 449  338  719]]

Accuracy history saved as 'nn_accuracy_history.png'
AUC-ROC history saved as 'nn_auc_roc_history.png'

Final Model Performance:
Accuracy: 0.7803
F1 Score: 0.7102
AUC-ROC: 0.8970


# 3. NN with one sided kl divergence and focal loss

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import faiss
import logging
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
from tqdm.auto import tqdm
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class MultilabelFocalLoss(nn.Module):
    def __init__(self, alpha, gamma=2, reduction='mean'):
        super(MultilabelFocalLoss, self).__init__()
        self.alpha = torch.tensor(alpha)
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        alpha_t = self.alpha[targets.data.view(-1).long()].view_as(targets)
        F_loss = alpha_t * (1-pt)**self.gamma * BCE_loss

        if self.reduction == 'mean':
            return F_loss.mean()
        elif self.reduction == 'sum':
            return F_loss.sum()
        else:
            return F_loss

def adaptive_temperature(model_output, base_temperature=1.0, min_temperature=0.5, max_temperature=5.0):
    confidence = F.softmax(model_output, dim=1).max(1)[0].mean()
    temperature = base_temperature / confidence
    return torch.clamp(temperature, min_temperature, max_temperature)

def one_sided_kl_divergence(model, labeled_embeddings, similar_embeddings, temperature):
    with torch.no_grad():
        labeled_logits = model(labeled_embeddings)
        labeled_probs = F.softmax(labeled_logits / temperature, dim=-1)
        labeled_probs_mean = labeled_probs.mean(dim=0)
    
    similar_logits = model(similar_embeddings)
    similar_probs = F.softmax(similar_logits / temperature, dim=-1)
    
    # Expand labeled_probs_mean to match the batch size of similar_probs
    labeled_probs_mean_expanded = labeled_probs_mean.unsqueeze(0).expand(similar_probs.size(0), -1)
    
    kl_div = F.kl_div(similar_probs.log(), labeled_probs_mean_expanded, reduction='batchmean')
    return kl_div

class LabeledDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class ClassificationModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(ClassificationModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

class AdvancedActiveLearningPipeline:
    def __init__(self, labeled_data_file, augmented_data_file, unlabeled_data_file,
                 embedding_column, label_column, test_size=0.2, initial_labeled_ratio=0.3,
                 random_state=42, batch_size=64):
        self.labeled_data_file = labeled_data_file
        self.augmented_data_file = augmented_data_file
        self.unlabeled_data_file = unlabeled_data_file
        self.embedding_column = embedding_column
        self.label_column = label_column
        self.test_size = test_size
        self.initial_labeled_ratio = initial_labeled_ratio
        self.random_state = random_state
        self.batch_size = batch_size
        self.le = LabelEncoder()

        self._load_and_preprocess_data()

    def _process_embeddings(self, df):
        def process_embedding(x):
            if isinstance(x, str):
                x = x.strip('[]')
                try:
                    return np.array([float(i) for i in x.split(',')])
                except ValueError:
                    return np.array([float(i) for i in x.split()])
            elif isinstance(x, np.ndarray):
                return x
            else:
                raise ValueError(f"Unexpected embedding format: {type(x)}")

        df[self.embedding_column] = df[self.embedding_column].apply(process_embedding)
        return df

    def _load_and_preprocess_data(self):
        logger.info("Loading and preprocessing data...")
        
        self.labeled_data = pd.read_csv(self.labeled_data_file)
        self.labeled_data = self._process_embeddings(self.labeled_data)
        
        self.augmented_data = pd.read_csv(self.augmented_data_file)
        self.augmented_data = self._process_embeddings(self.augmented_data)
        
        all_data = pd.concat([self.labeled_data, self.augmented_data])
        
        self.le.fit(all_data[self.label_column])
        self.labeled_data['encoded_label'] = self.le.transform(self.labeled_data[self.label_column])
        self.augmented_data['encoded_label'] = self.le.transform(self.augmented_data[self.label_column])
        
        self.dimension = len(self.labeled_data[self.embedding_column].iloc[0])
        self.unlabeled_data = []
        parquet_file = pq.ParquetFile(self.unlabeled_data_file)
        
        for batch in parquet_file.iter_batches(batch_size=self.batch_size, columns=[self.embedding_column]):
            df_chunk = batch.to_pandas()
            df_chunk = self._process_embeddings(df_chunk)
            embeddings = np.stack(df_chunk[self.embedding_column].values)
            self.unlabeled_data.append(embeddings)
        
        self.unlabeled_data = np.vstack(self.unlabeled_data)
        
        self.index = faiss.IndexFlatL2(self.dimension)
        self.index.add(self.unlabeled_data.astype('float32'))

        logger.info(f"Labeled data shape: {self.labeled_data.shape}")
        logger.info(f"Augmented data shape: {self.augmented_data.shape}")
        logger.info(f"Unlabeled data shape: {self.unlabeled_data.shape}")
        logger.info(f"FAISS index size: {self.index.ntotal}")

    def prepare_data(self):
        logger.info("Preparing data for active learning")
        
        X = np.vstack([
            np.stack(self.labeled_data[self.embedding_column].values),
            np.stack(self.augmented_data[self.embedding_column].values)
        ])
        y = np.concatenate([
            self.labeled_data['encoded_label'].values,
            self.augmented_data['encoded_label'].values
        ])
        
        classes, class_counts = np.unique(y, return_counts=True)
        class_weights = 1 / (class_counts / np.sum(class_counts))
        self.class_weights = class_weights / np.sum(class_weights)
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )
        
        initial_size = int(self.initial_labeled_ratio * len(X_train))
        X_initial, X_pool, y_initial, y_pool = train_test_split(
            X_train, y_train, train_size=initial_size,
            random_state=self.random_state, stratify=y_train
        )
        
        self.X_initial = torch.tensor(X_initial, dtype=torch.float32)
        self.y_initial = torch.tensor(y_initial, dtype=torch.long)
        self.X_pool = torch.tensor(X_pool, dtype=torch.float32)
        self.y_pool = torch.tensor(y_pool, dtype=torch.long)
        self.X_test = torch.tensor(X_test, dtype=torch.float32)
        self.y_test = torch.tensor(y_test, dtype=torch.long)

    def evaluate_model(self, model, X, y):
        model.eval()
        with torch.no_grad():
            outputs = model(X)
            _, preds = torch.max(outputs, 1)
            probs = F.softmax(outputs, dim=1)
        
        y_np = y.numpy()
        preds_np = preds.numpy()
        probs_np = probs.numpy()
        
        accuracy = accuracy_score(y_np, preds_np)
        precision, recall, f1, _ = precision_recall_fscore_support(y_np, preds_np, average='macro')
        
        # Calculate AUC-ROC
        auc_roc = roc_auc_score(y_np, probs_np, multi_class='ovr', average='macro')
        
        return accuracy, precision, recall, f1, auc_roc

    def train_with_active_learning(self, n_queries=50, n_instances_per_query=100, k_similar=5):
        logger.info("Starting advanced active learning pipeline")
        
        num_classes = len(self.le.classes_)
        model = ClassificationModel(self.dimension, num_classes)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        focal_loss = MultilabelFocalLoss(alpha=self.class_weights.tolist())
        
        X_labeled = self.X_initial.clone()
        y_labeled = self.y_initial.clone()
        
        performance_history = []
        
        for query_round in tqdm(range(n_queries), desc="Active Learning Progress"):
            labeled_dataset = LabeledDataset(X_labeled, y_labeled)
            labeled_loader = DataLoader(labeled_dataset, batch_size=self.batch_size, shuffle=True)
            
            # Training loop
            model.train()
            for epoch in range(5):  # You can adjust the number of epochs
                for X_batch, y_batch in labeled_loader:
                    optimizer.zero_grad()
                    outputs = model(X_batch)
                    loss = focal_loss(outputs, F.one_hot(y_batch, num_classes=num_classes).float())
                    
                    # Consistency regularization with ramp-up schedule
                    if query_round > 0:
                        ramp_up_factor = min(query_round / (n_queries / 2), 1.0)  # Linear ramp-up
                        temperature = adaptive_temperature(outputs)
                        consistency_loss = one_sided_kl_divergence(model, X_labeled, X_similar, temperature)
                        loss += ramp_up_factor * consistency_loss
                    
                    loss.backward()
                    optimizer.step()
            
            # Query new samples
            model.eval()
            with torch.no_grad():
                pool_outputs = model(self.X_pool)
                pool_probs = F.softmax(pool_outputs, dim=1)
                uncertainties = -(pool_probs * torch.log(pool_probs + 1e-8)).sum(1)
                query_indices = uncertainties.topk(n_instances_per_query).indices
            
            X_query = self.X_pool[query_indices]
            y_query = self.y_pool[query_indices]
            
            # Find similar samples using FAISS
            similar_indices = []
            for sample in X_query:
                _, indices = self.index.search(sample.numpy().reshape(1, -1).astype('float32'), k_similar)
                similar_indices.extend(indices[0])
            
            X_similar = torch.tensor(self.unlabeled_data[similar_indices], dtype=torch.float32)
            
            # Update labeled dataset
            X_labeled = torch.cat([X_labeled, X_query])
            y_labeled = torch.cat([y_labeled, y_query])
            
            # Remove queried samples from pool
            mask = torch.ones(len(self.X_pool), dtype=torch.bool)
            mask[query_indices] = False
            self.X_pool = self.X_pool[mask]
            self.y_pool = self.y_pool[mask]
            
            #Evaluate model
            accuracy, precision, recall, f1, auc_roc = self.evaluate_model(model, self.X_test, self.y_test)
            performance_history.append((accuracy, precision, recall, f1, auc_roc))
            
            logger.info(f"Query {query_round + 1}, Accuracy: {accuracy:.4f}, F1: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")
        
        return performance_history, model

    def plot_performance_history(self, performance_history):
        queries = range(1, len(performance_history) + 1)
        accuracies, _, _, _, auc_rocs = zip(*performance_history)

        # Plot accuracy
        plt.figure(figsize=(10, 6))
        plt.plot(queries, accuracies, label='QBC Accuracy', marker='o')
        plt.xlabel('Iteration Number')
        plt.ylabel('Test Accuracy')
        plt.title('QBC Performance (Accuracy) Over Active Learning Iterations')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('qbc_accuracy_history.png')
        plt.close()

        # Plot AUC-ROC
        plt.figure(figsize=(10, 6))
        plt.plot(queries, auc_rocs, label='QBC AUC-ROC', marker='o')
        plt.xlabel('Iteration Number')
        plt.ylabel('AUC-ROC Score')
        plt.title('QBC Performance (AUC-ROC) Over Active Learning Iterations')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('qbc_auc_roc_history.png')
        plt.close()

    def plot_confusion_matrix(self, y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        plt.title('Confusion Matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png')
        plt.close()

    def run_pipeline(self):
        self.prepare_data()
        performance_history, final_model = self.train_with_active_learning()
        
        # Final evaluation
        final_accuracy, final_precision, final_recall, final_f1, final_auc_roc = self.evaluate_model(final_model, self.X_test, self.y_test)
        
        logger.info(f"Final Test Accuracy: {final_accuracy:.4f}")
        logger.info(f"Final Test Precision (macro): {final_precision:.4f}")
        logger.info(f"Final Test Recall (macro): {final_recall:.4f}")
        logger.info(f"Final Test F1 Score (macro): {final_f1:.4f}")
        logger.info(f"Final Test AUC-ROC (macro): {final_auc_roc:.4f}")
        
        # Plot performance history
        self.plot_performance_history(performance_history)
        
        # Plot confusion matrix
        with torch.no_grad():
            test_outputs = final_model(self.X_test)
            _, test_preds = torch.max(test_outputs, 1)
        self.plot_confusion_matrix(self.y_test.numpy(), test_preds.numpy())
        
        return performance_history, final_model, (final_accuracy, final_precision, final_recall, final_f1, final_auc_roc)

# Usage
labeled_data_file = 'sbert_data_with_distance_features.csv'
augmented_data_file = 'cleaned_proscience_da.csv'
unlabeled_data_file = r'C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet'

pipeline = AdvancedActiveLearningPipeline(
    labeled_data_file=labeled_data_file,
    augmented_data_file=augmented_data_file,
    unlabeled_data_file=unlabeled_data_file,
    embedding_column="dense_embedding",
    label_column="cleaned_classification"
)

performance_history, final_model, final_metrics = pipeline.run_pipeline()

print(f"Final Accuracy: {final_metrics[0]:.4f}")
print(f"Final Precision: {final_metrics[1]:.4f}")
print(f"Final Recall: {final_metrics[2]:.4f}")
print(f"Final F1 Score: {final_metrics[3]:.4f}")

print("\nAccuracy history saved as 'qbc_accuracy_history.png'")
print("AUC-ROC history saved as 'qbc_auc_roc_history.png'")
print("Confusion matrix saved as 'confusion_matrix.png'")

2024-09-26 01:44:48,981 - INFO - Loading and preprocessing data...
2024-09-26 01:45:08,738 - INFO - Labeled data shape: (79763, 13)
2024-09-26 01:45:08,739 - INFO - Augmented data shape: (5378, 5)
2024-09-26 01:45:08,740 - INFO - Unlabeled data shape: (412879, 384)
2024-09-26 01:45:08,741 - INFO - FAISS index size: 412879
2024-09-26 01:45:08,747 - INFO - Preparing data for active learning
2024-09-26 01:45:09,442 - INFO - Starting advanced active learning pipeline


Active Learning Progress:   0%|          | 0/50 [00:00<?, ?it/s]

2024-09-26 01:45:21,814 - INFO - Query 1, Accuracy: 0.7597, F1: 0.6395, AUC-ROC: 0.8806
2024-09-26 01:47:18,405 - INFO - Query 2, Accuracy: 0.7527, F1: 0.6328, AUC-ROC: 0.8785
2024-09-26 01:49:39,327 - INFO - Query 3, Accuracy: 0.7561, F1: 0.6624, AUC-ROC: 0.8792
2024-09-26 01:52:15,026 - INFO - Query 4, Accuracy: 0.7554, F1: 0.6577, AUC-ROC: 0.8782
2024-09-26 01:54:40,006 - INFO - Query 5, Accuracy: 0.7532, F1: 0.6652, AUC-ROC: 0.8768
2024-09-26 01:57:11,211 - INFO - Query 6, Accuracy: 0.7566, F1: 0.6628, AUC-ROC: 0.8783
2024-09-26 01:59:54,612 - INFO - Query 7, Accuracy: 0.7511, F1: 0.6673, AUC-ROC: 0.8758
2024-09-26 02:02:38,332 - INFO - Query 8, Accuracy: 0.7497, F1: 0.6604, AUC-ROC: 0.8738
2024-09-26 02:05:34,754 - INFO - Query 9, Accuracy: 0.7471, F1: 0.6598, AUC-ROC: 0.8735
2024-09-26 02:08:27,075 - INFO - Query 10, Accuracy: 0.7551, F1: 0.6613, AUC-ROC: 0.8746
2024-09-26 02:11:24,628 - INFO - Query 11, Accuracy: 0.7563, F1: 0.6638, AUC-ROC: 0.8744
2024-09-26 02:14:24,439 - INFO

KeyboardInterrupt: 

# SOFT-LABELLING: with Siamese Network and transfomer architecture. 

In [1]:
import pandas as pd
import numpy as np
import faiss
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import logging
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import pyarrow.parquet as pq

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class PyTorchActiveLearningPipeline:
    def __init__(self, labeled_data_file: str, augmented_data_file: str, unlabeled_data_file: str, 
                 embedding_column: str, label_column: str, test_size: float = 0.2, 
                 initial_labeled_ratio: float = 0.3, random_state: int = 42, batch_size: int = 64):
        self.labeled_data_file = labeled_data_file
        self.augmented_data_file = augmented_data_file
        self.unlabeled_data_file = unlabeled_data_file
        self.embedding_column = embedding_column
        self.label_column = label_column
        self.test_size = test_size
        self.initial_labeled_ratio = initial_labeled_ratio
        self.random_state = random_state
        self.batch_size = batch_size
        self.le = LabelEncoder()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self._log_init_info()
        self._load_and_preprocess_data()
        
    def _log_init_info(self) -> None:
        logger.info("Initializing PyTorchActiveLearningPipeline")
        logger.info(f"Labeled data file: {self.labeled_data_file}")
        logger.info(f"Augmented data file: {self.augmented_data_file}")
        logger.info(f"Unlabeled data file: {self.unlabeled_data_file}")
        logger.info(f"Embedding column: {self.embedding_column}")
        logger.info(f"Label column: {self.label_column}")
        logger.info(f"Test size: {self.test_size}")
        logger.info(f"Initial labeled ratio: {self.initial_labeled_ratio}")
        logger.info(f"Random state: {self.random_state}")
        logger.info(f"Batch size: {self.batch_size}")

    def _process_embeddings(self, df):
        def process_embedding(x):
            if isinstance(x, str):
                x = x.strip('[]')
                try:
                    return np.array([float(i) for i in x.split(',')])
                except ValueError:
                    return np.array([float(i) for i in x.split()])
            elif isinstance(x, np.ndarray):
                return x
            else:
                raise ValueError(f"Unexpected embedding format: {type(x)}")

        df[self.embedding_column] = df[self.embedding_column].apply(process_embedding)
        return df

    def _load_and_preprocess_data(self) -> None:
        start_time = time.time()
        logger.info("Loading and preprocessing data...")
        
        self.labeled_data = pd.read_csv(self.labeled_data_file)
        self.labeled_data = self._process_embeddings(self.labeled_data)
        logger.info(f"Labeled data shape: {self.labeled_data.shape}")
        
        self.augmented_data = pd.read_csv(self.augmented_data_file)
        self.augmented_data = self._process_embeddings(self.augmented_data)
        logger.info(f"Augmented data shape: {self.augmented_data.shape}")
        
        all_data = pd.concat([self.labeled_data, self.augmented_data], ignore_index=True)
        
        self.le.fit(all_data[self.label_column])
        self.labeled_data['encoded_label'] = self.le.transform(self.labeled_data[self.label_column])
        self.augmented_data['encoded_label'] = self.le.transform(self.augmented_data[self.label_column])
        
        # Load unlabeled data
        self.dimension = len(self.labeled_data[self.embedding_column].iloc[0])
        self.unlabeled_data = []
        parquet_file = pq.ParquetFile(self.unlabeled_data_file)
        
        # Read in batches to manage memory
        for batch in parquet_file.iter_batches(batch_size=10000, columns=[self.embedding_column]):
            df_chunk = batch.to_pandas()
            df_chunk = self._process_embeddings(df_chunk)
            embeddings = np.stack(df_chunk[self.embedding_column].values)
            self.unlabeled_data.append(embeddings)
        
        self.unlabeled_data = np.vstack(self.unlabeled_data)
        logger.info(f"Unlabeled data shape: {self.unlabeled_data.shape}")
        
        # Build FAISS index
        self.index = faiss.IndexFlatL2(self.dimension)
        self.index.add(self.unlabeled_data.astype('float32'))
        logger.info(f"FAISS index size: {self.index.ntotal}")
        
        end_time = time.time()
        logger.info(f"Data loaded and preprocessed in {end_time - start_time:.2f} seconds")

    def prepare_data(self):
        logger.info("Preparing data for active learning")
        
        X = np.vstack([
            np.stack(self.labeled_data[self.embedding_column].values),
            np.stack(self.augmented_data[self.embedding_column].values)
        ])
        y = np.concatenate([
            self.labeled_data['encoded_label'].values,
            self.augmented_data['encoded_label'].values
        ])
        
        # Ensure that the initial labeled set contains samples from all classes
        classes = np.unique(y)
        initial_indices = []
        for cls in classes:
            idx = np.where(y == cls)[0]
            initial_indices.append(idx[0])  # Take the first instance of each class

        remaining_indices = [i for i in range(len(y)) if i not in initial_indices]

        X_initial = X[initial_indices]
        y_initial = y[initial_indices]

        # Now sample the rest of the initial data
        additional_initial_size = int(self.initial_labeled_ratio * len(X)) - len(initial_indices)
        X_remaining = X[remaining_indices]
        y_remaining = y[remaining_indices]

        X_additional_initial, X_pool, y_additional_initial, y_pool = train_test_split(
            X_remaining, y_remaining, train_size=additional_initial_size,
            random_state=self.random_state, stratify=y_remaining
        )

        X_initial = np.vstack([X_initial, X_additional_initial])
        y_initial = np.concatenate([y_initial, y_additional_initial])

        # Split the pool into pool and test sets
        X_pool, X_test, y_pool, y_test = train_test_split(
            X_pool, y_pool, test_size=self.test_size, random_state=self.random_state, stratify=y_pool
        )

        logger.info(f"Initial labeled set shape: {X_initial.shape}")
        logger.info(f"Pool set shape: {X_pool.shape}")
        logger.info(f"Test set shape: {X_test.shape}")
        
        # Convert to PyTorch tensors
        self.X_initial = torch.tensor(X_initial, dtype=torch.float32).to(self.device)
        self.y_initial = torch.tensor(y_initial, dtype=torch.long).to(self.device)
        self.X_pool = torch.tensor(X_pool, dtype=torch.float32).to(self.device)
        self.y_pool = torch.tensor(y_pool, dtype=torch.long).to(self.device)  # For simulation
        self.X_test = torch.tensor(X_test, dtype=torch.float32).to(self.device)
        self.y_test = torch.tensor(y_test, dtype=torch.long).to(self.device)

    def train_and_evaluate(self, classifier, X_test, y_test):
        classifier.eval()
        with torch.no_grad():
            outputs = classifier(X_test)
            _, y_pred = torch.max(outputs, 1)
            y_pred_proba = F.softmax(outputs, dim=1)

            y_test_np = y_test.cpu().numpy()
            y_pred_np = y_pred.cpu().numpy()
            y_pred_proba_np = y_pred_proba.cpu().numpy()

            accuracy = accuracy_score(y_test_np, y_pred_np)
            f1 = f1_score(y_test_np, y_pred_np, average='macro')
            auc_roc = roc_auc_score(y_test_np, y_pred_proba_np, multi_class='ovr')

        return accuracy, f1, auc_roc

    def get_uncertainty_scores(self, classifier, X_pool):
        classifier.eval()
        with torch.no_grad():
            outputs = classifier(X_pool)
            probabilities = F.softmax(outputs, dim=1)
            entropy = -torch.sum(probabilities * torch.log(probabilities + 1e-5), dim=1)
        return entropy

    def query_samples(self, classifier, X_pool, n_instances):
        uncertainties = self.get_uncertainty_scores(classifier, X_pool)
        _, query_indices = torch.topk(uncertainties, n_instances)
        return query_indices

    def find_similar_samples(self, sample, k=5):
        sample_np = sample.cpu().numpy().reshape(1, -1).astype('float32')
        distances, indices = self.index.search(sample_np, k)
        return indices[0]  # Return indices of similar samples

    def train_with_active_learning(self, n_queries=50, n_instances_per_query=100, k_similar=5):
        logger.info("Starting active learning with Siamese network and Transformer architecture")
        start_time = time.time()

        input_dim = self.X_initial.shape[1]  # Should be 384
        embedding_dim = 128  # Adjusted as per your preference
        num_classes = len(self.le.classes_)
        num_epochs_siamese = 5
        num_epochs_classifier = 5
        margin = 1.0  # For contrastive loss

        # Initialize Siamese network
        embedding_model = TransformerEmbedding(input_dim=input_dim, embedding_dim=embedding_dim).to(self.device)
        optimizer_siamese = torch.optim.Adam(embedding_model.parameters(), lr=1e-3)

        # Initialize classifier
        classifier = SiameseClassifier(embedding_model, num_classes).to(self.device)
        optimizer_classifier = torch.optim.Adam(classifier.parameters(), lr=1e-3)

        # Initialize labeled dataset
        X_labeled = self.X_initial.clone()
        y_labeled = self.y_initial.clone()

        X_pool = self.X_pool.clone()
        y_pool = self.y_pool.clone()
        X_unlabeled = torch.tensor(self.unlabeled_data, dtype=torch.float32).to(self.device)

        performance_history = []
        best_accuracy = 0
        iterations_without_improvement = 0
        max_iterations_without_improvement = 10

        for query_round in tqdm(range(n_queries), desc="Active Learning Progress"):
            logger.info(f"Query Round {query_round + 1}/{n_queries}")

            # Query new samples
            query_indices = self.query_samples(classifier, X_pool, n_instances_per_query)
            X_query = X_pool[query_indices]
            y_query = y_pool[query_indices]  # For simulation; in practice, get labels from oracle

            # Find similar samples using FAISS
            similar_indices_list = []
            for sample in X_query:
                indices = self.find_similar_samples(sample, k=k_similar)
                similar_indices_list.extend(indices)
            similar_indices = list(set(similar_indices_list))
            X_similar = X_unlabeled[similar_indices]

            # Prepare negative samples (dissimilar samples)
            all_unlabeled_indices = set(range(len(X_unlabeled)))
            negative_indices = list(all_unlabeled_indices - set(similar_indices))
            if len(negative_indices) >= len(X_query):
                negative_sample_indices = np.random.choice(negative_indices, size=len(X_query), replace=False)
            else:
                negative_sample_indices = np.random.choice(negative_indices, size=len(X_query), replace=True)
            X_negative = X_unlabeled[negative_sample_indices]

            # Train the Siamese network
            for epoch in range(num_epochs_siamese):
                embedding_model.train()
                optimizer_siamese.zero_grad()

                # Get embeddings
                embedding_anchor = embedding_model(X_query)
                embedding_positive = embedding_model(X_similar[:len(X_query)])
                embedding_negative = embedding_model(X_negative)

                # Labels
                labels_positive = torch.ones(len(embedding_anchor)).to(self.device)
                labels_negative = torch.zeros(len(embedding_anchor)).to(self.device)

                # Contrastive loss
                contrastive_loss_fn = ContrastiveLoss(margin=margin)
                loss_positive = contrastive_loss_fn(embedding_anchor, embedding_positive, labels_positive)
                loss_negative = contrastive_loss_fn(embedding_anchor, embedding_negative, labels_negative)

                loss = loss_positive + loss_negative
                loss.backward()
                optimizer_siamese.step()

                logger.info(f"Siamese Epoch [{epoch+1}/{num_epochs_siamese}], Loss: {loss.item():.4f}")

            # Update labeled dataset
            X_labeled = torch.cat([X_labeled, X_query], dim=0)
            y_labeled = torch.cat([y_labeled, y_query], dim=0)

            # Remove queried samples from X_pool and y_pool
            mask_pool = torch.ones(len(X_pool), dtype=torch.bool).to(self.device)
            mask_pool[query_indices] = False
            X_pool = X_pool[mask_pool]
            y_pool = y_pool[mask_pool]

            # Remove similar and negative samples from X_unlabeled
            used_indices = similar_indices + negative_sample_indices.tolist()
            mask_unlabeled = torch.ones(len(X_unlabeled), dtype=torch.bool).to(self.device)
            mask_unlabeled[used_indices] = False
            X_unlabeled = X_unlabeled[mask_unlabeled]

            # Rebuild FAISS index
            if len(X_unlabeled) > 0:
                self.index = faiss.IndexFlatL2(self.dimension)
                self.index.add(X_unlabeled.cpu().numpy().astype('float32'))
            else:
                logger.info("No more unlabeled data to build FAISS index.")
                break

            # Fine-tune classifier
            labeled_dataset = LabeledDataset(X_labeled, y_labeled)
            labeled_loader = DataLoader(labeled_dataset, batch_size=self.batch_size, shuffle=True)

            for epoch in range(num_epochs_classifier):
                classifier.train()
                total_loss = 0.0
                for X_batch, y_batch in labeled_loader:
                    optimizer_classifier.zero_grad()
                    outputs = classifier(X_batch)
                    supervised_loss = F.cross_entropy(outputs, y_batch)
                    supervised_loss.backward()
                    optimizer_classifier.step()
                    total_loss += supervised_loss.item()
                logger.info(f"Classifier Epoch [{epoch+1}/{num_epochs_classifier}], Loss: {total_loss/len(labeled_loader):.4f}")

            # Evaluate the model
            accuracy, f1, auc_roc = self.train_and_evaluate(classifier, self.X_test, self.y_test)
            performance_history.append((accuracy, f1, auc_roc))

            # Early stopping logic
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                iterations_without_improvement = 0
                best_model = classifier
            else:
                iterations_without_improvement += 1

            if (query_round + 1) % 5 == 0:
                logger.info(f"Iteration {query_round + 1}/{n_queries}: Accuracy = {accuracy:.4f}, F1 = {f1:.4f}, AUC-ROC = {auc_roc:.4f}")

            if iterations_without_improvement >= max_iterations_without_improvement:
                logger.info(f"Early stopping at iteration {query_round + 1} due to no improvement")
                break

        end_time = time.time()
        logger.info(f"Active learning completed in {end_time - start_time:.2f} seconds")
        logger.info(f"Final performance: Accuracy = {accuracy:.4f}, F1 = {f1:.4f}, AUC-ROC = {auc_roc:.4f}")

        return performance_history, best_model

    def run_pipeline(self):
        logger.info("Starting PyTorch active learning pipeline")
        start_time = time.time()

        self.prepare_data()
        performance_history, final_model = self.train_with_active_learning()

        # Evaluate final model
        final_model.eval()
        with torch.no_grad():
            outputs = final_model(self.X_test)
            _, y_pred = torch.max(outputs, 1)
            y_pred_np = y_pred.cpu().numpy()
            y_test_np = self.y_test.cpu().numpy()

            cm = confusion_matrix(y_test_np, y_pred_np)
            class_report = classification_report(y_test_np, y_pred_np, target_names=self.le.classes_)

        end_time = time.time()
        logger.info(f"PyTorch Active Learning Pipeline completed in {end_time - start_time:.2f} seconds")
        logger.info("Classification Report:\n" + class_report)

        self.plot_results(performance_history)
        self.plot_confusion_matrix(cm)

        return performance_history, cm, class_report, final_model

    def plot_results(self, performance_history):
        accuracies, f1_scores, auc_rocs = zip(*performance_history)
        plt.figure(figsize=(12, 6))
        plt.plot(accuracies, label='Accuracy')
        plt.plot(f1_scores, label='F1 Score')
        plt.plot(auc_rocs, label='AUC-ROC')
        plt.xlabel('Iteration')
        plt.ylabel('Score')
        plt.title('Model Performance over Active Learning Iterations')
        plt.legend()
        plt.tight_layout()
        plt.savefig('pytorch_active_learning_results.png')
        logger.info("Results plot saved as 'pytorch_active_learning_results.png'")
        plt.close()

    def plot_confusion_matrix(self, cm):
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        plt.title('Confusion Matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig('pytorch_confusion_matrix.png')
        logger.info("Confusion matrix saved as 'pytorch_confusion_matrix.png'")
        plt.close()

class LabeledDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class TransformerEmbedding(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_heads=4, num_layers=2):
        super(TransformerEmbedding, self).__init__()
        self.embedding_layer = nn.Linear(input_dim, embedding_dim)
        # Set batch_first=True to resolve the warning
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(embedding_dim, embedding_dim)
    
    def forward(self, x):
        # x: [batch_size, input_dim]
        x = self.embedding_layer(x)  # [batch_size, embedding_dim]
        x = x.unsqueeze(1)  # [batch_size, seq_len=1, embedding_dim]
        x = self.transformer_encoder(x)  # [batch_size, seq_len=1, embedding_dim]
        x = x.squeeze(1)  # [batch_size, embedding_dim]
        x = self.output_layer(x)
        return x

class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin
    
    def forward(self, embedding1, embedding2, label):
        # label: 1 if similar, 0 if dissimilar
        euclidean_distance = F.pairwise_distance(embedding1, embedding2)
        loss = (label) * torch.pow(euclidean_distance, 2) + \
               (1 - label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2)
        return torch.mean(loss)

class SiameseClassifier(nn.Module):
    def __init__(self, embedding_model, num_classes):
        super(SiameseClassifier, self).__init__()
        self.embedding_model = embedding_model
        self.fc = nn.Linear(embedding_model.output_layer.out_features, num_classes)
    
    def forward(self, x):
        embeddings = self.embedding_model(x)
        logits = self.fc(embeddings)
        return logits

# Usage
labeled_data_file = 'sbert_data_with_distance_features.csv'
augmented_data_file = 'cleaned_proscience_da.csv'
unlabeled_data_file = r'C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet'

active_learning_pipeline = PyTorchActiveLearningPipeline(
    labeled_data_file=labeled_data_file,
    augmented_data_file=augmented_data_file,
    unlabeled_data_file=unlabeled_data_file,
    embedding_column="dense_embedding",
    label_column="cleaned_classification"
)

performance_history, cm, class_report, final_model = active_learning_pipeline.run_pipeline()

print("Classification Report:")
print(class_report)

print("\nConfusion Matrix:")
print(cm)

print("\nFinal Model Performance:")
print(f"Accuracy: {performance_history[-1][0]:.4f}")
print(f"F1 Score: {performance_history[-1][1]:.4f}")
print(f"AUC-ROC: {performance_history[-1][2]:.4f}")


2024-09-25 17:49:34,600 - INFO - Initializing PyTorchActiveLearningPipeline
2024-09-25 17:49:34,601 - INFO - Labeled data file: sbert_data_with_distance_features.csv
2024-09-25 17:49:34,602 - INFO - Augmented data file: cleaned_proscience_da.csv
2024-09-25 17:49:34,603 - INFO - Unlabeled data file: C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet
2024-09-25 17:49:34,603 - INFO - Embedding column: dense_embedding
2024-09-25 17:49:34,604 - INFO - Label column: cleaned_classification
2024-09-25 17:49:34,605 - INFO - Test size: 0.2
2024-09-25 17:49:34,605 - INFO - Initial labeled ratio: 0.3
2024-09-25 17:49:34,606 - INFO - Random state: 42
2024-09-25 17:49:34,606 - INFO - Batch size: 64
2024-09-25 17:49:34,607 - INFO - Loading and preprocessing data...
2024-09-25 17:49:47,018 - INFO - Labeled data shape: (79763, 13)
2024-09-25 17:49:48,036 - INFO - Augmented data shape: (5378, 4)
2024-09-25 17:49:50,473 - INFO - Unlabeled data shape: (412879, 384)
2024-09-25 17:49

Active Learning Progress:   0%|          | 0/50 [00:00<?, ?it/s]

2024-09-25 17:50:11,397 - INFO - Query Round 1/50
2024-09-25 17:50:15,637 - INFO - Siamese Epoch [1/5], Loss: 44.3206
2024-09-25 17:50:15,690 - INFO - Siamese Epoch [2/5], Loss: 7.1722
2024-09-25 17:50:15,740 - INFO - Siamese Epoch [3/5], Loss: 4.3237
2024-09-25 17:50:15,789 - INFO - Siamese Epoch [4/5], Loss: 4.0270
2024-09-25 17:50:15,839 - INFO - Siamese Epoch [5/5], Loss: 4.2342
2024-09-25 17:50:25,007 - INFO - Classifier Epoch [1/5], Loss: 0.6931
2024-09-25 17:50:33,234 - INFO - Classifier Epoch [2/5], Loss: 0.6346
2024-09-25 17:50:41,507 - INFO - Classifier Epoch [3/5], Loss: 0.6152
2024-09-25 17:50:52,024 - INFO - Classifier Epoch [4/5], Loss: 0.6030
2024-09-25 17:51:03,410 - INFO - Classifier Epoch [5/5], Loss: 0.5908
2024-09-25 17:51:03,656 - INFO - Query Round 2/50
2024-09-25 17:51:08,199 - INFO - Siamese Epoch [1/5], Loss: 33.9851
2024-09-25 17:51:08,258 - INFO - Siamese Epoch [2/5], Loss: 24.3450
2024-09-25 17:51:08,314 - INFO - Siamese Epoch [3/5], Loss: 17.1783
2024-09-25

KeyboardInterrupt: 

In [1]:
import pandas as pd
import numpy as np
import faiss
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import logging
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import pyarrow.parquet as pq
import random

# Set random seed for reproducibility
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class PyTorchActiveLearningPipeline:
    def __init__(self, labeled_data_file: str, augmented_data_file: str, unlabeled_data_file: str, 
                 embedding_column: str, label_column: str, test_size: float = 0.2, 
                 initial_labeled_ratio: float = 0.3, random_state: int = 42, batch_size: int = 64):
        self.labeled_data_file = labeled_data_file
        self.augmented_data_file = augmented_data_file
        self.unlabeled_data_file = unlabeled_data_file
        self.embedding_column = embedding_column
        self.label_column = label_column
        self.test_size = test_size
        self.initial_labeled_ratio = initial_labeled_ratio
        self.random_state = random_state
        self.batch_size = batch_size
        self.le = LabelEncoder()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self._log_init_info()
        self._load_and_preprocess_data()
        
    def _log_init_info(self) -> None:
        logger.info("Initializing PyTorchActiveLearningPipeline")
        logger.info(f"Labeled data file: {self.labeled_data_file}")
        logger.info(f"Augmented data file: {self.augmented_data_file}")
        logger.info(f"Unlabeled data file: {self.unlabeled_data_file}")
        logger.info(f"Embedding column: {self.embedding_column}")
        logger.info(f"Label column: {self.label_column}")
        logger.info(f"Test size: {self.test_size}")
        logger.info(f"Initial labeled ratio: {self.initial_labeled_ratio}")
        logger.info(f"Random state: {self.random_state}")
        logger.info(f"Batch size: {self.batch_size}")

    def _process_embeddings(self, df):
        def process_embedding(x):
            if isinstance(x, str):
                x = x.strip('[]')
                try:
                    return np.array([float(i) for i in x.split(',')])
                except ValueError:
                    return np.array([float(i) for i in x.split()])
            elif isinstance(x, np.ndarray):
                return x
            else:
                raise ValueError(f"Unexpected embedding format: {type(x)}")

        df[self.embedding_column] = df[self.embedding_column].apply(process_embedding)
        return df

    def _load_and_preprocess_data(self) -> None:
        start_time = time.time()
        logger.info("Loading and preprocessing data...")
        
        # Load labeled data
        self.labeled_data = pd.read_csv(self.labeled_data_file)
        self.labeled_data = self._process_embeddings(self.labeled_data)
        logger.info(f"Labeled data shape: {self.labeled_data.shape}")
        
        # Load augmented data
        self.augmented_data = pd.read_csv(self.augmented_data_file)
        self.augmented_data = self._process_embeddings(self.augmented_data)
        logger.info(f"Augmented data shape: {self.augmented_data.shape}")
        
        all_data = pd.concat([self.labeled_data, self.augmented_data], ignore_index=True)
        
        self.le.fit(all_data[self.label_column])
        self.labeled_data['encoded_label'] = self.le.transform(self.labeled_data[self.label_column])
        self.augmented_data['encoded_label'] = self.le.transform(self.augmented_data[self.label_column])
        
        # Load unlabeled data
        self.dimension = len(self.labeled_data[self.embedding_column].iloc[0])
        self.unlabeled_data = []
        parquet_file = pq.ParquetFile(self.unlabeled_data_file)
        
        # Read in batches to manage memory
        for batch in parquet_file.iter_batches(batch_size=10000, columns=[self.embedding_column]):
            df_chunk = batch.to_pandas()
            df_chunk = self._process_embeddings(df_chunk)
            embeddings = np.stack(df_chunk[self.embedding_column].values)
            self.unlabeled_data.append(embeddings)
        
        self.unlabeled_data = np.vstack(self.unlabeled_data)
        logger.info(f"Unlabeled data shape: {self.unlabeled_data.shape}")
        
        # Build FAISS index
        self.index = faiss.IndexFlatL2(self.dimension)
        self.index.add(self.unlabeled_data.astype('float32'))
        logger.info(f"FAISS index size: {self.index.ntotal}")
        
        end_time = time.time()
        logger.info(f"Data loaded and preprocessed in {end_time - start_time:.2f} seconds")

    def prepare_data(self):
        logger.info("Preparing data for active learning")
        
        X = np.vstack([
            np.stack(self.labeled_data[self.embedding_column].values),
            np.stack(self.augmented_data[self.embedding_column].values)
        ])
        y = np.concatenate([
            self.labeled_data['encoded_label'].values,
            self.augmented_data['encoded_label'].values
        ])
        
        # Ensure that the initial labeled set contains samples from all classes
        classes = np.unique(y)
        initial_indices = []
        for cls in classes:
            idx = np.where(y == cls)[0]
            initial_indices.append(idx[0])  # Take the first instance of each class

        remaining_indices = [i for i in range(len(y)) if i not in initial_indices]

        X_initial = X[initial_indices]
        y_initial = y[initial_indices]

        # Now sample the rest of the initial data
        additional_initial_size = int(self.initial_labeled_ratio * len(X)) - len(initial_indices)
        X_remaining = X[remaining_indices]
        y_remaining = y[remaining_indices]

        X_additional_initial, X_pool, y_additional_initial, y_pool = train_test_split(
            X_remaining, y_remaining, train_size=additional_initial_size,
            random_state=self.random_state, stratify=y_remaining
        )

        X_initial = np.vstack([X_initial, X_additional_initial])
        y_initial = np.concatenate([y_initial, y_additional_initial])

        # Split the pool into pool and test sets
        X_pool, X_test, y_pool, y_test = train_test_split(
            X_pool, y_pool, test_size=self.test_size, random_state=self.random_state, stratify=y_pool
        )

        logger.info(f"Initial labeled set shape: {X_initial.shape}")
        logger.info(f"Pool set shape: {X_pool.shape}")
        logger.info(f"Test set shape: {X_test.shape}")
        
        # Convert to PyTorch tensors
        self.X_initial = torch.tensor(X_initial, dtype=torch.float32).to(self.device)
        self.y_initial = torch.tensor(y_initial, dtype=torch.long).to(self.device)
        self.X_pool = torch.tensor(X_pool, dtype=torch.float32).to(self.device)
        self.y_pool = torch.tensor(y_pool, dtype=torch.long).to(self.device)  # For simulation
        self.X_test = torch.tensor(X_test, dtype=torch.float32).to(self.device)
        self.y_test = torch.tensor(y_test, dtype=torch.long).to(self.device)

    def train_and_evaluate(self, classifier, X_test, y_test):
        classifier.eval()
        with torch.no_grad():
            outputs = classifier(X_test)
            _, y_pred = torch.max(outputs, 1)
            y_pred_proba = F.softmax(outputs, dim=1)

            y_test_np = y_test.cpu().numpy()
            y_pred_np = y_pred.cpu().numpy()
            y_pred_proba_np = y_pred_proba.cpu().numpy()

            accuracy = accuracy_score(y_test_np, y_pred_np)
            f1 = f1_score(y_test_np, y_pred_np, average='macro')
            auc_roc = roc_auc_score(y_test_np, y_pred_proba_np, multi_class='ovr')

        return accuracy, f1, auc_roc

    def get_uncertainty_scores(self, classifier, X_pool):
        classifier.eval()
        with torch.no_grad():
            outputs = classifier(X_pool)
            probabilities = F.softmax(outputs, dim=1)
            entropy = -torch.sum(probabilities * torch.log(probabilities + 1e-5), dim=1)
        return entropy

    def query_samples(self, classifier, X_pool, n_instances):
        uncertainties = self.get_uncertainty_scores(classifier, X_pool)
        _, query_indices = torch.topk(uncertainties, n_instances)
        return query_indices

    def find_similar_embeddings(self, embeddings, k=5):
        embeddings_np = embeddings.cpu().numpy().astype('float32')
        distances, indices = self.index.search(embeddings_np, k)
        return indices  # Returns indices of similar embeddings

    def train_with_active_learning(self, n_queries=50, n_instances_per_query=100, k_similar=5):
        logger.info("Starting active learning with Uncertainty Sampling, FAISS, and Consistency Regularization")
        start_time = time.time()

        input_dim = self.X_initial.shape[1]  # Embedding dimension
        num_classes = len(self.le.classes_)
        num_epochs = 10
        lambda_u = 0.1  # Weight for unsupervised loss
        noise_std = 0.1  # Standard deviation for Gaussian noise

        # Initialize classifier
        classifier = SimpleFeedforwardClassifier(input_dim, num_classes).to(self.device)

        # Handle class imbalance
        class_counts = np.bincount(self.y_initial.cpu().numpy())
        class_weights = 1.0 / class_counts
        class_weights = class_weights / class_weights.sum()
        class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.device)
        criterion = nn.CrossEntropyLoss(weight=class_weights)

        optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)

        # Initialize labeled dataset
        X_labeled = self.X_initial.clone()
        y_labeled = self.y_initial.clone()

        X_pool = self.X_pool.clone()
        y_pool = self.y_pool.clone()

        performance_history = []
        best_accuracy = 0
        iterations_without_improvement = 0
        max_iterations_without_improvement = 10

        for query_round in tqdm(range(n_queries), desc="Active Learning Progress"):
            logger.info(f"Query Round {query_round + 1}/{n_queries}")

            # Train classifier
            labeled_dataset = LabeledDataset(X_labeled, y_labeled)
            labeled_loader = DataLoader(labeled_dataset, batch_size=self.batch_size, shuffle=True)

            classifier.train()
            for epoch in range(num_epochs):
                total_loss = 0.0
                for X_batch, y_batch in labeled_loader:
                    optimizer.zero_grad()
                    outputs = classifier(X_batch)
                    supervised_loss = criterion(outputs, y_batch)
                    
                    # Consistency Regularization
                    # Sample a batch of unlabeled data similar to X_batch
                    with torch.no_grad():
                        # For each sample in X_batch, find similar embeddings
                        similar_indices = self.find_similar_embeddings(X_batch, k=k_similar)
                        similar_embeddings = self.unlabeled_data[similar_indices.flatten()]
                        # Convert to tensor
                        X_unlabeled_batch = torch.tensor(similar_embeddings, dtype=torch.float32).to(self.device)
                    
                    # Apply noise to embeddings
                    noise = torch.randn_like(X_unlabeled_batch) * noise_std
                    X_unlabeled_aug = X_unlabeled_batch + noise

                    # Compute consistency loss
                    with torch.no_grad():
                        preds_unlabeled = classifier(X_unlabeled_batch)
                    preds_unlabeled_aug = classifier(X_unlabeled_aug)
                    consistency_loss = F.mse_loss(preds_unlabeled_aug, preds_unlabeled)

                    # Total loss
                    loss = supervised_loss + lambda_u * consistency_loss
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()
                # Optionally log training loss

            # Evaluate the model
            accuracy, f1, auc_roc = self.train_and_evaluate(classifier, self.X_test, self.y_test)
            performance_history.append((accuracy, f1, auc_roc))

            # Early stopping logic
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                iterations_without_improvement = 0
                best_model = classifier
            else:
                iterations_without_improvement += 1

            if (query_round + 1) % 5 == 0:
                logger.info(f"Iteration {query_round + 1}/{n_queries}: Accuracy = {accuracy:.4f}, F1 = {f1:.4f}, AUC-ROC = {auc_roc:.4f}")

            if iterations_without_improvement >= max_iterations_without_improvement:
                logger.info(f"Early stopping at iteration {query_round + 1} due to no improvement")
                break

            # Query new samples using Uncertainty Sampling
            n_instances = min(n_instances_per_query, len(X_pool))
            if n_instances == 0:
                logger.info("No more samples to query from the pool.")
                break

            query_indices = self.query_samples(classifier, X_pool, n_instances)
            X_query = X_pool[query_indices]
            y_query = y_pool[query_indices]  # For simulation; in practice, get labels from oracle

            # Update labeled dataset with queried samples
            X_labeled = torch.cat([X_labeled, X_query], dim=0)
            y_labeled = torch.cat([y_labeled, y_query], dim=0)

            # Remove queried samples from X_pool and y_pool
            mask_pool = torch.ones(len(X_pool), dtype=torch.bool).to(self.device)
            mask_pool[query_indices] = False
            X_pool = X_pool[mask_pool]
            y_pool = y_pool[mask_pool]

            # Update class weights
            class_counts = np.bincount(y_labeled.cpu().numpy(), minlength=num_classes)
            class_weights = 1.0 / (class_counts + 1e-6)  # Add small value to avoid division by zero
            class_weights = class_weights / class_weights.sum()
            class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.device)
            criterion = nn.CrossEntropyLoss(weight=class_weights)

        end_time = time.time()
        logger.info(f"Active learning completed in {end_time - start_time:.2f} seconds")
        logger.info(f"Final performance: Accuracy = {accuracy:.4f}, F1 = {f1:.4f}, AUC-ROC = {auc_roc:.4f}")

        return performance_history, best_model

    def run_pipeline(self):
        logger.info("Starting PyTorch active learning pipeline")
        start_time = time.time()

        self.prepare_data()
        performance_history, final_model = self.train_with_active_learning()

        # Evaluate final model
        final_model.eval()
        with torch.no_grad():
            outputs = final_model(self.X_test)
            _, y_pred = torch.max(outputs, 1)
            y_pred_np = y_pred.cpu().numpy()
            y_test_np = self.y_test.cpu().numpy()

            cm = confusion_matrix(y_test_np, y_pred_np)
            class_report = classification_report(y_test_np, y_pred_np, target_names=self.le.classes_)

        end_time = time.time()
        logger.info(f"PyTorch Active Learning Pipeline completed in {end_time - start_time:.2f} seconds")
        logger.info("Classification Report:\n" + class_report)

        self.plot_results(performance_history)
        self.plot_confusion_matrix(cm)

        return performance_history, cm, class_report, final_model

    def plot_results(self, performance_history):
        accuracies, f1_scores, auc_rocs = zip(*performance_history)
        plt.figure(figsize=(12, 6))
        plt.plot(accuracies, label='Accuracy')
        plt.plot(f1_scores, label='F1 Score')
        plt.plot(auc_rocs, label='AUC-ROC')
        plt.xlabel('Iteration')
        plt.ylabel('Score')
        plt.title('Model Performance over Active Learning Iterations')
        plt.legend()
        plt.tight_layout()
        plt.savefig('pytorch_active_learning_results.png')
        logger.info("Results plot saved as 'pytorch_active_learning_results.png'")
        plt.close()

    def plot_confusion_matrix(self, cm):
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        plt.title('Confusion Matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig('pytorch_confusion_matrix.png')
        logger.info("Confusion matrix saved as 'pytorch_confusion_matrix.png'")
        plt.close()

class LabeledDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class SimpleFeedforwardClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SimpleFeedforwardClassifier, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        return self.layers(x)

# Usage
labeled_data_file = 'sbert_data_with_distance_features.csv'
augmented_data_file = 'cleaned_proscience_da.csv'
unlabeled_data_file = r'C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet'

active_learning_pipeline = PyTorchActiveLearningPipeline(
    labeled_data_file=labeled_data_file,
    augmented_data_file=augmented_data_file,
    unlabeled_data_file=unlabeled_data_file,
    embedding_column="dense_embedding",
    label_column="cleaned_classification"
)

performance_history, cm, class_report, final_model = active_learning_pipeline.run_pipeline()

print("Classification Report:")
print(class_report)

print("\nConfusion Matrix:")
print(cm)

print("\nFinal Model Performance:")
print(f"Accuracy: {performance_history[-1][0]:.4f}")
print(f"F1 Score: {performance_history[-1][1]:.4f}")
print(f"AUC-ROC: {performance_history[-1][2]:.4f}")


2024-09-25 18:38:25,801 - INFO - Initializing PyTorchActiveLearningPipeline
2024-09-25 18:38:25,802 - INFO - Labeled data file: sbert_data_with_distance_features.csv
2024-09-25 18:38:25,802 - INFO - Augmented data file: cleaned_proscience_da.csv
2024-09-25 18:38:25,803 - INFO - Unlabeled data file: C:\Users\nrosso\Documents\thesis_project\data\processed\embeddings.parquet
2024-09-25 18:38:25,803 - INFO - Embedding column: dense_embedding
2024-09-25 18:38:25,804 - INFO - Label column: cleaned_classification
2024-09-25 18:38:25,804 - INFO - Test size: 0.2
2024-09-25 18:38:25,805 - INFO - Initial labeled ratio: 0.3
2024-09-25 18:38:25,806 - INFO - Random state: 42
2024-09-25 18:38:25,807 - INFO - Batch size: 64
2024-09-25 18:38:25,808 - INFO - Loading and preprocessing data...
2024-09-25 18:38:38,054 - INFO - Labeled data shape: (79763, 13)
2024-09-25 18:38:39,048 - INFO - Augmented data shape: (5378, 4)
2024-09-25 18:38:41,543 - INFO - Unlabeled data shape: (412879, 384)
2024-09-25 18:38

Active Learning Progress:   0%|          | 0/50 [00:00<?, ?it/s]

2024-09-25 18:39:00,867 - INFO - Query Round 1/50


KeyboardInterrupt: 