## SSSD: Semantic Search Stance Detection

In [3]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm 
from pathlib import Path
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
# Locations of the vaccination CSV files on the local file system.
query_set_path = './data/processed/vaccination/query_set.csv'
test_set_path = './data/processed/vaccination/test_set.csv'
domain_set_path = './data/processed/vaccination/domain_set.csv'

# query_set:  manually labeled tweets that are representative of the different
#             stances on vaccination; they act as the labeled reference queries.
# test_set:   manually labeled tweets held out to measure how well a model
#             generalizes to examples it has not seen.
# domain_set: presumed to be a larger collection of tweets gathered via hashtags.
query_set = pd.read_csv(query_set_path)
test_set = pd.read_csv(test_set_path)
domain_set = pd.read_csv(domain_set_path)

# Keep the three sets disjoint: drop every domain tweet whose text also occurs
# in the query set or in the test set, then renumber the remaining rows from 0.
seen_in_query_set = domain_set.text.isin(query_set.text)
seen_in_test_set = domain_set.text.isin(test_set.text)
domain_set = domain_set[~(seen_in_query_set | seen_in_test_set)].reset_index(drop=True)

## 1 - Semantic Labeling

In [None]:
def semantic_labeling(domain_set, ptm, domain_set_embeddings, query_set, top_k):
    """
    Label domain-set texts by their semantic similarity to labeled queries.

    Args:
        domain_set (DataFrame): Data to be labeled. Must have 'id' and 'text' columns.
            Row i must correspond to row i of ``domain_set_embeddings``.
        ptm (SentenceTransformer): Pre-trained model used to embed each query.
        domain_set_embeddings (Tensor): Embeddings of the domain-set texts generated by the PTM.
        query_set (DataFrame): Reference queries. The first column is read as the
            query text and the second column as its label (columns are accessed
            by position, not by name).
        top_k (int): Number of most similar domain texts to keep for each query.
            Capped at the number of rows in ``domain_set``.

    Returns:
        DataFrame: One row per (query, retrieved text) pair with the columns
        'id', 'query', 'text', 'label' and 'score', where 'label' is the label of
        the query and 'score' is the cosine similarity rounded to two decimals.
        The columns are present even when no rows are produced.

    Process:
        1. For each query in the query_set, compute its embedding using the PTM.
        2. Calculate the cosine similarity between the query embedding and the
           embeddings of the domain_set.
        3. Select the top_k results based on the highest similarity.
        4. Emit one record per selected result and collect the records in a DataFrame.
    """
    result_columns = ["id", "query", "text", "label", "score"]
    k = min(top_k, len(domain_set))

    # torch.topk returns positions into the embedding matrix, so the matching
    # rows must be read by position. Plain arrays avoid depending on the
    # DataFrame index labels (which only equal positions after a reset_index).
    ids = domain_set["id"].to_numpy()
    texts = domain_set["text"].to_numpy()

    labeled_list = []
    for query in query_set.values:
        query_text, query_label = query[0], query[1]

        # Generate the embedding for the query using the PTM.
        query_embedding = ptm.encode(query_text, convert_to_tensor=True)
        # Cosine similarity between the query and every text of the domain set.
        cos_scores = util.cos_sim(query_embedding, domain_set_embeddings)[0]
        # Keep the k most similar texts.
        top_scores, top_positions = torch.topk(cos_scores, k=k)

        # Move the results to host memory once per query rather than once per hit.
        rounded_scores = np.round(top_scores.cpu().detach().numpy(), 2)
        positions = top_positions.cpu().detach().numpy()

        for score, position in zip(rounded_scores, positions):
            labeled_list.append({
                "id": ids[position],
                "query": query_text,
                "text": texts[position],
                "label": query_label,
                "score": score,
            })

    # Passing the columns explicitly keeps the schema stable for an empty result,
    # so callers can still filter on `score`.
    return pd.DataFrame(labeled_list, columns=result_columns)


In [None]:
# Load the Sentence Transformer model for generating embeddings, using 'all-MiniLM-L6-v2'.
# Use the GPU ('cuda') for accelerated processing when one is available and
# fall back to the CPU otherwise, so the cell also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
ptm = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Generate embeddings for the 'domain_set' dataset using the loaded model.
# The texts are passed as a plain list of strings so that encoding does not
# depend on the DataFrame's index labels; the result is returned as a PyTorch
# tensor for the similarity computations that follow.
domain_set['text'] = domain_set.text.astype('str')
domain_set_embeddings = ptm.encode(domain_set.text.tolist(), convert_to_tensor=True)

# Make sure the output directory exists before any CSV file is written to it.
labeled_dir = Path('./data/labeled/vaccination')
labeled_dir.mkdir(parents=True, exist_ok=True)

# Loop to process semantic labeling for different sizes of top results (k).
# The tqdm progress bar is used to track progress over the range of k values.
for k in tqdm(range(5, 105, 5), desc="Do queries with k values"):
    # Generate a semantically labeled dataset based on the 'k' top results per query.
    k_tweets = semantic_labeling(domain_set, ptm, domain_set_embeddings, query_set, k)

    # Keep only entries whose (rounded) cosine similarity is at most 0.95.
    # This helps to avoid including highly similar results that could bias the model.
    k_tweets = k_tweets[k_tweets.score <= 0.95]

    # Save the labeled dataset as a CSV file named after the value of 'k'.
    k_tweets.to_csv(labeled_dir / f'{k}.csv', index=False)

## 2 - Stance Detection

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm  # Import tqdm for progress bar

# Initialize parameters for the TF-IDF vectorizer.
NGRAM_RANGE = (1, 2)
TOKEN_MODE = 'word'
MIN_DOCUMENT_FREQUENCY = 2
kwargs = {
    'ngram_range': NGRAM_RANGE,
    'strip_accents': 'unicode',
    'decode_error': 'replace',
    'analyzer': TOKEN_MODE,
    'min_df': MIN_DOCUMENT_FREQUENCY,
}


def _stance_scores(clf_report, stance):
    """Return the per-class entry of a classification report for `stance`.

    The class is matched by name, ignoring case and surrounding whitespace,
    instead of by row position, so each metric column is guaranteed to hold
    the scores of the class it is named after.

    Args:
        clf_report (dict): Output of classification_report(..., output_dict=True).
        stance (str): Lower-case stance name, e.g. 'favor'.

    Returns:
        dict: The 'precision', 'recall', 'f1-score' and 'support' of that class.

    Raises:
        ValueError: If the report contains no class with that name.
    """
    for label, scores in clf_report.items():
        # The 'accuracy' entry is a plain float, not a per-class dict.
        if isinstance(scores, dict) and str(label).strip().casefold() == stance:
            return scores
    raise ValueError(
        f"No '{stance}' class found in the classification report "
        f"(available entries: {list(clf_report)}). The label column is expected "
        "to use the values favor / against / none (in any letter case)."
    )


# One dictionary of metrics per value of k; turned into a DataFrame after the loop.
metrics_per_k = []

# Loop over different values of k.
for k in tqdm(range(5, 105, 5), desc="Training SD Models"):
    # Load the training set for the current value of k.
    training_set = pd.read_csv(f'./data/labeled/vaccination/{k}.csv')

    # Vectorize the data. The vocabulary and IDF weights are learned from the
    # training texts only and then applied to the test texts, so that no
    # information from the test set leaks into the features.
    vectorizer = TfidfVectorizer(**kwargs)
    X_train = vectorizer.fit_transform(training_set.text)
    y_train = training_set.label

    X_test = vectorizer.transform(test_set.text)
    y_test = test_set.label

    # Train and predict with the model.
    model = LogisticRegression(class_weight="balanced", solver="liblinear")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Generate the classification report (MLFlow recommended for logging).
    clf_report = classification_report(y_test, y_pred, output_dict=True)
    favor = _stance_scores(clf_report, 'favor')
    against = _stance_scores(clf_report, 'against')
    none = _stance_scores(clf_report, 'none')

    # Store the metrics for the current k value.
    metrics_per_k.append({
        'k': k,
        'precision_favor': favor['precision'],
        'precision_against': against['precision'],
        'precision_none': none['precision'],
        'recall_favor': favor['recall'],
        'recall_against': against['recall'],
        'recall_none': none['recall'],
        'f1_against': against['f1-score'],
        'f1_favor': favor['f1-score'],
        'f1_none': none['f1-score'],
        'f1_macro_avg': clf_report['macro avg']['f1-score'],
        'f1_weighted_avg': clf_report['weighted avg']['f1-score'],
        # SemEval-style macro F1: mean of the favor and against F1 scores only.
        'semeval_macro_f1': (favor['f1-score'] + against['f1-score']) / 2,
    })

# DataFrame with the accumulated metrics, one row per k.
all_metrics = pd.DataFrame(metrics_per_k)

In [None]:
# Show the k values ranked from the best to the worst SemEval macro F1.
all_metrics.sort_values(by="semeval_macro_f1", ascending=False)