In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

## Import Data

In [2]:
# Get the data directory
data_dir = os.path.join(os.getcwd(), "scenario_c_data")

# List all directories in the current directory
topics = [d for d in os.listdir(data_dir) 
                   if os.path.isdir(os.path.join(data_dir, d)) and d != '.ipynb_checkpoints'
              ]

print(topics)


['business', 'entertainment', 'politics', 'sport', 'tech']


In [3]:
# Create a dictionary mapping each topic to its subdirectories
text_dir_dict = {
    topic: [d for d in os.listdir(os.path.join(data_dir, topic))]
    for topic in topics
}

In [4]:
text_dict = {}

for topic in topics:
    # Initialize a dictionary to store file contents for each topic
    text_dict[topic] = []
    
    for filename in text_dir_dict[topic]:
        # Construct the full file path using os.path.join
        file_path = os.path.join(data_dir, topic, filename)
        
        # Check if it's a directory (e.g., '.ipynb_checkpoints') and skip it
        if os.path.isdir(file_path):
            continue
            
        # Open the file and read its content as a string
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
        text_dict[topic].append(content)
        
for topic in text_dict:
    print(f"{topic}: {len(text_dict[topic])}")
        

business: 510
entertainment: 386
politics: 417
sport: 511
tech: 401


## Preprocessing

In [5]:
# Preprocessing function to clean text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove unnecessary characters (basic cleaning)
    text = text.replace("\n", " ")
    
    # Remove URLs
    text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)  # Removes URLs
    
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)  # Removes hashtags
    
    # Remove punctuation and special characters
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Remove extra spaces (including leading and trailing whitespace)
    text = ' '.join(text.split())
    
    return text

In [6]:
# Preprocess the articles in text_dict
for topic in text_dict:
    text_dict[topic] = [preprocess_text(article) for article in text_dict[topic]]
    
# Create a flat list of all articles across topics
all_articles = []
article_labels = []

for topic in text_dict:
    for article in text_dict[topic]:
        all_articles.append(article)
        article_labels.append(topic)  # Keep track of which topic each article belongs to

unique_articles = []
seen_articles = set()

for article, label in zip(all_articles, article_labels):
    if article not in seen_articles:  # Check if the article is a duplicate
        unique_articles.append(article)
        seen_articles.add(article)
        

## TF-IDF

In [7]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

def get_tfidf(text):
    return tfidf_vectorizer.transform([preprocess_text(text)])
    
# Fit and transform all articles to get their TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(unique_articles)

## bert-base-uncased

In [8]:
# Load BERT model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert(text):
    inputs = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    
    with torch.no_grad():
        outputs = bert_model(**inputs)
        
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    
# Encode all articles using BERT with progress tracking
bert_embeddings = np.array([get_bert(article) for article in tqdm(unique_articles, desc="Encoding articles", unit="article")])


Encoding articles: 100%|██████████| 2122/2122 [11:02<00:00,  3.20article/s]


## gte-base-en-v1.5

In [9]:
#NOTE: Requires transformers>=4.36.0

# Load GTE model and tokenizer
gte_tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5')
gte_model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

# Function to get GTE embeddings
def get_gte(text):
    inputs = gte_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=8192)
    
    with torch.no_grad():
        outputs = gte_model(**inputs)
        
    # Use the mean of last_hidden_state across tokens for embedding
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    
    # Optionally normalize embeddings
    return F.normalize(embeddings, p=2, dim=0).numpy()

# Encode all articles using GTE with progress tracking
gte_embeddings = np.array([get_gte(article) for article in unique_articles])

## bge-large-zh-v1.5

In [10]:
# Load the Sentence Transformer model
bge_model = SentenceTransformer('BAAI/bge-large-zh-v1.5')

def get_bge(text_list):
    return bge_model.encode(text_list, normalize_embeddings=True)

bge_embeddings = get_bge(unique_articles)

  return self.fget.__get__(instance, owner)()


## e5-base-v2

In [12]:
# Initialize the tokenizer and model
e5_tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base-v2')
e5_model = AutoModel.from_pretrained('intfloat/e5-base-v2')

def average_pool(last_hidden_states, attention_mask):
    """
    Applies average pooling to the last hidden states of the model.
    Args:
        last_hidden_states (Tensor): The output from the model containing hidden states.
        attention_mask (Tensor): The attention mask to identify real tokens.
    Returns:
        Tensor: The average pooled embeddings.
    """
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def get_e5(text):
    """
    Generates the embeddings for a single text input (query or passage).
    Args:
        text (str or list): Input text or list of texts.
    Returns:
        Tensor: Normalized embeddings.
    """
    inputs = e5_tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    
    with torch.no_grad():
        outputs = e5_model(**inputs)
    
    # Apply average pooling
    embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
    
    # Normalize embeddings
    return F.normalize(embeddings, p=2, dim=1)

e5_embeddings = get_e5(unique_articles)

RuntimeError: [enforce fail at alloc_cpu.cpp:80] data. DefaultCPUAllocator: not enough memory: you tried to allocate 26700939264 bytes.

## Ensembling Outputs

In [45]:
article_embedding_dict = {
    "TF-IDF": tfidf_matrix,
    "bert-base-uncased": bert_embeddings,
    "gte-base-en-v1.5": gte_embeddings,
    "bge-large-zh-v1.5": bge_embeddings
    # "e5-base-v2": e5_embeddings
}

def cosine_similiarity_search(query_embedding, article_embeddings, unique_articles):
    # Compute cosine similarity between the query and all articles
    similarities = cosine_similarity(query_embedding, article_embeddings).flatten()
    
    # Get the indices of the top 5 most similar articles
    top_indices = np.argsort(similarities)[-5:][::-1]
    
    # Create a DataFrame to hold the results
    results_df = pd.DataFrame({
        'Article': np.array(unique_articles)[top_indices],
        'Similarity Score': similarities[top_indices]
    })
    
    return results_df

def input_output_pipeline(query_embedding_dict, embedding_dict, all_articles):
    output_df = pd.DataFrame(columns=["model", "similarity_score", "article"])
    
    for model in tqdm(query_embedding_dict.keys()):
        similarity_df = cosine_similiarity_search(
                            query_embedding_dict[model],
                            article_embedding_dict[model],
                            all_articles
                        )
        
        best_row = pd.DataFrame({
                        "model": [model],
                        "similarity_score": [similarity_df["Similarity Score"].iloc[0]],
                        "article": [similarity_df["Article"].iloc[0]]
                    })
        
        output_df = pd.concat([output_df, best_row])
        
    if output_df['article'].value_counts().max() > 1:
        return output_df['article'].value_counts().idxmax()
    else:
        # Find the highest similarity score
        highest_score = max(output["similarity_score"])
        
        # Get the article with the highest similarity score
        return output[output["similarity_score"] == highest_score].iloc[0]["article"]


## Validating on "London meeting of finance ministers and central bankers"

In [46]:
# Example search query
query = "London meeting of finance ministers and central bankers"

query_embedding_dict = {
    "TF-IDF": get_tfidf(query),
    "bert-base-uncased": [get_bert(query)],
    "gte-base-en-v1.5": [get_gte(query)],
    "bge-large-zh-v1.5": get_bge([query]),
    # "e5-base-v2": get_e5(query)
}

output = input_output_pipeline(query_embedding_dict, article_embedding_dict, unique_articles)
output

100%|██████████| 4/4 [00:00<00:00, 18.89it/s]


'tsunami debt deal to be announced chancellor gordon brown has said he hopes to announce a deal to suspend debt interest repayments by tsunamihit nations later on friday the agreement by the g8 group of wealthy nations would save affected countries £3bn pounds a year he said the deal is thought to have been hammered out on thursday night after japan one of the biggest creditor nations finally signed up to it mr brown first proposed the idea earlier this week g8 ministers are also believed to have agreed to instruct the world bank and the international monetary fund to complete a country by country analysis of the reconstruction problems faced by all states hit by the disaster mr brown has been locked in talks with finance ministers of the g8 which britain now chairs germany also proposed a freeze and canada has begun its own moratorium the expected deal comes as foreign secretary jack straw said the number of britons dead or missing in the disaster have reached 440'

## Validating on "Amit Yoran leaves the Department of# Example search query
query = "Amit Yoran leaves the Department of Homeland Security" Homeland Security"

In [47]:
# Example search query
query = "Amit Yoran leaves the Department of Homeland Security"

query_embedding_dict = {
    "TF-IDF": get_tfidf(query),
    "bert-base-uncased": [get_bert(query)],
    "gte-base-en-v1.5": [get_gte(query)],
    "bge-large-zh-v1.5": get_bge([query]),
    # "e5-base-v2": get_e5(query)
}

output = input_output_pipeline(query_embedding_dict, article_embedding_dict, unique_articles)
output

100%|██████████| 4/4 [00:00<00:00, 19.12it/s]


