In [58]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Import Data

In [59]:
import os

# Get the data directory
data_dir = os.path.join(os.getcwd(), "scenario_c_data")

# List all directories in the current directory
topics = [d for d in os.listdir(data_dir) 
                   if os.path.isdir(os.path.join(data_dir, d)) and d != '.ipynb_checkpoints'
              ]

print(topics)


['business', 'entertainment', 'politics', 'sport', 'tech']


In [60]:
# Create a dictionary mapping each topic to its subdirectories
text_dir_dict = {
    topic: [d for d in os.listdir(os.path.join(data_dir, topic))]
    for topic in topics
}

In [61]:
text_dict = {}

for topic in topics:
    # Initialize a dictionary to store file contents for each topic
    text_dict[topic] = []
    
    for filename in text_dir_dict[topic]:
        # Construct the full file path using os.path.join
        file_path = os.path.join(data_dir, topic, filename)
        
        # Check if it's a directory (e.g., '.ipynb_checkpoints') and skip it
        if os.path.isdir(file_path):
            continue
            
        # Open the file and read its content as a string
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
        text_dict[topic].append(content)
        
for topic in text_dict:
    print(f"{topic}: {len(text_dict[topic])}")
        

business: 510
entertainment: 386
politics: 417
sport: 511
tech: 401


## Preprocessing

In [62]:
# Preprocessing function to clean text
def preprocess_text(text):
    # Convert to lowercase and remove any unnecessary characters (basic cleaning)
    return text.lower().replace("\n", " ").strip()

# Preprocess the articles in text_dict
for topic in text_dict:
    text_dict[topic] = [preprocess_text(article) for article in text_dict[topic]]
    
# Create a flat list of all articles across topics
all_articles = []
article_labels = []

for topic in text_dict:
    for article in text_dict[topic]:
        all_articles.append(article)
        article_labels.append(topic)  # Keep track of which topic each article belongs to

unique_articles = []
unique_labels = []
seen_articles = set()

for article, label in zip(all_articles, article_labels):
    if article not in seen_articles:  # Check if the article is a duplicate
        unique_articles.append(article)
        unique_labels.append(label)
        seen_articles.add(article)

## TF-IDF Vectorizing

In [63]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

def get_tfidf(text):
    return tfidf_vectorizer.transform([preprocess_text(text)])
    
def search_similar_articles_tfidf(query, tfidf_matrix, unique_articles):
    # Tokenize query into tf-idf
    query_tfidf = get_tfidf(query)
    
    # Compute cosine similarity between the query and all articles
    similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    
    # Get the indices of the top 5 most similar articles
    top_indices = np.argsort(similarities)[-5:][::-1]
    
    # Create a DataFrame to hold the results
    results_df = pd.DataFrame({
        'Article': np.array(all_articles)[top_indices],
        'Similarity Score': similarities[top_indices]
    })
    
    return results_df

# Fit and transform all articles to get their TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(unique_articles)

## BERT Embedding

In [7]:
# Load BERT model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embedding(text):
    inputs = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Function to search similar articles using BERT
def search_similar_articles_bert(query, article_embeddings, all_articles):
    query_embedding = get_bert_embedding(query)
    similarities = cosine_similarity([query_embedding], article_embeddings).flatten()
    
    # Get the indices of the top 5 most similar articles
    top_indices = np.argsort(similarities)[-5:][::-1]
    
    # Create a DataFrame to hold the results
    results_df = pd.DataFrame({
        'Article': np.array(all_articles)[top_indices],
        'Similarity Score': similarities[top_indices]
    })
    
    return results_df
    
# Encode all articles using BERT with progress tracking
article_embeddings = np.array([get_bert_embedding(article) for article in tqdm(all_articles, desc="Encoding articles", unit="article")])


Encoding articles: 100%|██████████| 2225/2225 [13:24<00:00,  2.77article/s]


In [14]:
# Function to search similar articles using BERT
def search_similar_articles_bert(query, article_embeddings, all_articles):
    query_embedding = get_bert_embedding(query)
    similarities = cosine_similarity([query_embedding], article_embeddings).flatten()
    
    # Get the indices of the top 5 most similar articles
    top_indices = np.argsort(similarities)[-5:][::-1]
    
    # Create a DataFrame to hold the results
    results_df = pd.DataFrame({
        'Article': np.array(all_articles)[top_indices],
        'Similarity Score': similarities[top_indices]
    })
    
    return results_df

## "London meeting of finance ministers and central bankers"

In [64]:
# Example search query
query = "London meeting of finance ministers and central bankers"

In [65]:
# Get the most similar article and its similarity score
tf_idf_df= search_similar_articles_tfidf(query, tfidf_matrix, unique_articles)
tf_idf_df

Unnamed: 0,Article,Similarity Score
0,eu ministers to mull jet fuel tax european un...,0.15893
1,economy 'strong' in election year uk business...,0.148369
2,eu aiming to fuel development aid european un...,0.142033
3,"india calls for fair trade rules india, which...",0.131749
4,huge rush for jet airways shares indian airli...,0.128655


In [66]:
# Get the most similar article and its similarity score
bert_df = search_similar_articles_bert(query, article_embeddings, unique_articles)
bert_df

Unnamed: 0,Article,Similarity Score
0,profile: david blunkett before he resigned th...,0.640777
1,eu ministers to mull jet fuel tax european un...,0.634139
2,bank voted 8-1 for no rate change the decisio...,0.633243
3,us trade gap ballooned in october the us trad...,0.628107
4,gold falls on imf sale concerns the price of ...,0.627473


## "Amit Yoran leaves the Department of# Example search query
query = "Amit Yoran leaves the Department of Homeland Security" Homeland Security"

In [67]:
# Example search query
query = "Amit Yoran leaves the Department of Homeland Security"

In [68]:
# Get the most similar article and its similarity score
tf_idf_df= search_similar_articles_tfidf(query, tfidf_matrix, unique_articles)
tf_idf_df

Unnamed: 0,Article,Similarity Score
0,rivals of the £400 apple... the mac mini is t...,0.536972
1,microsoft seeking spyware trojan microsoft is...,0.079572
2,bush to outline 'toughest' budget president b...,0.079548
3,slovakia reach hopman cup final slovakia will...,0.079026
4,go-ahead for new internet names the internet ...,0.073931


In [69]:
# Get the most similar article and its similarity score
bert_df = search_similar_articles_bert(query, article_embeddings, unique_articles)
bert_df

Unnamed: 0,Article,Similarity Score
0,us bank 'loses' customer details the bank of ...,0.644067
1,us trade gap ballooned in october the us trad...,0.634573
2,confusion over high-definition tv now that a ...,0.633893
3,nasdaq planning $100m share sale the owner of...,0.633829
4,banker loses sexism claim a former executive ...,0.633829
