# Setup

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [46]:
!pip install --quiet bertopic datasets hdbscan flash-attn torch accelerate requests pyarrow==15.0.0

In [47]:
from bertopic import BERTopic
from datasets import load_dataset, Dataset, DatasetDict
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import torch
from tqdm import tqdm
import requests
import time
import re

# Get Wikipedia topic words

## Get Wikipedia article content based on page title

In [None]:
# Load the raw JSONL dataset from Drive as a Hugging Face dataset ('train' split)
# and keep a pandas copy for the DataFrame-based steps that follow.
full_dataset = load_dataset('json', data_files='/content/drive/MyDrive/master-thesis/data/WikiFactCheckInteractionFull.jsonl', split='train')
full_dataset_df = full_dataset.to_pandas()

In [49]:
def strip_archive_suffix(title):
    """Return `title` without a trailing '/Archive <number>' segment.

    Titles that do not end in such a segment are returned unchanged.
    """
    archive_suffix = re.compile(r'\/Archive \d+$')
    return archive_suffix.sub('', title)

In [51]:
def get_wikipedia_article_content(page_title, timeout=30):
    """Fetch the plain-text extract of an English Wikipedia page.

    Args:
        page_title: Page title; a trailing '/Archive <n>' segment is removed
            before the lookup.
        timeout: Seconds to wait for the HTTP request before giving up. Without
            a timeout a single stalled connection would block the whole crawl.

    Returns:
        The 'extract' text of the first page in the API response, or '' when
        the response contains no extract.

    Raises:
        Exception: If the API answers with a non-200 status code.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    page_title = strip_archive_suffix(page_title)
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "explaintext": True,
        "titles": page_title
    }

    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error fetching data from Wikipedia API: {response.status_code}")

    data = response.json()
    # The API keys pages by page id; only one title is requested, so take the first entry.
    pages = data.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), {})

    return page.get('extract', '')

In [54]:
# Deduplicate the page titles so every article is requested from the API only once
page_titles = full_dataset_df['PAGE-TITLE'].unique()
df_content = pd.DataFrame({'page_title': page_titles})

In [None]:
article_contents = []

def process_batch(batch):
    """Fetch the article text for every page title in `batch`.

    A title whose fetch raises is reported on stdout and contributes an empty
    string, so the returned list always has one entry per title, in order.
    """
    fetched = []
    for title in tqdm(batch, desc="Processing batch"):
        try:
            text = get_wikipedia_article_content(title)
        except Exception as e:
            print(f"Error processing page title {title}: {e}")
            text = ""
        fetched.append(text)
    return fetched

# Fetch the titles in chunks of `batch_size`, pausing one second after each chunk
batch_size = 1000
batches = [
    df_content['page_title'][start:start + batch_size]
    for start in range(0, len(df_content), batch_size)
]

for batch in batches:
    article_contents += process_batch(batch)
    time.sleep(1)

# Store the fetched text next to its page title
df_content['wikipedia_article_content'] = article_contents

### Add contents to original dataset

In [59]:
# Lookup table from page title to fetched article text
hf_content = Dataset.from_pandas(df_content)
article_content_data = dict(zip(hf_content['page_title'], hf_content['wikipedia_article_content']))

In [None]:
# Attach the fetched article text to every row of the full dataset
def add_wikipedia_content(example):
    """Set example['ARTICLE-CONTENT'] to the text stored for its 'PAGE-TITLE'.

    Rows whose title is missing from `article_content_data` get None.
    """
    example['ARTICLE-CONTENT'] = article_content_data.get(example['PAGE-TITLE'])
    return example

full_dataset = full_dataset.map(add_wikipedia_content)

In [None]:
# Predicate used to drop rows without article content
def is_not_empty_content(example):
    """Return True unless example['ARTICLE-CONTENT'] is '' or None."""
    content = example['ARTICLE-CONTENT']
    return content not in ('', None)
# Keep only the rows for which is_not_empty_content returns True.
full_dataset = full_dataset.filter(is_not_empty_content)

# Extract topic words from wikipedia article using TF-IDF

In [None]:
def preprocess_text(text):
    """Replace each run of non-word characters/underscores with one space, then lowercase."""
    collapsed = re.sub(r'[\W_]+', ' ', text)
    return collapsed.lower()

def get_sorted_words(tfidf_matrix, feature_names, doc_index, top_n=1000):
    """Return the highest-scoring terms of one document, best first.

    Only terms with a non-zero score in the document's row are ranked. The row
    is never densified, so terms that do not occur in the document are not
    used to pad the result and the full vocabulary is not sorted per document.

    Args:
        tfidf_matrix: Sparse document-term matrix (one row per document).
        feature_names: Sequence mapping column index to term.
        doc_index: Row of the document to rank.
        top_n: Maximum number of terms to return.

    Returns:
        A list of at most `top_n` terms ordered by descending score; ties keep
        vocabulary (column) order. The list is shorter than `top_n` when the
        document has fewer non-zero terms.
    """
    row = tfidf_matrix[doc_index].tocoo()

    # Explicitly stored zeros carry no weight for this document, so skip them.
    scored_columns = [
        (column, score) for column, score in zip(row.col, row.data) if score != 0
    ]

    # Descending score; column index as tie-breaker keeps the ordering deterministic.
    scored_columns.sort(key=lambda pair: (-pair[1], pair[0]))

    return [feature_names[column] for column, _ in scored_columns[:top_n]]

def process_row(row, tfidf_matrix, feature_names, doc_index, top_n=1000):
    """Store the ranked terms of document `doc_index` in row['SORTED-WORDS'] and return the row."""
    row['SORTED-WORDS'] = get_sorted_words(tfidf_matrix, feature_names, doc_index, top_n=top_n)
    return row

def preprocess_and_process_dataset(full_dataset, top_n_words=1000):
    """Normalize the article text and attach TF-IDF-ranked topic words to every row.

    Args:
        full_dataset: Dataset with an 'ARTICLE-CONTENT' text column.
        top_n_words: Maximum number of ranked words stored per row
            (previously hard-coded to 1000).

    Returns:
        The dataset with 'ARTICLE-CONTENT' passed through `preprocess_text` and
        a new 'SORTED-WORDS' column produced by `process_row`.
    """
    cleaned_dataset = full_dataset.map(
        lambda row: {'ARTICLE-CONTENT': preprocess_text(row['ARTICLE-CONTENT'])}
    )

    article_contents = cleaned_dataset['ARTICLE-CONTENT']

    # Fit the TF-IDF vectorizer on all article content
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(article_contents)
    feature_names = vectorizer.get_feature_names_out()

    # with_indices=True passes the row position, which is used as the matrix row index.
    processed_dataset = cleaned_dataset.map(
        lambda row, idx: process_row(row, tfidf_matrix, feature_names, idx, top_n=top_n_words),
        with_indices=True,
    )

    return processed_dataset


# Run the TF-IDF pipeline on the filtered dataset; process_row adds the
# 'SORTED-WORDS' column to each row.
processed_dataset = preprocess_and_process_dataset(full_dataset)

# Print the resulting dataset object.
print(processed_dataset)

### Example

In [83]:
# Show the page title and the first ten ranked topic words of the row at index 5.
print(f"Page: {processed_dataset[5]['PAGE-TITLE']}\nTop 10 topic words: {processed_dataset[5]['SORTED-WORDS'][:10]}")

Page: Dried fruit
Top 10 topic words: ['dried', 'fruit', 'drying', 'food', 'fruits', 'raisins', 'figs', 'water', 'sun', 'vacuum']


### Save dataset to file so this only has to be done once

In [None]:
# processed_dataset.to_json('/content/drive/MyDrive/master-thesis/data/data_with_topic_words.jsonl', orient='records', lines=True)

# Load in final dataset incl. topic words

In [84]:
# Load the dataset file 'data_with_topic_words.jsonl' from Drive ('train' split),
# then show the dataset object as the cell output.
og_dataset = load_dataset('json', data_files='/content/drive/MyDrive/master-thesis/data/data_with_topic_words.jsonl', split='train')
og_dataset

Dataset({
    features: ['DISCUSSION-ID', 'OUTPUT', 'PAGE-TITLE', 'DISCUSSION-TITLE', 'PAGE-ID', 'COMMENTS', 'PAGE-CONTENT', 'SORTED-WORDS'],
    num_rows: 45374
})

### Filter the top-n topic words out of the comments

In [12]:
def filter_comments(example, n):
    """Add a 'TEXT-CLEAN-EXTRA' field to each comment with the top-n topic words removed.

    The first `n` entries of example['SORTED-WORDS'] are lowercased and used as
    a removal set. Each comment's 'TEXT-CLEAN' is lowercased, split on
    whitespace, stripped of those words and re-joined with single spaces. The
    comment dicts are updated in place and returned under the key 'COMMENTS'.
    """
    excluded = set()
    for word in example['SORTED-WORDS'][:n]:
        excluded.add(word.lower())

    def _without_topic_words(text):
        kept = (token for token in text.lower().split() if token not in excluded)
        return ' '.join(kept)

    processed = []
    for comment in example['COMMENTS']:
        comment['TEXT-CLEAN-EXTRA'] = _without_topic_words(comment['TEXT-CLEAN'])
        processed.append(comment)
    return {'COMMENTS': processed}

In [None]:
# Number of top-ranked topic words to strip from every comment.
n = 100
# Apply filter_comments to every row; each comment gains 'TEXT-CLEAN-EXTRA'.
dataset = og_dataset.map(lambda example: filter_comments(example, n))

In [15]:
# comments: every filtered comment text; discussions: one space-joined string per discussion
comments, discussions = [], []
for example in dataset['COMMENTS']:
    comment_texts = [entry['TEXT-CLEAN-EXTRA'] for entry in example]
    comments += comment_texts
    discussions.append(" ".join(comment_texts))
# Filtered text of the first comment of each discussion
first_comments = [thread[0]['TEXT-CLEAN-EXTRA'] for thread in dataset['COMMENTS']]

In [201]:
# Number of discussion-level documents (one joined string per discussion).
len(discussions)

45374

In [24]:
# Total number of individual comment texts across all discussions.
len(comments)

249591

In [16]:
# Documents to cluster: the discussion-level strings (all filtered comments of
# a discussion joined together), not the individual comments.
texts = discussions

# Clustering with HDBSCAN

In [None]:
# Load Phi-3 model and tokenizer for labeling
model_name = 'microsoft/Phi-3-mini-128k-instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The model is placed on the CUDA device in bfloat16 and asked to use the
# flash_attention_2 attention implementation; trust_remote_code=True allows
# code shipped with the model repository to run.
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        trust_remote_code=True
    )
# Text-generation pipeline used to label topics; return_full_text=False asks for
# only the generated continuation, not the prompt.
labeler = pipeline('text-generation', model=model, tokenizer=tokenizer, return_full_text=False)

In [18]:
# Initialize HDBSCAN with a minimum cluster size of 100 and prediction data enabled
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=100, prediction_data=True)

# Initialize BERTopic with the HDBSCAN clusterer above and a CountVectorizer
# that drops English stop words from the topic representations
vectorizer_model = CountVectorizer(stop_words='english')
topic_model = BERTopic(hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, verbose=True)

In [None]:
# Fit the model on the discussion-level texts; the two results are bound to
# `topics` and `probabilities`.
topics, probabilities = topic_model.fit_transform(texts)

In [20]:
# Extract key terms and representative documents for each topic.
# topics_representations holds one (key_terms, representative_texts) tuple per
# non-outlier topic, in the order of topic_info['Topic'].
topic_info = topic_model.get_topic_info()
topics_representations = []
for topic in topic_info['Topic'].unique():
    # Skip the outlier cluster
    if topic == -1:
        continue
    key_terms = topic_model.get_topic(topic)
    # get_representative_docs returns the document strings themselves, so they
    # must not be used as indices into `texts` (that raises TypeError). Integer
    # ids are still resolved through `texts` for backward compatibility.
    representative_texts = [
        doc if isinstance(doc, str) else texts[doc]
        for doc in topic_model.get_representative_docs(topic)
    ]
    topics_representations.append((key_terms, representative_texts))
print(f'Number of topics: {len(topics_representations)}')

Number of topics: 35


In [21]:
# Note: representative text is currently not used in the prompt
def generate_prompt(key_terms, representative_docs):
    """Build the chat-formatted labeling prompt for one topic.

    Args:
        key_terms: Iterable of (term, score) pairs; only the terms are used,
            joined with ', ' in the given order.
        representative_docs: Accepted for interface compatibility but not read,
            because the prompt is built from the keywords only.

    Returns:
        The prompt string wrapped in <|user|> ... <|end|><|assistant|> markers.
    """
    key_terms_str = ", ".join(term for term, _ in key_terms)
    prompt = f"<|user|>Objective:\nLabel this cluster of Wikipedia Talk Page discussion excerpts with a fact-checking strategy based on the representative keywords given below. First give the argumentation strategy you identified and then explain your answer by highlighting how the given keywords relate to your label.\n\nResponse Template:\nLabel: [Fact-checking strategy]. Explanation: [Cite the keywords that influenced your answer and explain how they relate to the fact-checking strategy.]\n\nRepresentative Keywords:\n{key_terms_str}<|end|><|assistant|>"
    return prompt


In [22]:
# Ask the LLM pipeline for a label for one topic
def label_topic(key_terms, representative_docs):
    """Return the text generated by `labeler` for the prompt built from this topic.

    At most 500 new tokens are generated and a single sequence is requested.
    """
    generations = labeler(
        generate_prompt(key_terms, representative_docs),
        max_new_tokens=500,
        num_return_sequences=1,
    )
    first_generation = generations[0]
    return first_generation['generated_text']

In [None]:
# Generate labels
# One generated label string is collected per entry of `topics_representations`,
# in the same order; when labeling raises, the placeholder "Unknown Topic" is
# stored instead so the positions stay aligned.
topic_labels = []
key_terms = []
for key_terms, docs in tqdm(topics_representations, desc="Generating labels"):
    try:
        label = label_topic(key_terms, docs)
        topic_labels.append(label)
        # NOTE(review): the loop target rebinds `key_terms`, so the list created
        # above is never filled. This call appends the current topic's key-term
        # object to itself, which mutates the object stored in
        # `topics_representations`. Code that reads the key terms afterwards must
        # ignore that trailing self-reference.
        key_terms.append(key_terms)
    except Exception as e:
        print(f"Error generating label for key_terms: {key_terms} with docs: {docs}. Error: {e}")
        topic_labels.append("Unknown Topic")

In [25]:
# Visualize topics; the call is the cell's last expression, so its result is
# shown as the cell output.
topic_model.visualize_topics()

In [34]:
def get_label(input_text):
    """Return the text between 'Label: ' and the first '. Explanation', or None if absent."""
    found = re.search(r'Label: (.*?)\. Explanation', input_text)
    return found.group(1) if found else None

def get_explanation(input_text):
    """Return the rest of the line following the first 'Explanation: ', or None if absent."""
    found = re.search(r'Explanation: (.*)', input_text)
    return found.group(1) if found else None

In [42]:
# Set pandas settings to display all content and not truncate it.
# These are global options: they apply to every DataFrame displayed after this
# cell, removing the limits on column width, row count and column count.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [43]:
# Summary table: one row per topic with the parsed LLM label and its key terms.
keyterms = []
labels = []
explanations = []
for i, (key_terms, doc) in enumerate(topics_representations):
    # A topic's key-term list may contain itself as an extra element (the
    # labeling loop appends the list to itself when labeling succeeds). Skip
    # that element by identity instead of slicing off the last entry, which
    # would drop a real keyword whenever no self-reference is present.
    terms = [entry[0] for entry in key_terms if entry is not key_terms]
    labels.append(get_label(topic_labels[i]))
    explanations.append(get_explanation(topic_labels[i]))
    keyterms.append(", ".join(terms))
    # print(f"Topic {i+1}: {topic_labels[i]}\nKey terms: {key_terms}\n\n")
df_labels = pd.DataFrame({'labels': labels, 'keyterms': keyterms})
# df_labels['explanations'] = explanations

In [44]:
# Display the label / key-term summary table.
df_labels

Unnamed: 0,labels,keyterms
0,Source Verification and Cross-Referencing,"article, sources, war, source, dont, think, just, like, russian, language"
1,Source Verification and Cross-Referencing,"music, album, song, sales, rock, band, article, source, albums, metal"
2,Corroboration and Source Verification,"article, think, field, mass, just, theory, does, energy, correct, dont"
3,Cross-Verification with Multiple Sources,"team, football, game, league, article, club, games, players, teams, player"
4,Source Verification and Cross-Referencing,"church, jesus, article, god, catholic, christian, think, bible, source, sources"
5,Verification of Medical Claims,"article, study, medical, sources, source, studies, research, section, health, dont"
6,Visual Evidence Verification,"image, images, photo, picture, flag, article, use, pictures, photos, think"
7,Geographic Verification and Source Cross-Referencing,"city, area, map, county, town, article, station, bridge, street, population"
8,Investigative Editing and Community Monitoring,"edits, edit, page, vandalism, article, reverted, talk, revert, editor, editors"
9,Verification of Sources and Cross-Referencing,"references, citations, citation, article, tag, reference, page, tags, section, sources"
