In [None]:
!pip install bertopic
!pip install -U sentence-transformers

In [142]:
import csv

def read_csv_to_list_of_lists(file_path, skip_header=True, encoding='utf-8'):
    """Read a CSV file into a list of rows, each row being a list of strings.

    Args:
        file_path: Path of the CSV file to read.
        skip_header: When True (the default) the first row is dropped.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII text is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list containing one list of field strings per CSV row. The list is
        empty when the file has no rows (or only a header that is skipped).
    """
    # newline='' is what the csv module requires of its input file: without it,
    # universal-newline translation rewrites line breaks embedded in quoted fields.
    with open(file_path, 'r', newline='', encoding=encoding) as file:
        csv_reader = csv.reader(file)
        if skip_header:
            # The default of None keeps an empty file from raising StopIteration.
            next(csv_reader, None)
        return list(csv_reader)
# CSV input for this notebook (the file name suggests hate-related n-grams).
# NOTE(review): `<lang>` looks like an unfilled template placeholder — substitute
# the corpus variant before running.
csv_file_path = 'MetaHate<lang>/MetaHate<lang>_hate_ngrams.csv'

# Rows of the CSV as lists of strings; the header row is dropped because
# skip_header is left at its default of True.
data_ready = read_csv_to_list_of_lists(file_path=csv_file_path)

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Sentence encoder used to embed every document before topic modelling.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Each row of `data_ready` is a list of strings; join them with spaces so that
# every row becomes a single document string.
processed_docs = list(map(" ".join, data_ready))
embeddings = model.encode(processed_docs, show_progress_bar=True)

# The embeddings computed above are handed to BERTopic together with the documents.
# NOTE(review): `<lang>` looks like an unfilled template placeholder for the language.
topic_model = BERTopic(
    language="<lang>",
    calculate_probabilities=True,
    verbose=True,
)
topics, probabilities = topic_model.fit_transform(processed_docs, embeddings=embeddings)

In [None]:
# Print what the model reports for topic ids 0 through 9, one line per topic.
print("Topics and their top words:")
for topic_id in range(10):
    top_words = topic_model.get_topic(topic_id)
    print(f"Topic {topic_id}: {top_words}")

In [None]:
# Bare last expression: the notebook renders the returned topic overview as this
# cell's output.
topic_model.get_topic_info()

In [None]:
# Show top words for each of the top 10 topics.
# `top_10_topics` was not defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel; it is now built here from the topic overview.
# NOTE(review): assumes get_topic_info() lists topics by descending size and uses
# Topic == -1 for the outlier bucket (as described in the BERTopic docs) — confirm
# that dropping -1 matches the selection originally intended.
topic_info = topic_model.get_topic_info()
top_10_topics = topic_info[topic_info.Topic != -1].head(10)

for topic_id in top_10_topics.Topic:
    print(f"\nTopic {topic_id}:")
    print(topic_model.get_topic(topic_id))

In [148]:
# Human-readable labels for topic ids 0-9, one dictionary per corpus variant.
# The dictionaries are named `topics_labels_*` so they agree with
# `topics_labels_gl_pt` / `topics_labels_en` and with the `topics_labels_...`
# lookup in the word-cloud cell; the earlier `topic_labels_*` spellings are kept
# as aliases at the end of this block.
topics_labels_es = {
    0: "Feminism & Gender Equality",
    1: "Immigration & Politics",
    2: "Viral Videos & Social Sharing",
    3: "Economic Complaints & Public Spending",
    4: "Racism & Racial Identity",
    5: "Personal Advice & Life Topics",
    6: "Gender & Emotional Expressions",
    7: "Online Insults in Sports",
    8: "Misogynistic Slurs & Aggression",
    9: "Political Extremism & Fascism"
}
topics_labels_pt = {
    0: "Feminism & Gender Equality",
    1: "Videos & Online Content",
    2: "Everyday Conversations & Interactions",
    3: "Insults & Personal Offenses",
    4: "Translation & Language",
    5: "English Phrases & Interpretation",
    6: "Racism & Ethnicity",
    7: "Insults & Cultural/Sports References",
    8: "Sexuality & Sexual Harassment",
    9: "Mixed Insults"
}
topics_labels_gl_es = {
    0: "Informal Speech",
    1: "Automotive & Transportation",
    2: "Feminism & Gender Equality",
    3: "Hate Speech & Hostility",
    4: "Insults & Offensive Language",
    5: "Immigration & Social Issues",
    6: "Racism & Discrimination",
    7: "Fascism & Political Ideology",
    8: "Videos & Content Sharing",
    9: "Offensive Language & Personal Attacks"
}

# Backward-compatible aliases for the names these dictionaries used to have.
topic_labels_es = topics_labels_es
topic_labels_pt = topics_labels_pt
topic_labels_gl_es = topics_labels_gl_es

# Labels for topic ids 0-9 of the GL_pt variant; the position of each label in
# the list is its topic id.
topics_labels_gl_pt = dict(enumerate([
    "Informal Expressions",
    "Feminism & Gender Advocacy",
    "Sexism & Gender Stereotypes",
    "Cars & Gendered Interactions",
    "Misogyny & Traditional Gender Roles",
    "Online Videos",
    "Racism & Racial Discrimination",
    "Translation & Language Discussion",
    "Patriarchy & Masculinity",
    "Unemployment & Social Disgust",
]))

# Labels for topic ids 0-9 of the EN variant; the position of each label in the
# list is its topic id.
topics_labels_en = dict(enumerate([
    "Wikipedia Moderation & Administration",
    "Anti-Black Racism & Race Discourse",
    "Nihilism, Hate Speech & Extremist Rhetoric",
    "Misogyny & Homophobia",
    "Pop Culture & Entertainment",
    "White Supremacy & White Genocide Narrative",
    "Islamophobia & Anti-Muslim Sentiment",
    "Aggression & Slurs",
    "US Politics & Trump Discourse",
    "Anti-Vandalism & Editorial Conflict",
]))

# Attach the `topics_labels_en` dictionary to the model as its topic labels.
# NOTE(review): the label set is hard-coded to `_en` while the save path below
# carries a `<lang>` placeholder — confirm the two are switched together.
topic_model.set_topic_labels(topics_labels_en)
# Persist the model, embedding model included, under the MetaHate<lang> directory.
topic_model.save("MetaHate<lang>/MetaHate<lang>_bert_topic_model_hate", save_embedding_model=True)

In [30]:
from bertopic import BERTopic

# Reload the topic model from the same path that topic_model.save(...) writes to.
# NOTE(review): BERTopic's default on-disk format is presumably pickle-based — only
# load model files from a trusted source; confirm against the BERTopic docs.
topic_model = BERTopic.load("MetaHate<lang>/MetaHate<lang>_bert_topic_model_hate")

In [20]:
from nltk.corpus import stopwords
# NLTK stopword lists for English, Spanish and Portuguese, in that order.
stop_words_en, stop_words_es, stop_words_pt = (
    stopwords.words(language_name)
    for language_name in ('english', 'spanish', 'portuguese')
)

## Word Cloud

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud
import matplotlib.colors as mcolors
import textwrap

cols = ["#f9cce0", "#f397c2", "#ec639f", "#d2047b", "#a00361", "#700248"]


cloud = WordCloud(stopwords=stop_words_pt,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[num],
                  prefer_horizontal=1.0)

topics_dict = topic_model.get_topics()
selected_indices = [0, 1, 4, 6, 7, 8] # ES
#selected_indices = [0, 3, 6, 7, 8, 9] # PT
#selected_indices = [2, 3, 4, 5, 6, 9] # GL_es
#selected_indices = [1, 2, 3, 4, 6, 8] # GL_pt
#selected_indices = [1, 2, 3, 5, 6, 7] # EN

topic_labels = topics_labels_<lang>
fig, axes = plt.subplots(2, 3, figsize=(10,10), sharex=True, sharey=True)

num = 0
for i, (ax, topic_idx) in enumerate(zip(axes.flatten(), selected_indices)):
    fig.add_subplot(ax)
    if topic_idx in topics_dict:
        topic_words = dict(topics_dict[topic_idx])
        cloud.generate_from_frequencies(topic_words, max_font_size=600)
        topic_label = topic_labels.get(topic_idx, f"Topic {topic_idx}")
        plt.gca().imshow(cloud)
        wrapped_label = "\n".join(textwrap.wrap(topic_label, width=20))
        plt.gca().set_title(wrapped_label, fontdict=dict(size=16))
        plt.gca().axis('off')
        num += 1


plt.subplots_adjust(wspace=-100, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# t-SNE clustering
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import joblib

# Rebuild one space-joined document string per row of `data_ready`.
docs = [" ".join(row_tokens) for row_tokens in data_ready]

# Embed the documents with the sentence-transformers model of the given name.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(sentences=docs, show_progress_bar=True)

In [None]:
import numpy as np
import inspect

# Topic assignments and per-document topic probabilities from the topic model,
# reusing the precomputed embeddings.
topics, probs = topic_model.transform(docs, embeddings=embeddings)
probs = np.array(probs)
print(probs.shape) 

# NOTE(review): this ES selection differs from the ES `selected_indices` list used
# in the word-cloud cell ([0, 1, 4, 6, 7, 8]) — confirm which one is intended.
selected_topics = [0, 1, 3, 4, 6, 8] # ES
#selected_topics = [0, 3, 6, 7, 8, 9] # PT
#selected_topics = [2, 3, 4, 5, 6, 9] # GL_es
#selected_topics = [1, 2, 3, 4, 6, 8] # GL_pt
#selected_topics = [1, 2, 3, 5, 6, 7] # EN

# Keep only the columns of the selected topics, then rescale every row so that
# its entries sum to 1 (L1 normalisation).
probs = probs[:, selected_topics]
probs_selected = normalize(probs, norm='l1', axis=1)
print(probs_selected.shape) 

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# releases drop the old name, so use whichever keyword the installed version has.
tsne_iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"

tsne_model = TSNE(
    n_components=2,
    learning_rate=50,
    verbose=1,
    random_state=0,
    angle=.5,
    init='pca',
    perplexity=30, # 40
    **{tsne_iter_kwarg: 1500}, # 1000
)
# 2-D coordinates, one row per document.
tsne_lda = tsne_model.fit_transform(probs_selected)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from matplotlib.patches import Patch

# Colour every document by the selected topic with the highest normalised
# probability; `topic_num` indexes both `colors` and `selected_topics`.
topic_num = np.argmax(probs_selected, axis=1)
colors = np.array(["#f9cce0", "#f397c2", "#ec639f", "#d2047b", "#a00361", "#700248"])

fig, ax = plt.subplots(figsize=(8, 8), dpi=300)
scatter = ax.scatter(tsne_lda[:,0], tsne_lda[:,1], c=colors[topic_num], s=8)

# scatter.legend_elements(prop='colors') needs a colour-mapped (numeric `c`)
# scatter; with explicit colour strings it only warns and returns no handles, and
# the handles were never passed to a legend. Build the legend entries explicitly.
# NOTE(review): entries are labelled with the raw topic ids from `selected_topics`;
# swap in the matching label dictionary if descriptive names are wanted.
handles = [
    Patch(color=colors[position], label=f"Topic {topic_id}")
    for position, topic_id in enumerate(selected_topics)
]
ax.legend(handles=handles, loc="best", fontsize=8, frameon=False)

fig.suptitle("MetaHate<lang> - t-SNE Clustering of Hate Topics")
ax.axis("off")
plt.show()