# Installations and imports

In [None]:
!pip install tensorflow tensorflow-hub pandas

In [None]:
!pip install --upgrade transformers

In [None]:
!pip install bertopic

In [None]:
!pip install nltk

In [None]:
!pip install gensim

In [None]:
pip install numpy==1.26.4

In [None]:
!pip install pandas==2.2.2

In [None]:
!pip install google-cloud-translate

In [None]:
!pip install --upgrade httpx

In [None]:
!pip install googletrans==4.0.0-rc1

In [None]:
!pip install --upgrade httpx

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
import os
import time
import csv
import requests
from google.cloud import translate_v2 as translate
from googletrans import Translator
from tqdm.auto import tqdm
import re
import ast
import string
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
from collections import defaultdict
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
import html
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity
import spacy

#Importing packages for LDA
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
import logging
import sys
import matplotlib.pyplot as plt
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
import networkx as nx

# Packages for fine tuning BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances

# For punctuation
from transformers import pipeline
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# For embeddings visualization
import umap
from sklearn.manifold import TSNE

# Importing collected transcripts

In [None]:
# Read all collected dfs
df_1 = pd.read_csv() # File location was removed
df_1.head(1)

In [None]:
df_2 = pd.read_csv() # File location was removed
df_2.head(1)

In [None]:
df_3 = pd.read_csv() # File location was removed
df_3.head(1)

In [None]:
df_4 = pd.read_csv() # File location was removed
df_4.head(1)

In [None]:
print(df_1.shape)
print(df_2.shape)
print(df_3.shape)
print(df_4.shape)

In [None]:
# Drop unnecessary columns and merge dfs

# Preparing the dfs for merge
df_1 = df_1.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df_2 = df_2.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.2'])
df_3 = df_3.drop(columns=['Unnamed: 0'])
df_4 = df_4.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

# Rename transcript column in df4
df_4.rename(columns={"transcripts": "transcript"}, inplace=True)

# Merging
df_combined = pd.concat([df_1, df_2, df_3, df_4], axis=0, ignore_index=True)
df_combined.shape

In [None]:
# Counting the duplicated video ids

value_counts = df_combined['video_id'].value_counts()
duplicates = value_counts[value_counts > 1]
print(len(duplicates))

In [None]:
# Drop duplicated video ids

df_combined = df_combined.drop_duplicates(subset=['video_id'], keep='first')
df_combined.shape

In [None]:
# Count transcripts with errors
error_count = df_combined['transcript'].str.startswith("Error: ").sum()
print("Number of error rows:", error_count)

In [None]:
# Drop rows with errors
mask = ~df_combined['transcript'].fillna("").str.startswith("Error: ")
df_combined = df_combined[mask]

df_combined.shape

In [None]:
df_combined.head()

In [None]:
# Check for empty transcripts

nan_count = df_combined["transcript"].isna().sum()
empty_string_count = (df_combined["transcript"] == "").sum()
total_count = nan_count + empty_string_count

print(f"Number of NaN values: {nan_count}")
print(f"Number of empty strings: {empty_string_count}")
print(f"Total count of NaN and empty strings: {total_count}")

In [None]:
# Drop NaN transcripts

df_combined = df_combined.dropna(subset=['transcript'])
df_combined.shape

# Exploratory data analysis

In [None]:
df_combined['channel'].value_counts()

In [None]:
# Replace channel ids by channel name

# YouTube channel id -> readable channel name
channel_mapping = {
    'UCpSJ5fGhmAME9Kx2D3ZvN3Q': 'Latina Noticias',
    'UCPhm2I2wk4vqjENwhn3px8A': 'America Noticias',
    'UC-B7Xv56uNRDkj0vC3QW8Cg': 'La Republica',
    'UCYG5uXS3xdsoaXIxum1pAEw': 'ATV Noticias',
    'UCLtGUPjKLqa3zgdmhKCZONg': 'El Comercio',
    'UC5j8-2FT0ZMMBkmK72R4aeA': 'RPP Noticias'
    }

# `replace` leaves values that are not keys of the mapping untouched, whereas `map` turns
# them into NaN. This keeps the cell safe to re-run (already-renamed channels are preserved)
# and keeps any channel id that is missing from the mapping visible in the counts below.
df_combined['channel'] = df_combined['channel'].replace(channel_mapping)
df_combined['channel'].value_counts()

In [None]:
# Overview of monthly videos per channel

# Pre processing and grouping: one row per month, one column per channel, values = number of videos
df_combined['video_date'] = pd.to_datetime(df_combined['video_date'])
df_combined['month_year'] = df_combined['video_date'].dt.to_period('M')
grouped = df_combined.groupby(['month_year', 'channel']).agg({'video_id': 'count'}).reset_index()
pivot_table = grouped.pivot(index='month_year', columns='channel', values='video_id').fillna(0)
# Periods are converted to strings so that they render as plain tick labels
pivot_table.index = pivot_table.index.astype(str)

# Plot
pivot_table.plot(kind='bar', figsize=(12, 6), width=0.8)
plt.title('Number of Video IDs by Month-Year and Channel', fontsize=16)
plt.xlabel('Month-Year', fontsize=14)
plt.ylabel('Number of Videos', fontsize=14)
plt.xticks(rotation=45, fontsize=8)
# The 'channel' column holds channel names at this point (ids were replaced earlier),
# so the legend is titled accordingly
plt.legend(title='Channel', fontsize=12)
plt.tight_layout()

plt.show()

In [None]:
# Checking transcripts length

df_combined['transcript_length'] = df_combined['transcript'].str.len()


plt.hist(df_combined['transcript_length'], bins=20, edgecolor='black')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.title('Histogram of Transcript Length')
plt.show()

In [None]:
# Checking transcripts length

df_combined['transcript_length'] = df_combined['transcript'].str.len()

df_filtered = df_combined[df_combined['transcript_length']<=20000]

plt.hist(df_filtered['transcript_length'], bins=30, edgecolor='black')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.title('Histogram of Transcript Length')
plt.show()

In [None]:
# Cutting down to videos below 10k characters (to avoid full tv show episodes and other longer videos)

len(df_combined[df_combined['transcript_length'] <= 10000])

In [None]:
df_combined.shape

In [None]:
df_combined = df_combined[df_combined['transcript_length'] <= 10000]
df_combined.shape

In [None]:
# Checking the numbers of shorts

count = df_combined['title'].str.contains('#short', case=False, na=False).sum()
print(f"Number of titles containing '#short': {count}")

In [None]:
# Checking if all shorts include #short in their title

# API credentials are read from the environment so the key is never stored in the notebook.
# NOTE(review): the environment variable names are placeholders — adapt them to your setup.
API_KEY = os.environ["YOUTUBE_API_KEY"]
VIDEO_ID = os.environ["YOUTUBE_VIDEO_ID"]  # id of the video to look up

# YouTube Data API v3 `videos` endpoint; `params` takes care of URL-encoding the query string
url = "https://www.googleapis.com/youtube/v3/videos"

# Make the request (the timeout prevents the cell from hanging forever on a stalled connection)
response = requests.get(
    url,
    params={"part": "snippet", "id": VIDEO_ID, "key": API_KEY},
    timeout=30,
)
# Surface HTTP errors (bad key, exceeded quota, ...) instead of reporting them as "video not found"
response.raise_for_status()
data = response.json()

# Extract the title
items = data.get('items', [])
if items:
    video_title = items[0]['snippet']['title']
    print(f"Video Title: {video_title}")
else:
    print("Video not found or invalid ID.")

In [None]:
# Checking length of videos with '#short' in their title

# `.copy()` makes the subset an independent DataFrame, so adding a column below neither
# triggers a SettingWithCopyWarning nor depends on whether pandas returned a view
is_short = df_combined["title"].str.contains("#short", case=False, na=False)
short_videos = df_combined[is_short].copy()

# Calculate the length of the transcript
short_videos["transcript_length"] = short_videos["transcript"].str.len()

# Plot the histogram
plt.hist(short_videos["transcript_length"], bins=10, color='skyblue', edgecolor='black')
plt.title("Histogram of Transcript Lengths for '#short' Videos")
plt.xlabel("Transcript Length")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Testing how big would the sample be if we remove videos below 1k characters transcripts
# 82% of original dataset

filtered_df = df_combined[df_combined["transcript_length"] >= 1000]
remaining_rows = len(filtered_df)
print(f"Number of rows remaining: {remaining_rows}")

In [None]:
# Removing shorts

df_combined = df_combined[df_combined["transcript_length"] >= 1000]

In [None]:
df_combined.shape

# Preprocessing transcripts in Spanish

In [None]:
df_combined.head()

In [None]:
# Function for cleaning the transcripts

def preprocess_spanish(text):
    """Clean a raw Spanish transcript and return it as a single normalized string.

    Steps, in order:
      1. A transcript stored as a stringified Python list (``"['a', 'b']"``) or passed as an
         actual list is joined into one space-separated string; any other non-string value
         is converted with ``str``.
      2. The caption tags ``[Música]`` and ``[Aplausos]`` are replaced by a space.
      3. A space is inserted before every ASCII capital letter (sentences are often glued
         together in the captions) and after every comma and period.
      4. Runs of whitespace are collapsed and the result is stripped.

    Args:
        text: the raw transcript (normally a ``str``; lists and other values are tolerated).

    Returns:
        The cleaned transcript as a ``str``.
    """
    # Join transcripts that are lists into a single string. This runs before any regex so
    # that non-string input no longer crashes in `re.sub`.
    if isinstance(text, str) and text.startswith("[") and text.endswith("]"):
        try:
            text = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            # Bracketed text that is not a Python literal (e.g. "[Risas] ... [Risas]")
            # is an ordinary transcript: keep it as a string.
            pass
    if isinstance(text, (list, tuple)):
        text = " ".join(map(str, text))
    if not isinstance(text, str):
        text = str(text)

    # Remove "[Música]" (music) and "[Aplausos]" (applause)
    text = re.sub(r'\[Música\]', ' ', text)
    text = re.sub(r'\[Aplausos\]', ' ', text)

    # Split at capital letters (some sentences are together)
    text = " ".join(re.split(r'(?=[A-Z])', text))

    # Make sure every comma and period is followed by a space
    text = text.replace(',', ', ').replace('.', '. ')

    # Remove double spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
tqdm.pandas()
df_combined['clean_transcript']=df_combined['transcript'].progress_apply(lambda x:preprocess_spanish(x))
df_combined.head()

In [None]:
# Sample the df to try punctuation first

sample_df = df_combined.sample(n=1000, random_state=42)
sample_df.head()

In [None]:
# Pre trained model for adding punctuation to transcripts

tokenizer = AutoTokenizer.from_pretrained("kredor/punctuate-all")
model = AutoModelForTokenClassification.from_pretrained("kredor/punctuate-all")

def restore_punctuation_raw(text):
    """Run the punctuation model on one piece of text and return its raw token-level output.

    The text is tokenized with the module-level ``tokenizer`` (truncated to 512 tokens, so
    longer inputs lose their tail) and classified token by token with the module-level
    ``model``. Each token is emitted as is when its predicted class is 0, otherwise with
    the predicted punctuation mark appended.

    The mark is looked up in ``model.config.id2label``. The predicted value is a class
    index, not a vocabulary id, so decoding it with ``tokenizer.decode`` returns an
    unrelated vocabulary entry instead of the predicted mark.
    NOTE(review): this assumes class 0 means "no punctuation" and that the other labels are
    the literal punctuation characters — confirm against the model card.

    Args:
        text: the text to punctuate.

    Returns:
        A string of space-separated sub-word tokens (special tokens and sub-word markers
        included), each optionally followed by a punctuation mark.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: no_grad avoids building the autograd graph for every call
    with torch.no_grad():
        outputs = model(**inputs)
    label_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    id2label = model.config.id2label

    pieces = []
    for token, label_id in zip(tokens, label_ids):
        if label_id == 0:
            pieces.append(token)
        else:
            pieces.append(token + str(id2label[label_id]).strip())

    return " ".join(pieces)

def restore_punctuation_large_text(text, max_length=2500):
    """Restore punctuation on a text of arbitrary length, chunk by chunk.

    ``restore_punctuation_raw`` truncates its input to 512 tokens, so cutting the text into
    fixed-size character chunks silently drops whatever part of a chunk exceeds that limit.
    Here the text is split on whitespace and words are accumulated into chunks that stay
    within both ``max_length`` characters and a token budget below the model limit. No text
    is discarded and no word is cut in half.

    Args:
        text: the text to punctuate.
        max_length: maximum number of characters per chunk.

    Returns:
        The raw model output of all chunks joined by a space. A chunk that fails to be
        processed is kept unpunctuated (the error is printed).
    """
    # Kept below 512 to leave room for the special tokens added when the chunk is tokenized
    # again inside restore_punctuation_raw.
    token_budget = 500

    chunks = []
    current_words, current_tokens, current_chars = [], 0, 0
    for word in text.split():
        word_tokens = len(tokenizer.tokenize(word))
        word_chars = len(word) + 1  # +1 for the space that joins it to the previous word
        exceeds_limits = (current_tokens + word_tokens > token_budget
                          or current_chars + word_chars > max_length)
        if current_words and exceeds_limits:
            chunks.append(" ".join(current_words))
            current_words, current_tokens, current_chars = [], 0, 0
        current_words.append(word)
        current_tokens += word_tokens
        current_chars += word_chars
    if current_words:
        chunks.append(" ".join(current_words))

    restored_chunks = []
    for chunk in chunks:
        try:
            restored_chunks.append(restore_punctuation_raw(chunk))
        except Exception as e:
            # Best effort: keep the unpunctuated chunk rather than losing the text
            print(f"Error processing chunk: {e}")
            restored_chunks.append(chunk)

    restored_text = ' '.join(restored_chunks)

    return restored_text

In [None]:
# New version of punctuation for large texts

def new_restore_punctuation_large_text(text, max_length=2500, overlap=200):
    """Restore punctuation on a long text using overlapping character chunks.

    The text is cut into chunks of ``max_length`` characters whose start positions are
    ``max_length - overlap`` characters apart, so consecutive chunks share ``overlap``
    characters. Each chunk is punctuated with ``restore_punctuation_raw`` and the results
    are joined with a space. The shared characters therefore appear in two chunks of the
    output.

    Args:
        text: the text to punctuate.
        max_length: number of characters per chunk.
        overlap: number of characters shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_length``.

    Returns:
        The raw model output of all chunks joined by a space. A chunk that fails to be
        processed is kept unpunctuated (the error is printed).

    Raises:
        ValueError: if ``overlap`` is negative or not smaller than ``max_length``. The chunk
            step would otherwise be zero or negative and the text would be lost.
    """
    if overlap < 0 or overlap >= max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    step = max_length - overlap
    chunks = [text[start:start + max_length] for start in range(0, len(text), step)]

    restored_chunks = []
    for chunk in chunks:
        try:
            restored_chunks.append(restore_punctuation_raw(chunk))
        except Exception as e:
            # Best effort: keep the unpunctuated chunk rather than losing the text
            print(f"Error processing chunk: {e}")
            restored_chunks.append(chunk)

    restored_text = ' '.join(restored_chunks)
    return restored_text

In [None]:
# Function for cleaning up punctuated output

def clean_punctuated_output(raw_output):
    """Turn the raw token-level output of the punctuation step into readable text.

    Steps, in order:
      1. Every literal ``<pad>`` becomes a period.
      2. Any remaining ``<...>`` special token is removed.
      3. All whitespace is removed and each sub-word marker ``▁`` becomes a space, which
         glues the sub-word pieces of a word back together.
      4. Periods and commas that sit between two non-space characters (i.e. inside a word)
         are removed. A period or comma at the very end of the text is kept.
      5. Whitespace is normalized.

    Args:
        raw_output: space-separated tokens, possibly containing special tokens.

    Returns:
        The cleaned text.
    """
    # Replace <pad> with dots
    text = raw_output.replace("<pad>", ".")

    # Remove special tokens
    text = re.sub(r'<[^>]+>', '', text)

    # Remove all whitespace, then turn the sub-word markers (▁) into the real word separators
    text = re.sub(r'\s+', '', text)
    text = text.replace("▁", " ")

    # Remove unintended periods/commas within words. The lookahead requires a following
    # non-space character, so the mark that ends the text is not stripped.
    text = re.sub(r'(?<!\s)\.(?=\S)', '', text)
    text = re.sub(r'(?<!\s),(?=\S)', '', text)

    # Normalize spaces
    final_output = re.sub(r'\s+', ' ', text).strip()

    return final_output

In [None]:
def process_punctuation(row):
    """Punctuate one transcript: chunked punctuation restoration followed by clean-up.

    Args:
        row: the transcript text (one cell of the transcript column).

    Returns:
        The output of ``restore_punctuation_large_text`` passed through
        ``clean_punctuated_output``.
    """
    return clean_punctuated_output(restore_punctuation_large_text(row))

In [None]:
# New version for validation

def new_process_punctuation(row):
    """Punctuate one transcript with the overlapping-chunk variant, then clean it up.

    Args:
        row: the transcript text (one cell of the transcript column).

    Returns:
        The output of ``new_restore_punctuation_large_text`` passed through
        ``clean_punctuated_output``.
    """
    return clean_punctuated_output(new_restore_punctuation_large_text(row))

In [None]:
# Punctuate the cleaned transcripts of the sample (progress bar via tqdm's progress_apply).
# NOTE(review): the result is stored in 'punctuated_transcript_2', while the translation cells
# further down read sample_df['punctuated_transcript'] — confirm which column name is intended.
sample_df['punctuated_transcript_2'] = sample_df['clean_transcript'].progress_apply(process_punctuation)

sample_df.head()

In [None]:
# Download file to validate

sample_df.to_csv("sample_df.csv", encoding='utf-8-sig')

In [None]:
# Splitting df in ten parts to run punctuation

# The positional row numbers are split instead of the DataFrame itself: np.array_split on a
# DataFrame raises a FutureWarning on pandas 2.x. The resulting partition is the same.
# Each part is an independent copy, so adding the punctuated column to it later does not
# trigger a SettingWithCopyWarning.
row_groups = np.array_split(np.arange(len(df_combined)), 10)
split_df = [df_combined.iloc[rows].copy() for rows in row_groups]

(df_combined_1, df_combined_2, df_combined_3, df_combined_4, df_combined_5,
 df_combined_6, df_combined_7, df_combined_8, df_combined_9, df_combined_10) = split_df

In [None]:
# Run punctuation function on first part

df_combined_1['punctuated_transcript'] = df_combined_1['clean_transcript'].progress_apply(process_punctuation)

In [None]:
# Download first part as backup

df_combined_1.to_csv() # File location was removed

In [None]:
# Run punctuation function on second part

df_combined_2['punctuated_transcript'] = df_combined_2['clean_transcript'].progress_apply(process_punctuation)

In [None]:
df_combined_2.to_csv() # File location was removed

In [None]:
# Run punctuation function on third part

df_combined_3['punctuated_transcript'] = df_combined_3['clean_transcript'].progress_apply(process_punctuation)

In [None]:
df_combined_3.to_csv() # File location was removed

In [None]:
# Run punctuation function on fourth part

df_combined_4['punctuated_transcript'] = df_combined_4['clean_transcript'].progress_apply(process_punctuation)

In [None]:
df_combined_4.to_csv() # File location was removed

In [None]:
# Run punctuation function on 5th part

df_combined_5['punctuated_transcript'] = df_combined_5['clean_transcript'].progress_apply(process_punctuation)

In [None]:
df_combined_5.to_csv() # File location was removed

In [None]:
# Run punctuation function on 6th part

df_combined_6['punctuated_transcript'] = df_combined_6['clean_transcript'].progress_apply(process_punctuation)

In [None]:
df_combined_6.to_csv() # File location was removed

In [None]:
# Run punctuation function on 7th part
df_combined_7['punctuated_transcript'] = df_combined_7['clean_transcript'].progress_apply(process_punctuation)

In [None]:
df_combined_7.to_csv() # File location was removed

In [None]:
# Read file with 8th part

df_combined_8 = pd.read_csv() # File location was removed
df_combined_8.head()

In [None]:
# Run punctuation function on 8th part

tqdm.pandas()
df_combined_8['punctuated_transcript'] = df_combined_8['clean_transcript'].progress_apply(process_punctuation)

In [None]:
# Saved non punctuated yet

df_combined_8.to_csv() # File location was removed

In [None]:
# Read file with 9th part

df_combined_9 = pd.read_csv() # File location was removed
df_combined_9.shape

In [None]:
# Run punctuation function on 9th part
df_combined_9['punctuated_transcript'] = df_combined_9['clean_transcript'].progress_apply(process_punctuation)

In [None]:
# Saved non punctuated yet

df_combined_9.to_csv() # File location was removed

In [None]:
# Read file with 10th part

df_combined_10 = pd.read_csv() # File location was removed
df_combined_10.shape

In [None]:
# Run punctuation function on 10th part

df_combined_10['punctuated_transcript'] = df_combined_10['clean_transcript'].progress_apply(process_punctuation)

In [None]:
# Saved non punctuated yet

df_combined_10.to_csv() # File location was removed

Validating punctuation in full df

In [None]:
# Read all punctuated files

df_combined_1 = pd.read_csv() # File location was removed
df_combined_2 = pd.read_csv() # File location was removed
df_combined_3 = pd.read_csv() # File location was removed
df_combined_4 = pd.read_csv() # File location was removed
df_combined_5 = pd.read_csv() # File location was removed
df_combined_6 = pd.read_csv() # File location was removed
df_combined_7 = pd.read_csv() # File location was removed
df_combined_8 = pd.read_csv() # File location was removed
df_combined_9 = pd.read_csv() # File location was removed
df_combined_10 = pd.read_csv() # File location was removed

In [None]:
# Validate structure

print(df_combined_1.shape)
print(df_combined_2.shape)
print(df_combined_3.shape)
print(df_combined_4.shape)
print(df_combined_5.shape)
print(df_combined_6.shape)
print(df_combined_7.shape)
print(df_combined_8.shape)
print(df_combined_9.shape)
print(df_combined_10.shape)

In [None]:
df_combined_7.head(1)

In [None]:
# For files 1-7, remove Unnamed:0 (make it index)

df_combined_1.set_index('Unnamed: 0', inplace=True)
df_combined_2.set_index('Unnamed: 0', inplace=True)
df_combined_3.set_index('Unnamed: 0', inplace=True)
df_combined_4.set_index('Unnamed: 0', inplace=True)
df_combined_5.set_index('Unnamed: 0', inplace=True)
df_combined_6.set_index('Unnamed: 0', inplace=True)
df_combined_7.set_index('Unnamed: 0', inplace=True)

print(df_combined_4.shape)
df_combined_7.head(1)

In [None]:
df_combined_10.head(1)

In [None]:
# For files 8-10, remove Unnamed:0 (make it index), remove Unnamed: 0.1

df_combined_8.set_index('Unnamed: 0', inplace=True)
df_combined_9.set_index('Unnamed: 0', inplace=True)
df_combined_10.set_index('Unnamed: 0', inplace=True)

df_combined_8.drop(columns=['Unnamed: 0.1'], inplace=True)
df_combined_9.drop(columns=['Unnamed: 0.1'], inplace=True)
df_combined_10.drop(columns=['Unnamed: 0.1'], inplace=True)

print(df_combined_9.shape)
df_combined_10.head(1)

In [None]:
# Merge all files

dfs = [df_combined_1, df_combined_2, df_combined_3, df_combined_4, df_combined_5, df_combined_6, df_combined_7,
       df_combined_8, df_combined_9, df_combined_10]

df_combined = pd.concat(dfs)
df_combined.index.name = 'index'

print(df_combined.shape)
df_combined.tail()

In [None]:
# Create column with length of clean transcript

df_combined['clean_transcript_length'] = df_combined['clean_transcript'].apply(len)
df_combined.head(1)

In [None]:
# Create column with length of punctuated transcript

df_combined['punct_transcript_length'] = df_combined['punctuated_transcript'].apply(len)
df_combined.head(1)

In [None]:
# Count how many rows have a shorter punctuated transcript than the non-punctuated one
# In many cases brackets or other symbols are removed, so being shorter is not necessarily bad
# (around a 10% longer or shorter range)

count = len(df_combined[df_combined['punct_transcript_length'] < df_combined['clean_transcript_length']])
count

In [None]:
# Counting rows with over 10% difference in length between clean and punctuated transcripts

df_combined['length_difference_ratio'] = (
    df_combined['clean_transcript_length'] -
    df_combined['punct_transcript_length']) / df_combined['clean_transcript_length']

row_count = len(df_combined[df_combined['length_difference_ratio'] > 0.1])
row_count

In [None]:
# Download file to manually validate cases with big differences

df_combined.to_csv() # File location was removed

In [None]:
# From manual validation: some transcripts were being cut off in the middle of the text.
# This was due to the punctuation model's limit of 512 tokens per input.
# An overlap was added in a new version of the function to keep the tokens that fall between the 512-token limit and the 2500-character cutoff.

In [None]:
# Testing re-running punctuation on one transcript

test_transcript = df_combined.loc[9304, 'clean_transcript']
test_transcript

In [None]:
# Running punctuation with new function for large texts

print(new_process_punctuation(test_transcript))

In [None]:
# Re-running punctuation with improved large text handling in top 1000 videos with highest length ratio

top_1000 = df_combined.sort_values(by='length_difference_ratio', ascending=False).head(1000)
tqdm.pandas()
top_1000['punctuated_transcript'] = top_1000['clean_transcript'].progress_apply(new_process_punctuation)

df_combined.loc[top_1000.index, 'punctuated_transcript'] = top_1000['punctuated_transcript']

In [None]:
# Counting rows with over 10% difference in length between clean and punctuated transcripts

df_combined['punct_transcript_length'] = df_combined['punctuated_transcript'].apply(len)

df_combined['length_difference_ratio'] = (
    df_combined['clean_transcript_length'] -
    df_combined['punct_transcript_length']) / df_combined['clean_transcript_length']

row_count = len(df_combined[df_combined['length_difference_ratio'] > 0.1])
row_count

In [None]:
# Download file

df_combined.to_csv() # File location was removed

# Removing videos with other relevant topics (not crime related) found in first sample

In [None]:
# Creating list of keywords
# Clean up topics: ukraine, russia, israel, netanyahu, palestine, trump, kamala
# Keywords "israel" and "harris" are left out as they are common terms that could appear in crime related topics

other_topics_keywords = ["ucrania", "rusia", "netanyahu", "palestina", "trump", "kamala", "joe biden", "elon musk"]

In [None]:
# Define function for counting rows

def contains_keywords(text, keywords):
    """Tell whether ``text`` contains any of ``keywords`` as a whole word or phrase.

    The text is lower-cased before matching, so keywords are expected in lower case.
    Matching uses word boundaries: "trump" matches "Trump habló" but not "trumpeta".

    Args:
        text: the text to search. Non-string values (e.g. NaN for a missing transcript)
            never match.
        keywords: iterable of lower-case words/phrases. An empty collection never matches
            (an empty alternation would otherwise match at every word boundary).

    Returns:
        ``True`` if at least one keyword occurs in ``text``, else ``False``.
    """
    keywords = list(keywords)
    if not keywords or not isinstance(text, str):
        return False
    pattern = r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b'
    return bool(re.search(pattern, text.lower()))

In [None]:
# Apply function to df

count = df_combined['punctuated_transcript'].apply(lambda x: contains_keywords(x, other_topics_keywords)).sum()
count

In [None]:
# Create subset of rows with keywords, remove them from main df

print(df_combined.shape)

# The keyword regex is evaluated once per transcript and the resulting boolean mask is
# reused for both subsets (instead of scanning every transcript twice)
has_other_topic = df_combined['punctuated_transcript'].apply(
    lambda x: contains_keywords(x, other_topics_keywords)
).astype(bool)

matching_rows = df_combined[has_other_topic].copy()
df_combined_clean = df_combined[~has_other_topic].copy()

print(matching_rows.shape)
print(df_combined_clean.shape)

In [None]:
# Download subset for validation

matching_rows.to_csv() # File location was removed

In [None]:
# Download clean df to check for other topics

df_combined_clean.to_csv() # File location was removed

In [None]:
# From manual validation of 100 videos: 95% accuracy (5 unrelated videos found)
# Most of them related to the report of an international crime, including the country name in the video title

# Checking number of videos with other country names in the title

In [None]:
# Creating list of country names excluding peru

# Reading file with names list
country_names = pd.read_csv() # File location was removed

country_names['country_name'] = country_names['country_name'].str.strip().str.lower()

# Remove: Peru, Venezuela, El Salvador (peruvian districts with this name), granada (grenade), argentina (main avenue with this name)
excluded_countries = ['perú', 'venezuela', 'argentina', 'el salvador', 'granada']
country_list = country_names[~country_names['country_name'].isin(excluded_countries)]['country_name'].tolist()

country_list[0:5]

In [None]:
# Count rows in df with a country name in the title

def contains_country(text, countries):
    """Tell whether ``text`` mentions any of ``countries`` as a whole word or phrase.

    The text is lower-cased before matching, so country names are expected in lower case.

    Args:
        text: the text to search (a video title). Non-string values (e.g. NaN for a
            missing title) never match.
        countries: iterable of lower-case country names. An empty collection never matches
            (an empty alternation would otherwise match at every word boundary).

    Returns:
        ``True`` if at least one country name occurs in ``text``, else ``False``.
    """
    countries = list(countries)
    if not countries or not isinstance(text, str):
        return False
    pattern = r'\b(?:' + '|'.join(map(re.escape, countries)) + r')\b'
    return bool(re.search(pattern, text.lower()))

count = df_combined_clean['title'].apply(lambda x: contains_country(x, country_list)).sum()
count

In [None]:
# Create subset of videos with country names to validate accuracy
countries_subset = df_combined_clean[df_combined_clean['title'].apply(lambda x: contains_country(x, country_list))].copy()

# Download to validate
countries_subset.to_csv() # File location was removed

In [None]:
# From validation: excluding country names is very accurate. A small part of the found videos (6%) could be relevant to the analysis,
# but more is gained from removing the non-relevant part of the 350 videos found

# Updating dataset to remove videos with country names in title.
# The filter is applied to df_combined_clean (not df_combined), so the videos already removed
# by the other-topics keyword filter stay removed.
has_country_in_title = df_combined_clean['title'].apply(lambda x: contains_country(x, country_list)).astype(bool)
df_combined_clean = df_combined_clean[~has_country_in_title].copy()
df_combined_clean.shape

In [None]:
# Download final clean df

df_combined_clean.to_csv() # File location was removed

Validating final number of videos over time

In [None]:
df_combined_clean.head()

In [None]:
# Overall number of videos

df_combined_clean['video_date'] = pd.to_datetime(df_combined_clean['video_date'])
df_combined_clean['year_month'] = df_combined_clean['video_date'].dt.to_period('M')
monthly_counts = df_combined_clean.groupby('year_month').size()

plt.figure(figsize=(10, 6))
bars = plt.bar(monthly_counts.index.astype(str), monthly_counts, color='skyblue')

for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(height), ha='center', va='bottom', fontsize=10)

plt.title('Number of Videos Per Month', fontsize=14)
plt.xlabel('Month', fontsize=8)
plt.ylabel('Number of Videos', fontsize=8)
plt.xticks(rotation=45, fontsize = 8)
plt.tight_layout()

plt.show()

# Translations

In [None]:
df_combined_clean.head()

In [None]:
# Read df with 1000 videos that were initially translated as test

df_sample_translations = pd.read_csv() # File location was removed
df_sample_translations.head()

In [None]:
# Add translations to main df for videos that were translated in the sample

df_combined_clean = df_combined_clean.merge(
    df_sample_translations[['video_id', 'translated_transcript']],
    on='video_id',
    how='left'
)

matches_count = df_combined_clean['translated_transcript'].notna().sum()
matches_count

In [None]:
# Number of total videos

df_combined_clean.shape

Create subsets of the df to run translation without exceeding google API quota

In [None]:
# Remove videos that were already translated

translated_subset = df_combined_clean[df_combined_clean['translated_transcript'].notna()].copy()
print(translated_subset.shape)

# Remaining videos to translate
df_pending_translation = df_combined_clean[df_combined_clean['translated_transcript'].isna()].copy()
print(df_pending_translation.shape)

In [None]:
# Save already translated videos to csv
translated_subset.to_csv() # File location was removed

In [None]:
# Sort df by transcript length (to prioritize long transcripts with paid API)

df_pending_translation = df_pending_translation.sort_values(by='punct_transcript_length', ascending=False)
df_pending_translation.reset_index(drop=True, inplace=True)
df_pending_translation.head()

In [None]:
# Create subset 1: up to 11,7 million characters
subset_to_translate_1 = df_pending_translation.iloc[:1445].copy()

total_string_length = subset_to_translate_1['punctuated_transcript'].apply(len).sum()
total_string_length

In [None]:
# Save subset to csv as backup
subset_to_translate_1.to_csv() # File location was removed

In [None]:
# Update main df without first subset
df_pending_translation = df_pending_translation.drop(subset_to_translate_1.index)

print(df_pending_translation.shape)

total_string_length = df_pending_translation['punctuated_transcript'].apply(len).sum()
print(total_string_length)

In [None]:
df_pending_translation.head()

In [None]:
# Splitting remaining rows into two df with equal string length

subset_to_translate_2 = df_pending_translation.iloc[:2360].copy()

total_string_length = subset_to_translate_2['punctuated_transcript'].apply(len).sum()
total_string_length

In [None]:
# Save subset to csv as backup
subset_to_translate_2.to_csv() # File location was removed

In [None]:
# Update main df as third subset
df_pending_translation = df_pending_translation.drop(subset_to_translate_2.index)

print(df_pending_translation.shape)

total_string_length = df_pending_translation['punctuated_transcript'].apply(len).sum()
print(total_string_length)

In [None]:
# Save subset to csv as backup
subset_to_translate_3 = df_pending_translation
subset_to_translate_3.to_csv() # File location was removed

In [None]:
# Validate rows
print(df_combined_clean.shape)
print(translated_subset.shape)
print(subset_to_translate_1.shape)
print(subset_to_translate_2.shape)
print(subset_to_translate_3.shape)

Translating sample and first subset

In [None]:
# Initializing Google Translate
credentials = () # File location was removed
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials
client = translate.Client()

In [None]:
def translate_text(text, target_language="en", source_language="es"):
    """Translate ``text`` with the module-level Google Translate ``client``.

    Args:
        text: the text to translate.
        target_language: language code to translate into.
        source_language: language code of ``text``. It is forwarded to the API so the
            service does not have to auto-detect the language.

    Returns:
        The translated text. If the call fails for any reason, the error is printed and the
        original ``text`` is returned unchanged, so failed rows can be found afterwards by
        comparing the translation with the input.
    """
    try:
        translation = client.translate(
            text,
            target_language=target_language,
            source_language=source_language,
        )
        return translation["translatedText"]
    except Exception as e:
        # Best effort: keep the pipeline running and leave the row untranslated
        print(f"Error translating text: {e}")
        return text

In [None]:
# Translating the sample df with Google Translate API

tqdm.pandas()
sample_df['translated_transcript'] = sample_df['punctuated_transcript'].progress_apply(translate_text)
sample_df.head()

In [None]:
# Counting rows with errors (translation same as transcript)

row_count = len(sample_df[sample_df['translated_transcript'] == sample_df['punctuated_transcript']])
row_count

In [None]:
# Translating first subset

tqdm.pandas()
subset_to_translate_1['translated_transcript'] = subset_to_translate_1['punctuated_transcript'].progress_apply(translate_text)
subset_to_translate_1.head()

In [None]:
# Save translated subset to csv

subset_to_translate_1.to_csv() # File location was removed

In [None]:
# Counting rows with errors (translation same as transcript)

row_count = len(subset_to_translate_1[subset_to_translate_1['translated_transcript'] == subset_to_translate_1['punctuated_transcript']])
row_count

Translating second subset

In [None]:
# Initializing Google Translate with credentials

credentials = () # File location was removed
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials
client = translate.Client()

In [None]:
# Translating second subset

tqdm.pandas()
subset_to_translate_2['translated_transcript'] = subset_to_translate_2['punctuated_transcript'].progress_apply(translate_text)
subset_to_translate_2.head()

In [None]:
# Save translated subset to csv
subset_to_translate_2.to_csv() # File location was removed

In [None]:
# Counting rows with errors (translation same as transcript)

row_count = len(subset_to_translate_2[subset_to_translate_2['translated_transcript'] == subset_to_translate_2['punctuated_transcript']])
row_count

Translating third subset

In [None]:
# Initializing Google Translate with credentials

credentials = () # File location was removed
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials
client = translate.Client()

In [None]:
# Translating third subset

tqdm.pandas()
subset_to_translate_3['translated_transcript'] = subset_to_translate_3['punctuated_transcript'].progress_apply(translate_text)
subset_to_translate_3.head()

In [None]:
# Save translated subset to csv

subset_to_translate_3.to_csv() # File location was removed

In [None]:
# Counting rows with errors (translation same as transcript)

row_count = len(subset_to_translate_3[subset_to_translate_3['translated_transcript'] == subset_to_translate_3['punctuated_transcript']])
row_count

Merging full translated df

In [None]:
print(translated_subset.shape)
print(subset_to_translate_1.shape)
print(subset_to_translate_2.shape)
print(subset_to_translate_3.shape)

In [None]:
# Combining all the subsets
df_translated = pd.concat(
    [translated_subset, subset_to_translate_1, subset_to_translate_2, subset_to_translate_3],
    ignore_index=True,
)

# Shuffle the rows randomly. The seed makes the row order reproducible across runs.
df_translated = df_translated.sample(frac=1, random_state=42).reset_index(drop=True)
df_translated.shape

In [None]:
df_translated.head()

In [None]:
# Decode translation to remove symbols like in "We&#39;re going"

def decode_html_entities(text):
    """Replace HTML character references (e.g. ``&#39;``) in ``text`` with plain characters.

    Args:
        text: The translated string, or a missing value.

    Returns:
        The unescaped string. Non-string values (NaN, None) are returned
        unchanged, because ``html.unescape`` raises ``TypeError`` on them.
    """
    if not isinstance(text, str):
        return text
    return html.unescape(text)

df_translated['translated_transcript'] = df_translated['translated_transcript'].apply(decode_html_entities)
df_translated.head()

In [None]:
# Download final translated df

df_translated.to_csv() # File location was removed

# Categorizing videos mentioning Venezuelan migrants

In [None]:
# Keyword list for identifying videos that mention Venezuelan migrants based on Guevara (2024)
# 'Migrants' and 'foreigners' related keywords are left out, as they are not necessarily related to Venezuelan migrants
# Every term is listed once in each of its four forms (masculine/feminine, singular/plural).
venezuela_keywords = ["venezolano", "venezolana", "venezolanos", "venezolanas",
                      "chamo", "chama", "chamos", "chamas",
                      "veneco", "veneca", "venecos", "venecas"]

# Function to check if any keyword is in the clean_transcript
def check_keywords(text, keywords):
    """Return 1 if ``text`` contains any of ``keywords`` as a whole word, otherwise 0.

    Args:
        text: Value to search. It is converted with ``str()`` and lower-cased, so
            non-string values such as NaN are searched as their string form.
        keywords: Iterable of lower-case terms. Each term is regex-escaped and must
            be delimited by word boundaries in ``text`` to count as a match.

    Returns:
        int: 1 when at least one keyword occurs, 0 otherwise (including when
        ``keywords`` is empty).
    """
    keywords = list(keywords)
    # An empty alternation would give r'\b(?:)\b', which matches at any word boundary
    # and would flag every non-empty text.
    if not keywords:
        return 0
    pattern = r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b'
    return int(bool(re.search(pattern, str(text).lower())))

df_translated['bin_venezuelan'] = df_translated['clean_transcript'].apply(lambda x: check_keywords(x, venezuela_keywords))

df_translated.head()

In [None]:
# Checking overall number of videos than mention Venezuelan migrants

df_translated['bin_venezuelan'].value_counts()

In [None]:
# Checking number of videos over time

# Videos per month, split by bin_venezuelan. Reindexing guarantees that both columns
# (0 and 1) exist even when one value never occurs, so the two colours, the two legend
# labels and the label loop below always line up and never raise a KeyError.
grouped = (
    df_translated.groupby(['year_month', 'bin_venezuelan'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)
grouped.plot(kind='bar', stacked=True, figsize=(12, 6), color=['skyblue', 'salmon'])
plt.title('Monthly Number of Videos by Venezuelan Content', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Number of Videos', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
plt.legend(title='Venezuelan Content (bin_venezuelan)', labels=['No (0)', 'Yes (1)'], fontsize=10)
plt.tight_layout()

# Write each segment's count at the vertical centre of that segment; y_offset tracks the
# height of the stack below the current segment.
for i, (index, row) in enumerate(grouped.iterrows()):
    y_offset = 0
    for bin_value in grouped.columns:
        segment = row[bin_value]
        if segment > 0:
            plt.text(i, y_offset + segment / 2, str(segment),
                     ha='center', va='center', fontsize=10, color='black')
            y_offset += segment
plt.show()

In [None]:
# Download df with bin_venezuelan for validation

df_translated.to_csv() # File location was removed

# Removing additional unrelated topics found in transcripts

In [None]:
# Mapping additional unrelated topics that were found in the transcripts df
# All terms are lower-case because contains_keywords() lower-cases the text before matching.

# To remove from transcript: rows whose punctuated_transcript mentions any of these terms are dropped
other_topics_keywords_2 = ["ucrania", "rusia", "netanyahu", "palestina", "trump", "kamala", "joe biden", "elon musk", "highland park",
                           "robert kart", "george soros", "jlo", "jennifer lopez", "marc anthony", "daniel noboa"]

# To remove from titles: rows whose title mentions any of these terms are dropped
# NOTE(review): contains_keywords() wraps the terms in \b...\b, so "ee.uu." can only match
# when a word character follows the final dot; any such title is already matched by "ee.uu".
other_topics_title = ["eeuu", "ee.uu", "ee.uu.", "ucraniano", "daniel noboa"]

In [None]:
# Define function for counting rows

def contains_keywords(text, keywords):
    """Return True if ``text`` contains any of ``keywords`` as a whole word.

    Args:
        text: String to search; it is lower-cased before matching. Non-string
            values (NaN, None) are treated as containing no keyword instead of
            raising ``AttributeError`` on ``.lower()``.
        keywords: Iterable of lower-case terms. Each term is regex-escaped and must
            be delimited by word boundaries in ``text`` to count as a match.

    Returns:
        bool: True when at least one keyword occurs, False otherwise (including
        when ``keywords`` is empty).
    """
    if not isinstance(text, str):
        return False
    keywords = list(keywords)
    # An empty alternation would give r'\b(?:)\b', which matches at any word boundary.
    if not keywords:
        return False
    pattern = r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b'
    return bool(re.search(pattern, text.lower()))

In [None]:
# Apply function to df

count = df_translated['punctuated_transcript'].apply(lambda x: contains_keywords(x, other_topics_keywords_2)).sum()
count

In [None]:
# Drop every row whose transcript mentions one of the unrelated topics
# (shape is printed before and after to show how many rows were removed)

print(df_translated.shape)

mentions_other_topic = df_translated['punctuated_transcript'].apply(
    contains_keywords, keywords=other_topics_keywords_2
)
df_translated = df_translated[~mentions_other_topic].copy()

print(df_translated.shape)

In [None]:
# Apply function to video titles

count = df_translated['title'].apply(lambda x: contains_keywords(x, other_topics_title)).sum()
count

In [None]:
# Drop every row whose title mentions one of the unrelated title keywords
# (shape is printed before and after to show how many rows were removed)

print(df_translated.shape)

title_mentions_other_topic = df_translated['title'].apply(
    contains_keywords, keywords=other_topics_title
)
df_translated = df_translated[~title_mentions_other_topic].copy()

print(df_translated.shape)

In [None]:
# Check number of videos over time

# Videos per month, split by bin_venezuelan. Reindexing guarantees that both columns
# (0 and 1) exist, so the two colours and the two legend labels always line up.
grouped = (
    df_translated.groupby(['year_month', 'bin_venezuelan'])
    .size()
    .unstack(['bin_venezuelan'], fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)

grouped.plot(kind='bar', stacked=True, figsize=(12, 6), color=['skyblue', 'salmon'])
plt.title('Monthly Number of Videos by Venezuelan Content', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Number of Videos', fontsize=14)
plt.xticks(rotation=45, fontsize=10)
# Both label strings were missing their closing quote, which made the cell a SyntaxError.
plt.legend(title='Content Type', labels=[
    'Venezuelan: No (0)',
    'Venezuelan: Yes (1)'
], fontsize=10)
plt.tight_layout()

# Add data labels: each segment's count is written at the vertical centre of that segment;
# y_offset tracks the height of the stack below the current segment.
for i, (index, row) in enumerate(grouped.iterrows()):
    y_offset = 0
    for bin_label in row.index:
        if row[bin_label] > 0:
            plt.text(i, y_offset + row[bin_label] / 2, str(row[bin_label]),
                     ha='center', va='center', fontsize=10, color='black')
            y_offset += row[bin_label]

plt.show()

In [None]:
# Keep only the columns needed downstream and export them to csv

download_columns = [
    'video_id', 'title', 'url', 'channel', 'video_date',
    'punctuated_transcript', 'punct_transcript_length',
    'year_month', 'translated_transcript', 'bin_venezuelan',
    'bin_foreigner',
]
df_translated_download = df_translated[download_columns]

df_translated_download.to_csv() # File location was removed

# Translating the titles

In [None]:
df_translated = pd.read_csv() # File location was removed
df_translated.shape

In [None]:
# Initialize the translator

translator = Translator()

def translate_short_text(text, src='es', dest='en'):
    """Translate a short string with the module-level ``translator``.

    Args:
        text: Text to translate; empty or missing values are returned as they are.
        src: Source language code passed to the translator.
        dest: Target language code passed to the translator.

    Returns:
        The translated text, or the original ``text`` when the input is empty or
        missing, the translator yields no text, or any exception is raised (the
        error is printed in that case).
    """
    try:
        is_missing = not text or pd.isna(text)
        if is_missing:
            return text
        result = translator.translate(text, src=src, dest=dest)
        translated = result.text if result else None
        return translated if translated else text
    except Exception as e:
        # Best effort: report the problem and fall back to the untranslated text.
        print(f"Error translating '{text}': {e}")
        return text

In [None]:
translate_short_text("hola mundo")

In [None]:
df_translated.head()

In [None]:
# Translating titles

def _translate_title_with_pause(title):
    """Sleep one second, then translate ``title``."""
    # presumably the pause is there to avoid rate limiting by the translation service
    time.sleep(1)
    return translate_short_text(title)

tqdm.pandas()
df_translated['translated_title'] = df_translated['title'].progress_apply(_translate_title_with_pause)

df_translated.head()

In [None]:
# Count rows whose translated title is identical to the original title (treated as errors)

unchanged_titles = df_translated['translated_title'] == df_translated['title']
row_count = int(unchanged_titles.sum())
row_count

In [None]:
# Clean up html in titles
# Decode translation to remove symbols like in "We&#39;re going"

def decode_html_entities(text):
    """Replace HTML character references (e.g. ``&#39;``) in ``text`` with plain characters.

    Args:
        text: The translated title, or a missing value.

    Returns:
        The unescaped string. Non-string values are returned unchanged:
        translate_short_text() passes missing titles (NaN/None) through as they
        are, and ``html.unescape`` raises ``TypeError`` on them.
    """
    if not isinstance(text, str):
        return text
    return html.unescape(text)

df_translated['translated_title'] = df_translated['translated_title'].apply(decode_html_entities)
df_translated.head()

In [None]:
# Select the columns to export, now including the translated title

export_columns = [
    'video_id', 'title', 'url', 'channel', 'video_date',
    'punctuated_transcript', 'punct_transcript_length',
    'year_month', 'translated_transcript', 'bin_venezuelan',
    'bin_foreigner', 'translated_title',
]
df_translated_download = df_translated[export_columns]

In [None]:
df_translated_download.to_csv() # File location was removed

# Further data cleaning (removing non-relevant sentences)

In [None]:
df_translated = pd.read_csv() # File location was removed
df_translated.shape

In [None]:
# Function for removing non relevant sentences

# Lazily filled caches: the models are loaded and the example sentences embedded once
# per kernel instead of once per transcript.
_RELEVANCE_MODELS = {}
_INTERACTION_EMBEDDINGS = {}


def _get_relevance_models():
    """Load the spaCy pipeline and the sentence embedder on first use, then reuse them."""
    if not _RELEVANCE_MODELS:
        _RELEVANCE_MODELS["nlp"] = spacy.load("en_core_web_sm")
        _RELEVANCE_MODELS["embedder"] = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    return _RELEVANCE_MODELS["nlp"], _RELEVANCE_MODELS["embedder"]


def _get_interaction_embeddings(embedder, interaction_examples):
    """Embed the example sentences once per distinct list of examples."""
    key = tuple(interaction_examples)
    if key not in _INTERACTION_EMBEDDINGS:
        _INTERACTION_EMBEDDINGS[key] = embedder.encode(list(key))
    return _INTERACTION_EMBEDDINGS[key]


def remove_non_relevant_sentences(transcripts, interaction_examples, similarity_threshold=0.5):
    """Drop short, content-poor and interaction-like sentences from one transcript.

    A sentence is kept only if it
      * has at least 5 characters and 2 whitespace-separated words,
      * contains at least two tokens tagged NOUN, VERB or ADJ, and
      * has a maximum similarity to the interaction examples below
        ``similarity_threshold``.

    Args:
        transcripts: Text of a single transcript. Non-string values (e.g. NaN)
            and blank strings yield an empty result.
        interaction_examples: Sentences that exemplify non-relevant live
            interaction. If empty, the similarity filter is skipped.
        similarity_threshold: Sentences whose maximum similarity to the examples
            is greater than or equal to this value are removed. Defaults to 0.5.

    Returns:
        str: The kept sentences joined with single spaces ("" if none are kept).
    """
    # Guard first so missing values never reach the (expensive) models.
    if not isinstance(transcripts, str) or not transcripts.strip():
        return ""

    nlp, embedder = _get_relevance_models()
    interaction_embeddings = (
        _get_interaction_embeddings(embedder, interaction_examples)
        if interaction_examples else None
    )

    filtered_sentences = []

    for sentence in nlp(transcripts).sents:
        sentence_text = sentence.text.strip()

        # Remove very short sentences, probably not meaningful
        if len(sentence_text) < 5 or len(sentence_text.split()) < 2:
            continue

        # Check Part of Speech to keep only meaningful content (at least two meaningful tokens)
        content_tokens = [token for token in sentence if token.pos_ in {"NOUN", "VERB", "ADJ"}]
        if len(content_tokens) < 2:
            continue

        # Skip sentences that are too similar to any interaction example
        if interaction_embeddings is not None:
            sentence_embedding = embedder.encode([sentence_text])[0]
            similarities = embedder.similarity(interaction_embeddings, sentence_embedding)
            if float(similarities.max()) >= similarity_threshold:
                continue

        filtered_sentences.append(sentence.text)

    return " ".join(filtered_sentences)

In [None]:
# Create list of sentences that contain non relevant live interaction examples

interaction_examples = [
    "Good morning, thank you.",
    "Good morning",
    "What's this about?",
    "Go ahead",
    "How are you?",
    "Thanks for the information.",
    "Perfect, okay, very good.",
    "Let's see",
    "7:30 am",
    "That's right"
]

In [None]:
# Test with one transcript

transcript = # Transcript content was removed

remove_non_relevant_sentences(transcript, interaction_examples)

In [None]:
transcript

In [None]:
# Running function on full df, saving as it progresses to save data in case of crash

output_file = # File location was removed

# Initialize the output file
# The header is written only when the file does not exist yet, so re-running this cell
# never overwrites rows saved by an earlier run.

if not os.path.exists(output_file):
    with open(output_file, mode="w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["translated_transcript", "filtered_transcript"])

# Process the transcripts saving after each row

tqdm.pandas()
with open(output_file, mode="a", encoding="utf-8-sig", newline="") as f:
    writer = csv.writer(f)
    for index, row in tqdm(df_translated.iterrows(), total=len(df_translated)):
        filtered_transcript = remove_non_relevant_sentences(
            row["translated_transcript"], interaction_examples)
        writer.writerow([row["translated_transcript"], filtered_transcript])
        # Flush after every row; otherwise finished rows stay in Python's write buffer
        # and are lost if the kernel crashes.
        f.flush()

In [None]:
# Function to continue in case it crashes

output_file = # File location was removed

# Collect the transcripts that already have a row in the output file (none if it is missing)
processed_transcripts = set()
if os.path.exists(output_file):
    df_processed = pd.read_csv(output_file, encoding="utf-8-sig")
    processed_transcripts.update(df_processed["translated_transcript"])

# Keep only the rows that still need processing
already_done = df_translated["translated_transcript"].isin(processed_transcripts)
df_unprocessed = df_translated[~already_done]

df_unprocessed.shape

In [None]:
# Process remaining rows

tqdm.pandas()
# csv.writer (with newline="") escapes quotes, commas and line breaks inside the
# transcripts. A hand-built '"...","..."' line does not, and produces malformed rows
# whenever a transcript contains a double quote.
with open(output_file, "a", encoding="utf-8-sig", newline="") as f:
    writer = csv.writer(f)
    for index, row in tqdm(df_unprocessed.iterrows(), total=len(df_unprocessed)):
        filtered_transcript = remove_non_relevant_sentences(
            row["translated_transcript"], interaction_examples)

        writer.writerow([row["translated_transcript"], filtered_transcript])
        # Flush after every row so a crash loses at most the row in progress.
        f.flush()

In [None]:
# Add the filtered transcripts to the main df

processed_file = # File location was removed

# Load the incrementally written results (translated_transcript, filtered_transcript).
df_processed = pd.read_csv(
    processed_file,
    encoding="utf-8-sig",
    delimiter=",",
    quotechar='"',
    # Malformed lines are dropped silently. Transcripts lost here get a missing
    # filtered_transcript in the later left merge and are re-processed afterwards.
    on_bad_lines="skip"
)

df_processed.shape

In [None]:
df_processed.head()

In [None]:
# The results file is append-only, so a transcript processed in more than one run appears
# several times. Keep one row per transcript; otherwise the left merge would duplicate
# the matching videos in df_merged.
df_processed_unique = df_processed.drop_duplicates(subset="translated_transcript")

# Left merge keeps every video; videos without a processed row get NaN in filtered_transcript.
df_merged = pd.merge(df_translated, df_processed_unique, on="translated_transcript", how="left")

df_merged.to_csv() # File location was removed

df_merged.head()

In [None]:
df_merged['filtered_transcript'].isna().sum()

In [None]:
# Re-run the cleaning function for rows whose filtered transcript is missing or empty

def process_empty_filtered_transcripts(row):
    """Return the row's filtered transcript, computing it only when it is missing.

    Rows with a non-empty ``filtered_transcript`` are passed through unchanged;
    otherwise ``translated_transcript`` is cleaned with
    ``remove_non_relevant_sentences`` and the global ``interaction_examples``.
    """
    existing = row['filtered_transcript']
    needs_processing = pd.isna(existing) or existing == ""
    if not needs_processing:
        return existing
    return remove_non_relevant_sentences(row['translated_transcript'], interaction_examples)

df_merged['filtered_transcript'] = df_merged.progress_apply(process_empty_filtered_transcripts, axis=1)

df_merged['filtered_transcript'].isna().sum()

In [None]:
df_merged.to_csv() # File location was removed

# Topic modeling with BERTopic

In [None]:
df_translated = pd.read_csv() # File location was removed
df_translated.shape

In [None]:
df_translated.head()

Pre-save embeddings with a larger sentence embedding model

In [None]:
# Embed the transcripts with large model

embedding_model = SentenceTransformer("paraphrase-MiniLM-L12-v2")
# A transcript whose sentences were all filtered out is stored as "" and comes back
# from the CSV as NaN. Convert those to empty strings so encode() receives only str.
documents = df_translated["filtered_transcript"].fillna("").astype(str).tolist()
embeddings = embedding_model.encode(documents, show_progress_bar=True)

In [None]:
# Save embeddings

embedding_df = pd.DataFrame(embeddings)
embedding_df.to_csv() # File location was removed

In [None]:
# Read pre saved embeddings

embedding_df = pd.read_csv() # File location was removed
embedding_df.head()

In [None]:
embedding_df.shape

Optimizing number of clusters for k-means: elbow method

In [None]:
# Transform embeddings into np array
# NOTE(review): if the embeddings CSV was written with its index, embedding_df holds an
# extra "Unnamed: 0" column that would be used as a feature here — confirm.
embeddings = embedding_df.to_numpy()

# Candidate numbers of clusters (2 to 25)
cluster_range = range(2, 26)

# Calculate within cluster sum of squares (WCSS)
# random_state is fixed (as in the silhouette analysis) so the curve is reproducible;
# without it every run starts K-Means from different centres.
wcss = []
for n in cluster_range:
    kmeans = KMeans(n_clusters=n, random_state=42)
    kmeans.fit(embeddings)
    wcss.append(kmeans.inertia_)

# Show results as plot
plt.plot(cluster_range, wcss, marker='o')
plt.xlabel("Number of Topics")
plt.ylabel("WCSS")
plt.title("Elbow Method to Optimize Number of Topics")
plt.show()

Optimizing number of clusters for k-means: silhouette score

In [None]:
# Silhouette score for every candidate number of clusters (2 to 25), fixed seed

candidate_ks = list(range(2, 26, 1))
silhouette_scores = [
    silhouette_score(
        embeddings,
        KMeans(n_clusters=k, random_state=42).fit_predict(embeddings),
    )
    for k in candidate_ks
]

# Plot score against number of clusters
plt.plot(candidate_ks, silhouette_scores, marker='o')
plt.xlabel("Number of Topics")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score to Optimize Number of Topics")
plt.show()

Clustering with k-means and pre saved embeddings

In [None]:
embeddings.shape

In [None]:
# Clustering step: K-Means with 11 clusters, k-means++ initialisation and a fixed seed
kmeans_model = KMeans(n_clusters=11, init='k-means++', random_state=42)

# Topic representation based on part-of-speech patterns (spaCy en_core_web_sm)
representation_model = PartOfSpeech("en_core_web_sm")

# Dimensionality reduction applied to the embeddings before clustering
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42)

# hdbscan_model is BERTopic's slot for the clustering step; K-Means is supplied there
# so it is used instead of the default HDBSCAN.
kmeans_BERT = BERTopic(
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    representation_model=representation_model,
)

In [None]:
# Pre-computed document embeddings, one row per transcript
embeddings = embedding_df.to_numpy()

# Empty filtered transcripts are read back from the CSV as NaN, and BERTopic accepts
# only strings as documents; convert them to "" before fitting.
documents = df_translated["filtered_transcript"].fillna("").astype(str).tolist()

# Passing the embeddings makes BERTopic skip its own embedding step.
topics3, probs3 = kmeans_BERT.fit_transform(documents, embeddings)

In [None]:
kmeans_BERT.get_topic_info()

In [None]:
kmeans_model.cluster_centers_

In [None]:
# Since k-means doesnt calculate probabilities, calculating document distance to cluster centers
cluster_centers = kmeans_model.cluster_centers_

# Reduce embeddings to have the same shape as cluster centers.
# transform() reuses the UMAP model that BERTopic already fitted, i.e. the exact space the
# K-Means centres were computed in. Calling fit_transform() again would refit the model
# (slow) and depend on the refit reproducing that space.
reduced_embeddings = umap_model.transform(embeddings)

# Calculate distances to all cluster centers for each video
cosine_distance_matrix = cosine_distances(reduced_embeddings, cluster_centers)

# Calculate Euclidean distances to all cluster centers
# Broadcasting (n_docs, 1, dim) - (1, n_clusters, dim) gives one difference vector per
# document/centre pair; the norm over the last axis is the distance.
euclidean_distance_matrix = np.linalg.norm(
    reduced_embeddings[:, np.newaxis, :] - cluster_centers[np.newaxis, :, :], axis=2
)

In [None]:
# Number of documents K-Means assigned to each cluster
cluster_labels = kmeans_model.labels_
cluster_counts = pd.Series(cluster_labels).value_counts()

# Heading and counts on separate lines
print("K-Means Cluster Counts:", cluster_counts, sep="\n")

In [None]:
# Number of documents BERTopic assigned to each topic
filtered_docs = df_translated["filtered_transcript"].tolist()
document_info = kmeans_BERT.get_document_info(filtered_docs)
topic_counts = document_info["Topic"].value_counts()

# Heading and counts on separate lines
print("BERTopic Topic Counts:", topic_counts, sep="\n")

In [None]:
# BERTopic renumbers the topics by document count, so K-Means cluster ids and BERTopic
# topic ids differ. The hand-written mapping below translates cluster ids into topic ids.
# NOTE(review): the mapping appears to belong to one specific clustering run — re-verify
# it against the cluster/topic counts after any refit.
manual_mapping = {
    "Cluster_0": "Topic_7",
    "Cluster_1": "Topic_1",
    "Cluster_2": "Topic_6",
    "Cluster_3": "Topic_0",
    "Cluster_4": "Topic_8",
    "Cluster_5": "Topic_5",
    "Cluster_6": "Topic_2",
    "Cluster_7": "Topic_10",
    "Cluster_8": "Topic_3",
    "Cluster_9": "Topic_4",
    "Cluster_10": "Topic_9"
}

# One distance column per cluster centre, named after the topic that centre maps to
n_clusters = cluster_centers.shape[0]
topic_columns = [manual_mapping[f"Cluster_{i}"] for i in range(n_clusters)]
distances_df = pd.DataFrame(euclidean_distance_matrix, columns=topic_columns)

# Attach the video id, the assigned topic and the text of every document
distances_df = distances_df.assign(
    video_id=df_translated['video_id'],
    assigned_topics=topics3,
    filtered_transcript=df_translated['filtered_transcript'],
)

distances_df.head()

In [None]:
# Attach each assigned topic's representation (its descriptive words) to guide reading

representations_df = pd.DataFrame(kmeans_BERT.get_topic_info())
topic_words = representations_df[['Topic', 'Representation']]

# Left merge on the assigned topic; the helper 'Topic' key column is dropped afterwards
distances_df = (
    distances_df
    .merge(topic_words, how='left', left_on='assigned_topics', right_on='Topic')
    .drop(columns='Topic')
)
distances_df.head()

In [None]:
# Save df as csv

distances_df.to_csv() # File location was removed

# Embedding visualizations to understand overlap of clusters

In [None]:
embedding_df.head()

In [None]:
embedding_df.shape

In [None]:
# Getting additional data fields: video_id, translated_transcript, assigned topic

distances_df = pd.read_csv() # File location was removed
distances_df.head()

In [None]:
distances_df.shape

In [None]:
# Merge columns

embedding_df = embedding_df.join(distances_df[['video_id', 'assigned_topics', 'filtered_transcript']])
embedding_df.head()

Visualization tests: UMAP

In [None]:
# Extract embeddings
# The embedding columns are named "0".."383" because the frame was read back from CSV.
embedding_cols = [str(i) for i in range(384)]
embeddings = embedding_df[embedding_cols].values

# Extract topic labels
topic_labels = embedding_df["assigned_topics"]

# Manually define topic names for the legend
topic_dict = {
    0: "0: Armed theft to businesses captured on security cameras",
    1: "1: Crime stories through interviews with victims' families",
    2: "2: Motorcycle crimes and witness involvement",
    3: "3: Neighborhood crime stories and self-organized measures",
    4: "4: Police intervention and gang capture cases",
    5: "5: Crimes linked to politics and public protests",
    6: "6: Extortion crimes and violent retaliation",
    7: "7: Robberies executed by organized crime groups",
    8: "8: Kidnapping crimes and police intervention",
    9: "9: Cybertheft through stolen devices",
    10: "10: Crimes involving dogs as victims, perpetrators or protectors"
}

# Apply UMAP with BERTopic parameters (projected to 2 dimensions for plotting).
# random_state is fixed, matching the seed of the clustering UMAP model, so the figure
# is identical on every run; UMAP is stochastic without it.
umap_reducer = umap.UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42
)
embeddings_umap = umap_reducer.fit_transform(embeddings)

# Plot
plt.figure(figsize=(12, 8))

scatter = plt.scatter(
    embeddings_umap[:, 0], embeddings_umap[:, 1],
    c=topic_labels, cmap="tab20", s=60, edgecolors="k", alpha=0.7
)

# Build one legend entry per topic, coloured with the same colormap and normalisation
# the scatter plot used for that topic id.
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=scatter.cmap(scatter.norm(i)), markersize=10)
           for i in sorted(topic_dict.keys())]
plt.legend(handles, [topic_dict[i] for i in sorted(topic_dict.keys())], title="Crime Categories", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.title("UMAP Visualization of Crime Embeddings")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.grid(True, linestyle="--", alpha=0.3)

plt.show()