# Installations and imports

In [None]:
!pip install pysentimiento

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
from pysentimiento import create_analyzer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForSequenceClassification
import torch
from bs4 import BeautifulSoup
import html
import re
import unicodedata
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import fisher_exact
import seaborn as sns

# Read comments file and pre process

In [None]:
# Load the raw comments table. The CSV location was stripped from this
# notebook, so a path has to be supplied to pd.read_csv before this cell runs.
df = pd.read_csv() # File location was removed
df.head()

In [None]:
df.shape

In [None]:
# Handling empty comments
#
# Missing comments become empty strings, surrounding whitespace is trimmed,
# and rows whose comment is empty after trimming are dropped.

df['comment'] = df['comment'].fillna("")
df['comment'] = df['comment'].apply(lambda x: x.strip())

# .copy() makes the filtered frame independent of the original one, so the
# column assignments in the cleaning cells that follow operate on a real frame
# instead of a slice (which raises SettingWithCopyWarning).
df = df[df['comment'] != ""].copy()

Checking length of comments

In [None]:
# Count comments with more than 512 tokens
# NOTE(review): `tokenizer` is first assigned further down, in the
# "Checking tokenized size" section; on a fresh kernel that cell has to run
# before this one.

token_lengths = df['comment'].map(lambda text: len(tokenizer.tokenize(text)))
long_comments = (token_lengths > 512).sum()

print(f"Number of comments exceeding 512 tokens: {long_comments}")

In [None]:
# Share of comments above the 512-token limit. Both figures are hard-coded;
# presumably 720 is the count printed above and 1075103 the total number of
# comments -- TODO confirm against the current data.
720/1075103

In [None]:
# Leaving out comments with over 512 tokens
# NOTE(review): `tokenizer` is first assigned further down, in the
# "Checking tokenized size" section; on a fresh kernel that cell has to run
# before this one.

# .copy() detaches the result from the frame it was filtered from, so the
# `df['comment'] = ...` assignments in the cleaning cells below do not write
# to a slice (SettingWithCopyWarning).
df = df[df['comment'].apply(lambda x: len(tokenizer.tokenize(x)) <= 512)].copy()
df.shape

Clean comments to remove HTML markup

In [None]:
def clean_comment(comment):
    """Return `comment` as plain text: markup stripped, HTML entities decoded."""
    soup = BeautifulSoup(comment, "html.parser")
    # get_text() drops the tags; unescape() then decodes any entities that
    # are still present in the extracted text.
    return html.unescape(soup.get_text())

df['comment'] = df['comment'].map(clean_comment)

Remove mentions of other usernames

In [None]:
def remove_mentions(comment):
    """Delete every token that starts with "@@", up to the next whitespace.

    Tokens with a single "@" are left untouched; the whitespace around a
    removed mention is kept.
    """
    mention_pattern = re.compile(r"@@\S+")
    return mention_pattern.sub("", comment)

df['comment'] = df['comment'].apply(remove_mentions)

Further cleaning on the encoding

In [None]:
def normalize_text(comment):
    """Reduce `comment` to plain ASCII.

    The text is NFKD-decomposed, then every character that cannot be encoded
    as ASCII is dropped. Accented letters therefore lose their accent
    ("ñ" becomes "n") and symbols such as emoji disappear entirely.
    """
    decomposed = unicodedata.normalize("NFKD", comment)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("ascii")

df['comment'] = df['comment'].apply(normalize_text)

Cleaning whitespaces

In [None]:
def clean_whitespace(comment):
    """Collapse each run of whitespace to one space and trim both ends."""
    pieces = comment.split()  # split() with no argument discards empty pieces
    return " ".join(pieces)

df['comment'] = df['comment'].apply(clean_whitespace)

Checking tokenized size

In [None]:
# Load the RoBERTuito tokenizer and model
# Both objects come from the same checkpoint; the cells below only call the
# tokenizer (to count tokens per comment).

robertuito_checkpoint = "pysentimiento/robertuito-base-cased"
tokenizer = AutoTokenizer.from_pretrained(robertuito_checkpoint)
model = AutoModel.from_pretrained(robertuito_checkpoint)

In [None]:
# Number of tokens the current `tokenizer` produces for each cleaned comment
df['token_count'] = df['comment'].map(lambda text: len(tokenizer.tokenize(text)))

In [None]:
df.head()

In [None]:
# Check number of comments over max length
# (rebinds `long_comments` to the matching rows)
over_limit = df['token_count'].gt(512)
long_comments = df.loc[over_limit]
long_comments.shape

In [None]:
long_comments.head()

In [None]:
# Filter out comments with high token counts
# Keeps only the rows whose token_count is at most 512.

df = df.loc[df['token_count'].le(512)]

# Sentiment analysis with BETO

In [None]:
# Load BETO
# Note: this rebinds `tokenizer` and `model`, replacing the objects loaded
# earlier for token counting.
model_name = "finiteautomata/beto-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Create a sentiment analysis pipeline
sentiment_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Analyze a test sample: one sentence per expected sentiment
texts = [
    "Estoy muy feliz con el servicio, todo fue excelente.",
    "Este producto es terrible, no lo recomiendo.",
    "La experiencia fue neutral, ni buena ni mala."
]

results = sentiment_pipeline(texts)

# Print each sentence next to its predicted label and confidence
for sample_text, prediction in zip(texts, results):
    label, confidence = prediction['label'], prediction['score']
    print(f"Text: {sample_text}")
    print(f"Label: {label}, Confidence: {confidence:.4f}")
    print("---")

Testing on sample df

In [None]:
# Score a sample of comments and keep the score of every sentiment class.
# NOTE(review): `sample_df` is not created anywhere in the visible notebook;
# it has to be defined before this cell can run.
probabilities = []
labels = []

for comment in tqdm(sample_df["comment"], desc="Analyzing Sentiment"):
    try:
        # top_k=None asks the pipeline for the score of every class. With the
        # default call only the top label is returned, so the other class
        # columns ended up as 0 (from fillna) instead of their real scores.
        result = sentiment_pipeline(comment, top_k=None)

        # Some transformers versions wrap the scores of a single input in an
        # extra list; unwrap it so both shapes are handled the same way.
        if isinstance(result, list) and result and isinstance(result[0], list):
            result = result[0]

        if isinstance(result, list) and result and isinstance(result[0], dict):
            prob_dict = {entry['label']: entry['score'] for entry in result}
            probabilities.append(prob_dict)
            # The assigned label is the class with the highest score
            assigned_label = max(prob_dict, key=prob_dict.get)
            labels.append(assigned_label)
        else:
            # Unexpected output shape: keep the row, but without scores
            probabilities.append({})
            labels.append(None)
    except Exception as e:
        # Best effort: report the failing comment and keep the rows aligned
        print(f"Error processing comment: {comment}\nError: {e}")
        probabilities.append({})
        labels.append(None)

# Create probabilities df (one column per label; rows without scores become 0)
probabilities_df = pd.DataFrame(probabilities, index=sample_df.index[:len(probabilities)]).fillna(0)
sample_df = pd.concat([sample_df.iloc[:len(probabilities)], probabilities_df], axis=1)

# Add assigned sentiment labels
sample_df["assigned_sentiment"] = labels
sample_df.head()

In [None]:
sample_df.shape

Running it on full df

In [None]:
# Saving path for partial results

save_path =  # File location was removed

# Check if there's an existing file
if os.path.exists(save_path):
    partial_df = pd.read_csv(save_path)
    last_processed_idx = len(partial_df)
    probabilities = partial_df.iloc[:, len(df.columns):].to_dict(orient="records")
    labels = partial_df["assigned_sentiment"].tolist()
    print(f"Resuming from index {last_processed_idx}.")
else:
    last_processed_idx = 0
    probabilities = []
    labels = []
    print("Starting from the beginning")

# Process remaining rows
for idx, comment in enumerate(tqdm(df["comment"][last_processed_idx:], desc="Analyzing Sentiment"), start=last_processed_idx):
    try:
        result = sentiment_pipeline(comment)

        if isinstance(result, list) and isinstance(result[0], dict):
            prob_dict = {entry['label']: entry['score'] for entry in result}
            probabilities.append(prob_dict)
            assigned_label = max(prob_dict, key=prob_dict.get)
            labels.append(assigned_label)
        else:
            probabilities.append({})
            labels.append(None)
    except Exception as e:
        print(f"Error processing comment at index {idx}: {comment}\nError: {e}")
        probabilities.append({})
        labels.append(None)

    # Save progress every 1000 rows
    if (idx + 1) % 1000 == 0 or (idx + 1) == len(df):
        probabilities_df = pd.DataFrame(probabilities, index=df.index[:len(probabilities)]).fillna(0)

        partial_df = pd.concat([df.iloc[:len(probabilities)], probabilities_df], axis=1)
        partial_df["assigned_sentiment"] = labels

        partial_df.to_csv(save_path, index=False)
        print(f"Progress saved after {idx + 1} comments.")

# Final save
final_save_path =  # File location was removed
probabilities_df = pd.DataFrame(probabilities, index=df.index[:len(probabilities)]).fillna(0)
final_df = pd.concat([df.iloc[:len(probabilities)], probabilities_df], axis=1)
final_df["assigned_sentiment"] = labels
final_df.to_csv(final_save_path, index=False)

# Comments analysis

Read previous dataframes

In [None]:
# Comment-level table; the cells below read its video_id, comment_id and
# assigned_sentiment columns. The CSV path was stripped from this notebook.
comments_df = pd.read_csv() # File location was removed
comments_df.head()

In [None]:
# Video-level table; the cells below read its video_id, assigned_topics and
# filtered_transcript columns. The CSV path was stripped from this notebook.
videos_df = pd.read_csv() # File location was removed
videos_df.head()

In [None]:
# Additional video info; the cells below read its video_id, video_date,
# channel and bin_venezuelan columns. The CSV path was stripped from this notebook.
df_translated = pd.read_csv() # File location was removed
df_translated.head()

Create new df at video level with grouped comments variables

In [None]:
# Start the video-level frame from the id, topic and transcript columns
video_columns = ['video_id', 'assigned_topics', 'filtered_transcript']
new_df = videos_df.loc[:, video_columns]
new_df.head()

In [None]:
# Merge with additional video info columns
# Left join on video_id, so every row of new_df is kept.

video_info = df_translated[['video_id', 'video_date', 'channel', 'bin_venezuelan']]
new_df = pd.merge(new_df, video_info, on='video_id', how='left')
new_df.head()

Create comment variables grouped at video level

In [None]:
# Count comments per video
comments_by_video = comments_df.groupby('video_id')['comment_id']
video_counts = comments_by_video.count().rename('total_comments')

# Count sentiment occurrences per video (one column per sentiment label)
sentiment_counts = (
    comments_df
    .groupby(['video_id', 'assigned_sentiment'])['comment_id']
    .count()
    .unstack(fill_value=0)
)

# Calculate the share of each sentiment type, then rename the label columns
sentiment_percentages = sentiment_counts.div(video_counts, axis=0).rename(
    columns={'POS': 'pos_percent', 'NEG': 'neg_percent', 'NEU': 'neu_percent'}
)

# Attach the comment totals and turn video_id back into a regular column
grouped_df = sentiment_percentages.merge(video_counts, on='video_id').reset_index()
grouped_df

In [None]:
# Merge comment variables into main df
# Left join: videos without a row in grouped_df keep NaN in the new columns.

new_df = pd.merge(new_df, grouped_df, on='video_id', how='left')
new_df.head()

In [None]:
# Fill in total comments with NaN = 0
# Videos without a match in the left merge above have NaN in total_comments.

# Plain reassignment instead of `fillna(..., inplace=True)` on a column
# selection: pandas warns about that chained in-place form, and under
# Copy-on-Write it no longer updates new_df.
new_df['total_comments'] = new_df['total_comments'].fillna(0)
new_df.head()

In [None]:
# Turning topics into binary columns
# One 0/1 column per distinct assigned_topics value, named "topic_<value>".

encoded_topics = pd.get_dummies(new_df['assigned_topics'], prefix='topic').astype(int)

# Append the indicator columns next to the existing ones
new_df = pd.concat([new_df, encoded_topics], axis='columns')
new_df.head()

In [None]:
# Create binary variables for time frames

# Convert video_date to datetime format
new_df['video_date'] = pd.to_datetime(new_df['video_date'])

# Create binary columns for each time group.
# The flags are derived from the calendar year rather than from comparisons
# with 'YYYY-12-31': such a string is parsed as midnight, so a video
# published later on 31 December matched no period at all.
# Rows with a missing date get 0 in every column.
video_year = new_df['video_date'].dt.year
new_df['period_2019_2021'] = (video_year <= 2021).astype(int)
new_df['period_2022'] = (video_year == 2022).astype(int)
new_df['period_2023'] = (video_year == 2023).astype(int)
new_df['period_2024'] = (video_year == 2024).astype(int)

new_df.head()

# Checking overall comment metrics

In [None]:
# Average number of comments per topic

avg_comments = new_df.groupby('assigned_topics')['total_comments'].mean()

# Plot bar chart
fig, ax = plt.subplots(figsize=(10, 3))
avg_comments.plot(kind='bar', color='#3498db', ax=ax, width=0.7)  # Soft blue color

# Add labels: bars sit at positions 0..n-1, in the order of avg_comments
for position, mean_value in enumerate(avg_comments):
    ax.text(
        position, mean_value, f"{mean_value:.1f}",
        ha='center', va='bottom', fontsize=10, fontweight='bold', color='black',
    )

ax.set(
    xlabel="Topic",
    ylabel="Average Number of Comments",
    title="Average Comments per Topic",
)
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# Comments per topic and time period

time_periods = ['period_2019_2021', 'period_2022', 'period_2023', 'period_2024']
period_flags = new_df[time_periods]

# idxmax returns the first column for a row of all zeros, so a video that
# belongs to no period would silently be labelled 'period_2019_2021'.
# Such rows get NaN instead and are left out of the grouped statistics.
new_df['time_period'] = period_flags.idxmax(axis=1).where(period_flags.any(axis=1))

# Group by topic and time period
avg_comments = new_df.groupby(['assigned_topics', 'time_period'])['total_comments'].mean().unstack()

colors = ['#f4a6a6', '#9bd3a3', '#b0b0b0', '#3498db']

# Legend text per period column. Renaming the columns (instead of passing a
# fixed list of labels to legend()) keeps each label tied to its own bars
# even when a period is absent from the data.
period_labels = {
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
}

# Plot grouped bar chart with multiple bars per topic
fig, ax = plt.subplots(figsize=(12, 3))
avg_comments.rename(columns=period_labels).plot(kind='bar', ax=ax, color=colors, width=0.7)

# Add labels above every bar that has a positive height
for container in ax.containers:
    for rect in container:
        height = rect.get_height()
        if height > 0:
            x = rect.get_x() + rect.get_width() / 2
            ax.text(x, height, f"{height:.1f}", ha='center', va='bottom', fontsize=10, fontweight='bold', color='black')

ax.set_xlabel("Topic")
ax.set_ylabel("Average Number of Comments")
ax.set_title("Average Comments per Topic Across Time Periods")
ax.legend(title="Time Period")

plt.show()

In [None]:
# Validating number of comments per video to remove outliers

# Only videos with at most 500 comments are shown
df_filtered = new_df[new_df['total_comments'] <= 500]

# Plot histogram of total_comments per video
fig, ax = plt.subplots(figsize=(10, 6))
df_filtered['total_comments'].plot(kind='hist', bins=50, color='#3498db', edgecolor='black', alpha=0.7, ax=ax)

# The axis label now states the cut-off that is actually applied above
# (it previously read "≤2000").
ax.set_xlabel("Number of Comments per Video (≤500)")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Comment Counts per Video (Filtered for Outliers)")
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# Validate number of videos with over 500 comments

over_500_mask = new_df['total_comments'] > 500
videos_over_500 = int(over_500_mask.sum())

# Total number of videos
total_videos = len(new_df)

# Calculate percentage
percentage_over_500 = (videos_over_500 / total_videos) * 100

print(f"Videos with over 500 comments: {videos_over_500} ({percentage_over_500:.2f}%)")

In [None]:
# Manually checking videos with too many comments
# (rebinds `videos_over_500` from a count to the matching rows)

videos_over_500 = new_df.loc[new_df['total_comments'].gt(500)]

# Ten most-commented videos first
ranked_over_500 = videos_over_500.sort_values(by='total_comments', ascending=False)
ranked_over_500.head(10)

In [None]:
# After reviewing the comments sections of the top 10 videos with most comments, it seems like the comments are real and from real users (not bots or any other reason)
# Therefore, only the very high extremes will be removed to keep as much as possible from the sample

In [None]:
# Validate number of videos with over 1000 comments

over_1000_mask = new_df['total_comments'] > 1000
videos_over_1000 = int(over_1000_mask.sum())

# Total number of videos
total_videos = len(new_df)

# Calculate percentage
percentage_over_1000 = (videos_over_1000 / total_videos) * 100

print(f"Videos with over 1000 comments: {videos_over_1000} ({percentage_over_1000:.2f}%)")

In [None]:
# Validate number of videos with no comments

no_comments_mask = new_df['total_comments'] == 0
videos_no_comments = int(no_comments_mask.sum())

# Total number of videos
total_videos = len(new_df)

# Calculate percentage
percentage_no_comments = (videos_no_comments / total_videos) * 100

print(f"Videos with no comments: {videos_no_comments} ({percentage_no_comments:.2f}%)")

In [None]:
# Validating number of comments per video to remove outliers

# Only videos with at most 1000 comments are shown
df_filtered = new_df[new_df['total_comments'] <= 1000]

# Plot histogram of total_comments per video
fig, ax = plt.subplots(figsize=(10, 6))
df_filtered['total_comments'].plot(kind='hist', bins=50, color='#3498db', edgecolor='black', alpha=0.7, ax=ax)

# The axis label now states the cut-off that is actually applied above
# (it previously read "≤2000").
ax.set_xlabel("Number of Comments per Video (≤1000)")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Comment Counts per Video (Filtered for Outliers)")
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# Filtering out videos with over 1000 comments
print(new_df.shape)
# .copy() keeps new_df an independent frame: later cells assign to
# new_df["time_period"], which on a filtered slice raises
# SettingWithCopyWarning.
new_df = new_df[new_df['total_comments'] <= 1000].copy()
new_df.shape

# Sentiment visualizations

In [None]:
# Visualizing average sentiment per topic

# Exclude videos with zero comments. .copy() gives an independent frame, so
# the count columns added below are not written to a slice of new_df
# (SettingWithCopyWarning).
df_filtered = new_df[new_df['total_comments'] > 0].copy()

# Rebuild per-video sentiment counts from the stored shares
df_filtered['negative_comments'] = (df_filtered['total_comments'] * df_filtered['neg_percent']).round()
df_filtered['positive_comments'] = (df_filtered['total_comments'] * df_filtered['pos_percent']).round()
df_filtered['neutral_comments'] = (df_filtered['total_comments'] * df_filtered['neu_percent']).round()

# Compute total sentiment proportions per topic (counts are pooled per topic
# before dividing, so every comment weighs the same)
neg_total = df_filtered.groupby('assigned_topics')['negative_comments'].sum()
pos_total = df_filtered.groupby('assigned_topics')['positive_comments'].sum()
neu_total = df_filtered.groupby('assigned_topics')['neutral_comments'].sum()
total_comments_per_topic = df_filtered.groupby('assigned_topics')['total_comments'].sum()

sentiment_proportions = pd.DataFrame({
    'Negative': neg_total / total_comments_per_topic,
    'Positive': pos_total / total_comments_per_topic,
    'Neutral': neu_total / total_comments_per_topic
})

colors = ['#f4a6a6', '#9bd3a3', '#b0b0b0']

# Plot stacked bar chart
fig, ax = plt.subplots(figsize=(10, 4))
bars = sentiment_proportions.plot(kind='bar', stacked=True, ax=ax, color=colors)

# Add labels: write each share, as a percentage, in the middle of its segment
for bar in bars.containers:
    for rect in bar:
        height = rect.get_height()
        if height > 0:
            x = rect.get_x() + rect.get_width() / 2
            y = rect.get_y() + height / 2
            percentage = f"{height * 100:.1f}%"
            ax.text(x, y, percentage, ha='center', va='center', fontsize=10, color='black', fontweight='bold')

ax.set_xlabel("Topic")
ax.set_ylabel("Proportion of Sentiment (%)")
ax.set_title("Stacked Bar Chart of Sentiment Proportions by Topic")
ax.legend(title="Sentiment", labels=['Negative', 'Positive', 'Neutral'], loc='upper left', bbox_to_anchor=(1, 1))

plt.tight_layout()
plt.show()

In [None]:
# Sentiment distribution over time for topic 4

# Keep topic 4 videos that have at least one comment. .copy() gives an
# independent frame, so the count columns added below are not written to a
# slice of new_df (SettingWithCopyWarning).
df_filtered = new_df[(new_df['total_comments'] > 0) & (new_df['assigned_topics'] == 4)].copy()

# Rebuild per-video sentiment counts from the stored shares
df_filtered['negative_comments'] = (df_filtered['total_comments'] * df_filtered['neg_percent']).round()
df_filtered['positive_comments'] = (df_filtered['total_comments'] * df_filtered['pos_percent']).round()
df_filtered['neutral_comments'] = (df_filtered['total_comments'] * df_filtered['neu_percent']).round()

# Compute sentiment proportions per time period (counts are pooled per period
# before dividing)
neg_total = df_filtered.groupby('time_period')['negative_comments'].sum()
pos_total = df_filtered.groupby('time_period')['positive_comments'].sum()
neu_total = df_filtered.groupby('time_period')['neutral_comments'].sum()
total_comments_per_period = df_filtered.groupby('time_period')['total_comments'].sum()

sentiment_proportions = pd.DataFrame({
    'Negative': neg_total / total_comments_per_period,
    'Positive': pos_total / total_comments_per_period,
    'Neutral': neu_total / total_comments_per_period
})

colors = {'Negative': '#f4a6a6', 'Positive': '#9bd3a3', 'Neutral': '#b0b0b0'}

# Plot
fig, ax = plt.subplots(figsize=(10, 4))
bars = sentiment_proportions.plot(kind='bar', stacked=True, ax=ax, color=colors.values())

# Labels: write each share, as a percentage, in the middle of its segment
for container in ax.containers:
    for rect in container:
        height = rect.get_height()
        if height > 0:
            x = rect.get_x() + rect.get_width() / 2
            y = rect.get_y() + height / 2
            percentage = f"{height * 100:.1f}%"
            ax.text(x, y, percentage, ha='center', va='center', fontsize=10, color='black', fontweight='bold')

ax.set_xlabel("Time Period")
ax.set_ylabel("Proportion of Sentiment (%)")
ax.set_title("Sentiment Distribution for Topic 4 Over Time")
ax.legend(title="Sentiment", loc='upper left', bbox_to_anchor=(1, 1))

plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Sentiment distribution over time for topic 8

# Keep topic 8 videos that have at least one comment. .copy() gives an
# independent frame, so the count columns added below are not written to a
# slice of new_df (SettingWithCopyWarning).
df_filtered = new_df[(new_df['total_comments'] > 0) & (new_df['assigned_topics'] == 8)].copy()

# Rebuild per-video sentiment counts from the stored shares
df_filtered['negative_comments'] = (df_filtered['total_comments'] * df_filtered['neg_percent']).round()
df_filtered['positive_comments'] = (df_filtered['total_comments'] * df_filtered['pos_percent']).round()
df_filtered['neutral_comments'] = (df_filtered['total_comments'] * df_filtered['neu_percent']).round()

# Compute sentiment proportions per time period (counts are pooled per period
# before dividing)
neg_total = df_filtered.groupby('time_period')['negative_comments'].sum()
pos_total = df_filtered.groupby('time_period')['positive_comments'].sum()
neu_total = df_filtered.groupby('time_period')['neutral_comments'].sum()
total_comments_per_period = df_filtered.groupby('time_period')['total_comments'].sum()

sentiment_proportions = pd.DataFrame({
    'Negative': neg_total / total_comments_per_period,
    'Positive': pos_total / total_comments_per_period,
    'Neutral': neu_total / total_comments_per_period
})

colors = {'Negative': '#f4a6a6', 'Positive': '#9bd3a3', 'Neutral': '#b0b0b0'}

# Plot
fig, ax = plt.subplots(figsize=(10, 4))
bars = sentiment_proportions.plot(kind='bar', stacked=True, ax=ax, color=colors.values())

# Labels: write each share, as a percentage, in the middle of its segment
for container in ax.containers:
    for rect in container:
        height = rect.get_height()
        if height > 0:
            x = rect.get_x() + rect.get_width() / 2
            y = rect.get_y() + height / 2
            percentage = f"{height * 100:.1f}%"
            ax.text(x, y, percentage, ha='center', va='center', fontsize=10, color='black', fontweight='bold')

ax.set_xlabel("Time Period")
ax.set_ylabel("Proportion of Sentiment (%)")
ax.set_title("Sentiment Distribution for Topic 8 Over Time")
ax.legend(title="Sentiment", loc='upper left', bbox_to_anchor=(1, 1))

plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Negative sentiment distribution per group over time for Topic 4

# Time period labels
pretty_labels = {
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
}

# Filter Topic 4 and exclude videos with zero comments. .copy() gives an
# independent frame, so the columns assigned below are not written to a slice
# of new_df (SettingWithCopyWarning).
df_filtered = new_df[(new_df['topic_4'] == 1) & (new_df['total_comments'] > 0)].copy()

# Convert time period into single column, already carrying the display labels
time_periods = list(pretty_labels.keys())
df_filtered['time_period'] = df_filtered[time_periods].idxmax(axis=1).replace(pretty_labels)

# Calculate total negative comments per group
df_filtered['negative_comments'] = (df_filtered['total_comments'] * df_filtered['neg_percent']).round()

# Group by time period and bin_venezuelan, calculating total negative comment proportion
neg_comment_totals = df_filtered.groupby(['time_period', 'bin_venezuelan'])['negative_comments'].sum()
total_comment_totals = df_filtered.groupby(['time_period', 'bin_venezuelan'])['total_comments'].sum()
neg_comment_proportion = (neg_comment_totals / total_comment_totals).unstack()

colors = ['#f08080', '#e74c3c']

# Legend text per bin_venezuelan value. Renaming the columns ties each label
# to its own bars even if only one of the two groups is present.
legend_names = {0: 'No (0)', 1: 'Yes (1)'}

# Plot
fig, ax = plt.subplots(figsize=(10, 3))
neg_comment_proportion.rename(columns=legend_names).plot(kind='bar', ax=ax, color=colors, width=0.7)

# Labels: percentage just below the top edge of every positive bar
for container in ax.containers:
    for rect in container:
        height = rect.get_height()
        if height > 0:
            x = rect.get_x() + rect.get_width() / 2
            y_adjustment = height * 0.02
            percentage = f"{height * 100:.1f}%"
            ax.text(x, height - y_adjustment, percentage, ha='center', va='bottom', fontsize=10, color='black', fontweight='bold')

ax.set_xlabel("Time Period")
ax.set_ylabel("Proportion of Negative Comments (%)")
ax.set_title("Proportion of Negative Comments in Topic 4 by Venezuelan Mention")
ax.legend(title="Mentions Venezuelans", bbox_to_anchor=(1, 1))
# The tick labels already come from the index (the display labels), so only
# the rotation is set. Forcing all four labels would mislabel the bars
# whenever a period has no videos for this topic.
ax.tick_params(axis='x', labelrotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Negative sentiment distribution per group over time for Topic 8

# Time period labels
pretty_labels = {
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
}

# Filter Topic 8 and exclude videos with zero comments. .copy() gives an
# independent frame, so the columns assigned below are not written to a slice
# of new_df (SettingWithCopyWarning).
df_filtered = new_df[(new_df['topic_8'] == 1) & (new_df['total_comments'] > 0)].copy()

# Convert time period into single column, already carrying the display labels
time_periods = list(pretty_labels.keys())
df_filtered['time_period'] = df_filtered[time_periods].idxmax(axis=1).replace(pretty_labels)

# Calculate total negative comments per group
df_filtered['negative_comments'] = (df_filtered['total_comments'] * df_filtered['neg_percent']).round()

# Group by time period and bin_venezuelan, calculating total negative comment proportion
neg_comment_totals = df_filtered.groupby(['time_period', 'bin_venezuelan'])['negative_comments'].sum()
total_comment_totals = df_filtered.groupby(['time_period', 'bin_venezuelan'])['total_comments'].sum()
neg_comment_proportion = (neg_comment_totals / total_comment_totals).unstack()

colors = ['#f08080', '#e74c3c']

# Legend text per bin_venezuelan value. Renaming the columns ties each label
# to its own bars even if only one of the two groups is present.
legend_names = {0: 'No (0)', 1: 'Yes (1)'}

# Plot
fig, ax = plt.subplots(figsize=(10, 3))
neg_comment_proportion.rename(columns=legend_names).plot(kind='bar', ax=ax, color=colors, width=0.7)

# Labels: percentage just below the top edge of every positive bar
for container in ax.containers:
    for rect in container:
        height = rect.get_height()
        if height > 0:
            x = rect.get_x() + rect.get_width() / 2
            y_adjustment = height * 0.02
            percentage = f"{height * 100:.1f}%"
            ax.text(x, height - y_adjustment, percentage, ha='center', va='bottom', fontsize=10, color='black', fontweight='bold')

ax.set_xlabel("Time Period")
ax.set_ylabel("Proportion of Negative Comments (%)")
ax.set_title("Proportion of Negative Comments in Topic 8 by Venezuelan Mention")
ax.legend(title="Mentions Venezuelans", bbox_to_anchor=(1, 1))
# The tick labels already come from the index (the display labels), so only
# the rotation is set. Forcing all four labels would mislabel the bars
# whenever a period has no videos for this topic.
ax.tick_params(axis='x', labelrotation=0)

plt.tight_layout()
plt.show()

# Fisher's exact test to see if there are differences in negative sentiment

Normal distribution validation

In [None]:
new_df.head()

In [None]:
# Exclude rows where total_comments = 0
filtered_df = new_df.loc[new_df["total_comments"] > 0]

# Extract the neg_percent values
neg_percent_values = filtered_df["neg_percent"]

# Plot histogram (density scale, with a KDE overlay)
sns.histplot(neg_percent_values, bins=30, kde=True, stat="density", color="skyblue")

# Normal distribution curve with the sample's own mean and standard
# deviation, evaluated on 100 points across the observed range
mean = neg_percent_values.mean()
std = neg_percent_values.std()
x_values = np.linspace(neg_percent_values.min(), neg_percent_values.max(), 100)
y_values = stats.norm.pdf(x_values, loc=mean, scale=std)
plt.plot(x_values, y_values, color="red", linestyle="dashed", label="Normal Distribution")

plt.xlabel("Percentage of Negative Comments")
plt.ylabel("Density")
plt.title("Distribution of neg_percent")
plt.legend()
plt.show()

In [None]:
# Kolmogorov-Smirnov test
# Compares neg_percent (videos with at least one comment) against a normal
# distribution that uses the sample's own mean and standard deviation.
# NOTE(review): the reference parameters are estimated from the sample being
# tested, while the standard KS p-value assumes they were fixed beforehand
# (Lilliefors' variant addresses this) -- confirm the p-value is read with
# that in mind.

filtered_df = new_df.loc[new_df["total_comments"] > 0]
neg_percent_values = filtered_df["neg_percent"]

sample_mean = neg_percent_values.mean()
sample_std = neg_percent_values.std()
ks_statistic, p_value = stats.kstest(neg_percent_values, 'norm', args=(sample_mean, sample_std))

# Print results
print(f"Kolmogorov-Smirnov Statistic: {ks_statistic}")
print(f"P-value: {p_value}")

# Interpret results at the 5% level
print(
    "The neg_percent data does NOT follow a normal distribution"
    if p_value < 0.05
    else "The neg_percent data follows a normal distribution"
)

In [None]:
# Validating video counts in each group
# Topics 4 and 8 only, videos with at least one comment; one row per
# (topic, time period, bin_venezuelan) combination.

in_scope = new_df["assigned_topics"].isin([4, 8]) & (new_df["total_comments"] > 0)
filtered_df = new_df[in_scope]
video_counts = (
    filtered_df
    .groupby(["assigned_topics", "time_period", "bin_venezuelan"])
    .size()
    .reset_index(name="video_count")
)
print(video_counts)

Topic 4 tests

In [None]:
# Filter dataset for topic 4 and remove videos with no comments

filtered_df = new_df[(new_df["assigned_topics"] == 4) & (new_df["total_comments"] > 0)].copy()

# `time_period` is created with the raw 'period_*' names and is only renamed
# to the labels below by a later visualization cell. Mapping it here (a no-op
# when already renamed) keeps the per-period filter from matching nothing,
# which skipped every test on a top-to-bottom run.
filtered_df["time_period"] = filtered_df["time_period"].replace({
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
})

time_periods = ['2019-2021', '2022', '2023', '2024']

# Perform Fisher's Exact Test for each time period
for period in time_periods:
    period_df = filtered_df[filtered_df["time_period"] == period].copy()

    # Rebuild the per-video number of negative comments from the stored share
    period_df["negative_comments"] = np.round(period_df["neg_percent"] * period_df["total_comments"]).astype(int)

    # Create contingency table: bin_venezuelan vs. sum of negative and non-negative comments
    contingency_table = period_df.groupby("bin_venezuelan").agg(
        negative_comments=("negative_comments", "sum"),
        total_comments=("total_comments", "sum")
    )

    # Add non-negative comments
    contingency_table["non_negative_comments"] = contingency_table["total_comments"] - contingency_table["negative_comments"]
    contingency_table = contingency_table.drop(columns=["total_comments"])

    # Ensure the table is 2x2 (both bin_venezuelan groups present in the period)
    if contingency_table.shape == (2, 2):
        stat, p_value = fisher_exact(contingency_table)
        print(f"Fisher's Exact Test for {period}: Statistic = {stat}, P-value = {p_value}")

        if p_value < 0.05:
            print("Significant association between bin_venezuelan and number of negative comments.")
        else:
            print("No significant association between bin_venezuelan and number of negative comments.")
    else:
        print(f"Skipping {period}: Contingency table is not 2x2, Fisher's Exact Test requires a 2x2 table.")

Topic 8 tests

In [None]:
# Filter dataset for topic 8 and remove videos with no comments

filtered_df = new_df[(new_df["assigned_topics"] == 8) & (new_df["total_comments"] > 0)].copy()

# `time_period` is created with the raw 'period_*' names and is only renamed
# to the labels below by a later visualization cell. Mapping it here (a no-op
# when already renamed) keeps the per-period filter from matching nothing,
# which skipped every test on a top-to-bottom run.
filtered_df["time_period"] = filtered_df["time_period"].replace({
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
})

time_periods = ['2019-2021', '2022', '2023', '2024']

# Perform Fisher's Exact Test for each time period
for period in time_periods:
    period_df = filtered_df[filtered_df["time_period"] == period].copy()

    # Rebuild the per-video number of negative comments from the stored share
    period_df["negative_comments"] = np.round(period_df["neg_percent"] * period_df["total_comments"]).astype(int)

    # Create contingency table: bin_venezuelan vs. sum of negative and non-negative comments
    contingency_table = period_df.groupby("bin_venezuelan").agg(
        negative_comments=("negative_comments", "sum"),
        total_comments=("total_comments", "sum")
    )

    # Add non-negative comments
    contingency_table["non_negative_comments"] = contingency_table["total_comments"] - contingency_table["negative_comments"]
    contingency_table = contingency_table.drop(columns=["total_comments"])

    # Ensure the table is 2x2 (both bin_venezuelan groups present in the period)
    if contingency_table.shape == (2, 2):
        stat, p_value = fisher_exact(contingency_table)
        print(f"Fisher's Exact Test for {period}: Statistic = {stat}, P-value = {p_value}")

        if p_value < 0.05:
            print("Significant association between bin_venezuelan and number of negative comments.")
        else:
            print("No significant association between bin_venezuelan and number of negative comments.")
    else:
        print(f"Skipping {period}: Contingency table is not 2x2, Fisher's Exact Test requires a 2x2 table.")

# Quantitative analysis: video-level visualizations

In [None]:
new_df.head()

Videos per topic over time

In [None]:
# Legend text for each topic id
topic_dict = { 0: "0: Armed theft to businesses captured on security cameras",
              1: "1: Crime stories through interviews with victims' families",
               2: "2: Motorcycle crimes and witness involvement",
               3: "3: Neighborhood crime stories and self-organized measures",
               4: "4: Police intervention and gang capture cases",
               5: "5: Crimes linked to politics and public protests",
               6: "6: Extortion crimes and violent retaliation",
               7: "7: Robberies executed by organized crime groups",
               8: "8: Kidnapping crimes and police intervention",
               9: "9: Cybertheft through stolen devices",
               10: "10: Crimes involving dogs as victims, perpetrators or protectors" }


# Rename time periods. assign() builds a new frame instead of writing a
# column into new_df, which is a filtered slice at this point
# (SettingWithCopyWarning).
new_df = new_df.assign(time_period=new_df["time_period"].replace({
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
}))

# Count videos per topic and time period
topic_counts = new_df.groupby(['time_period', 'assigned_topics'])['video_id'].count().reset_index()

# Define markers
markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', 'h', '*', 'X']

# Every line is drawn over the same ordered list of periods. Plotting each
# topic only over the periods it appears in lets the first topic plotted
# decide the order of the categorical x axis.
period_order = sorted(topic_counts["time_period"].unique())

# Plot
plt.figure(figsize=(13, 6))
for i, topic in enumerate(topic_counts["assigned_topics"].unique()):
    subset = topic_counts[topic_counts["assigned_topics"] == topic]
    # A period without videos for this topic is a count of 0
    counts_by_period = subset.set_index("time_period")["video_id"].reindex(period_order, fill_value=0)
    plt.plot(period_order, counts_by_period.to_numpy(), marker=markers[i % len(markers)], label=topic_dict.get(topic, str(topic)))

plt.xlabel("Time Period")
plt.ylabel("Number of Videos")
plt.title("Topic Trends Over Time")
plt.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

Weight of each topic over time

In [None]:
# Rename time periods. assign() builds a new frame instead of writing a
# column into new_df, which is a filtered slice at this point
# (SettingWithCopyWarning).
new_df = new_df.assign(time_period=new_df["time_period"].replace({
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
}))

# Count videos per topic and time period
topic_counts = new_df.groupby(['time_period', 'assigned_topics'])['video_id'].count().reset_index()

# Calculate percentages (share of each topic within its time period)
topic_counts['percentage'] = topic_counts.groupby('time_period')['video_id'].transform(lambda x: x / x.sum() * 100)

# Pivot data. A topic that is absent from a period has a share of 0; without
# fillna the NaN would propagate through `bottom` and hide every bar stacked
# above it for that period.
pivot_df = topic_counts.pivot(index='time_period', columns='assigned_topics', values='percentage').fillna(0)

# Define patterns for each topic
hatch_patterns = ['/', '\\', '|', '-', '+', 'x', 'o', 'O', '.', '*', '']
topic_hatch_map = {topic: hatch_patterns[i % len(hatch_patterns)] for i, topic in enumerate(pivot_df.columns)}

# Plot: stack one bar segment per topic on top of the running total
fig, ax = plt.subplots(figsize=(13, 6))
colors = plt.cm.tab10.colors
bottom = None
bars = {}
for i, topic in enumerate(pivot_df.columns):
    label = topic_dict.get(topic, str(topic))
    bar = ax.bar(pivot_df.index, pivot_df[topic], label=label, color=colors[i % len(colors)], bottom=bottom)
    bars[topic] = bar
    bottom = pivot_df[topic] if bottom is None else bottom + pivot_df[topic]

# The colours repeat once the palette is exhausted; the hatch keeps topics
# that share a colour distinguishable.
for topic, bar_group in bars.items():
    for bar in bar_group:
        bar.set_hatch(topic_hatch_map[topic])

ax.set_xlabel("Time Period")
ax.set_ylabel("Percentage (%)")
ax.set_title("Distribution of Crime Topics Over Time")
ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

Videos per topic over time (Venezuelan offenders)

In [None]:
# Filter the dataset to include only reports mentioning a Venezuelan offender
# .copy() gives an independent frame, so the time_period assignment below is
# not written to a slice of new_df (SettingWithCopyWarning).

filtered_df = new_df[new_df["bin_venezuelan"] == 1].copy()

# Legend text for each topic id
topic_dict = { 0: "0: Armed theft to businesses captured on security cameras",
              1: "1: Crime stories through interviews with victims' families",
               2: "2: Motorcycle crimes and witness involvement",
               3: "3: Neighborhood crime stories and self-organized measures",
               4: "4: Police intervention and gang capture cases",
               5: "5: Crimes linked to politics and public protests",
               6: "6: Extortion crimes and violent retaliation",
               7: "7: Robberies executed by organized crime groups",
               8: "8: Kidnapping crimes and police intervention",
               9: "9: Cybertheft through stolen devices",
               10: "10: Crimes involving dogs as victims, perpetrators or protectors" }


# Rename time periods
filtered_df["time_period"] = filtered_df["time_period"].replace({
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
})

# Count videos per topic and period
topic_counts = filtered_df.groupby(['time_period', 'assigned_topics'])['video_id'].count().reset_index()

# Define markers
markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', 'h', '*', 'X']

# Every line is drawn over the same ordered list of periods. Plotting each
# topic only over the periods it appears in lets the first topic plotted
# decide the order of the categorical x axis.
period_order = sorted(topic_counts["time_period"].unique())

# Plot
plt.figure(figsize=(13, 6))

for i, topic in enumerate(topic_counts["assigned_topics"].unique()):
    subset = topic_counts[topic_counts["assigned_topics"] == topic]
    # A period without videos for this topic is a count of 0
    counts_by_period = subset.set_index("time_period")["video_id"].reindex(period_order, fill_value=0)
    plt.plot(period_order, counts_by_period.to_numpy(), marker=markers[i % len(markers)], label=topic_dict.get(topic, str(topic)))

plt.xlabel("Time Period")
plt.ylabel("Number of Videos")
plt.title("Topic Trends Over Time for Videos Mentioning Venezuelan Offenders")
plt.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

Weight of each topic over time (Venezuelan offenders)

In [None]:
# Stacked bar chart: percentage share of each topic within every time period,
# computed over filtered_df (built in the previous cell from the rows of new_df
# with bin_venezuelan == 1).

# Rename time periods. assign() builds a new frame rather than writing into
# filtered_df in place, so this is safe even when filtered_df is a slice of
# new_df. The replacement itself is idempotent.
filtered_df = filtered_df.assign(
    time_period=filtered_df["time_period"].replace({
        'period_2019_2021': '2019-2021',
        'period_2022': '2022',
        'period_2023': '2023',
        'period_2024': '2024'
    })
)

# Count videos per topic and period
topic_counts = filtered_df.groupby(['time_period', 'assigned_topics'])['video_id'].count().reset_index()

# Share of each topic inside its own period, in percent
topic_counts['percentage'] = topic_counts.groupby('time_period')['video_id'].transform(lambda x: x / x.sum() * 100)

# Wide layout: one row per period, one column per topic.
# A (period, topic) pair with no videos has no row in topic_counts, so pivot()
# yields NaN for it. Left as NaN it would poison the running `bottom` below and
# every segment stacked on top of it in that period would not be drawn, so a
# missing pair is treated as a 0% share.
pivot_df = topic_counts.pivot(index='time_period', columns='assigned_topics', values='percentage').fillna(0)

# One hatch pattern per topic; the palette below has fewer colours than there
# are patterns, so the hatch is what keeps colour-sharing topics distinguishable.
hatch_patterns = ['/', '\\', '|', '-', '+', 'x', 'o', 'O', '.', '*', '']
topic_hatch_map = {topic: hatch_patterns[i % len(hatch_patterns)] for i, topic in enumerate(pivot_df.columns)}

# Plot
fig, ax = plt.subplots(figsize=(13, 6))
colors = plt.cm.tab10.colors
bottom = np.zeros(len(pivot_df.index))  # running height of the stack per period
bars = {}

for i, topic in enumerate(pivot_df.columns):
    bars[topic] = ax.bar(
        pivot_df.index,
        pivot_df[topic],
        label=topic_dict[topic],
        color=colors[i % len(colors)],
        hatch=topic_hatch_map[topic],
        bottom=bottom,
    )
    bottom = bottom + pivot_df[topic].to_numpy()

ax.set_xlabel("Time Period")
ax.set_ylabel("Percentage (%)")
# The title names the subset so this figure cannot be mistaken for the
# all-videos distribution chart, which otherwise looks the same.
ax.set_title("Distribution of Crime Topics Over Time for Videos Mentioning Venezuelan Offenders")
ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

Video counts per time period

In [None]:
# Bar chart of the total number of videos per time period. The "2019-2021" bar
# is drawn as a stack of its individual years, each segment labelled with its
# own count; the other periods are single bars labelled with their total.

# Extract year from video_date
new_df["year"] = pd.to_datetime(new_df["video_date"]).dt.year

# Rename time periods (idempotent: already-renamed values are left as they are)
new_df["time_period"] = new_df["time_period"].replace({
    'period_2019_2021': '2019-2021',
    'period_2022': '2022',
    'period_2023': '2023',
    'period_2024': '2024'
})

# Count videos per period
video_counts = new_df.groupby("time_period")["video_id"].count().reset_index()

# Separate counts for 2019, 2020 and 2021, selected by calendar year.
# NOTE(review): these rows are chosen by `year`, not by time_period, so the
# stack height equals the period total only if the two columns agree - confirm.
years_2019_2021 = new_df[new_df["year"].isin([2019, 2020, 2021])].groupby("year")["video_id"].count().reset_index()

blue_shades = ["#aec6cf", "#6495ed", "#0f4c75"]  # one shade per stacked year
default_blue = "#1f77b4"
label_offset = 20  # vertical gap, in data units, between a bar top and its total label

fig, ax = plt.subplots(figsize=(10, 5))

bottom_value = 0  # running height of the 2019-2021 stack

for time_period, count in zip(video_counts["time_period"], video_counts["video_id"]):
    if time_period == "2019-2021":
        for i, (year, year_count) in enumerate(zip(years_2019_2021["year"], years_2019_2021["video_id"])):
            ax.bar(time_period, year_count, color=blue_shades[i], bottom=bottom_value)
            # Per-year label centred inside its own segment
            ax.text(time_period, bottom_value + (year_count / 2), f"{year}: {int(year_count)}",
                    ha='center', va='center', fontsize=10, fontweight='bold')
            bottom_value += year_count
        # Total for the stacked bar. Drawn here rather than after the loop, so
        # no stray "0" label (and extra x category) appears when the data has
        # no 2019-2021 period at all.
        ax.text(time_period, bottom_value + label_offset, f"{int(bottom_value)}",
                ha='center', va='bottom', fontsize=10, fontweight='bold')
    else:
        ax.bar(time_period, count, color=default_blue)
        ax.text(time_period, count + label_offset, f"{int(count)}", ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_xlabel("Time Period")
ax.set_ylabel("Total Number of Videos")
ax.set_title("Total Videos Per Time Period")

plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()