In [1]:
import torch
import numpy as np
import networkx as nx
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from tqdm import tqdm

def _make_logits_consistent(x, R):
    c_out = x.unsqueeze(1) + 10
    c_out = c_out.expand(len(x), R.shape[1], R.shape[1])
    R_batch = R.expand(len(x), R.shape[1], R.shape[1]).to(x.device)
    final_out, _ = torch.max(R_batch * c_out, dim=2)
    return final_out - 10

def _build_persuasion_graph():
    """Return the persuasion-technique taxonomy as a directed graph.

    Edges point from parent to child. Edges are inserted one by one in a
    fixed order, so the node order of the graph (and therefore the index of
    every label) is deterministic; "ROOT" is always node 0.
    """
    edges = [
        ("ROOT", "Logos"),
        ("Logos", "Repetition"),
        ("Logos", "Obfuscation, Intentional vagueness, Confusion"),
        ("Logos", "Reasoning"),
        ("Logos", "Justification"),
        ("Justification", "Slogans"),
        ("Justification", "Bandwagon"),
        ("Justification", "Appeal to authority"),
        ("Justification", "Flag-waving"),
        ("Justification", "Appeal to fear/prejudice"),
        ("Reasoning", "Simplification"),
        ("Simplification", "Causal Oversimplification"),
        ("Simplification", "Black-and-white Fallacy/Dictatorship"),
        ("Simplification", "Thought-terminating cliché"),
        ("Reasoning", "Distraction"),
        ("Distraction", "Misrepresentation of Someone's Position (Straw Man)"),
        ("Distraction", "Presenting Irrelevant Data (Red Herring)"),
        ("Distraction", "Whataboutism"),
        ("ROOT", "Ethos"),
        ("Ethos", "Appeal to authority"),
        ("Ethos", "Glittering generalities (Virtue)"),
        ("Ethos", "Bandwagon"),
        ("Ethos", "Ad Hominem"),
        ("Ethos", "Transfer"),
        ("Ad Hominem", "Doubt"),
        ("Ad Hominem", "Name calling/Labeling"),
        ("Ad Hominem", "Smears"),
        ("Ad Hominem", "Reductio ad hitlerum"),
        ("Ad Hominem", "Whataboutism"),
        ("ROOT", "Pathos"),
        ("Pathos", "Exaggeration/Minimisation"),
        ("Pathos", "Loaded Language"),
        ("Pathos", "Appeal to (Strong) Emotions"),
        ("Pathos", "Appeal to fear/prejudice"),
        ("Pathos", "Flag-waving"),
        ("Pathos", "Transfer"),
    ]
    G = nx.DiGraph()
    for parent, child in edges:
        G.add_edge(parent, child)
    return G


def _build_descendant_mask(G, n_map):
    """Return a (1, N, N) float64 tensor ``R`` describing the hierarchy.

    ``R[0, a, d]`` is 1 when ``d`` is ``a`` itself or a descendant of ``a``
    in ``G``, and 0 otherwise. Fed to ``_make_logits_consistent`` this makes
    every node take the maximum logit over itself and all of its descendants.
    """
    R = np.eye(len(n_map))
    for node, row in n_map.items():
        for descendant in nx.descendants(G, node):
            R[row, n_map[descendant]] = 1
    return torch.tensor(R).unsqueeze(0)


def generate_labels_for_text(model_dir, texts, max_length=128,
                             tokenizer_name="facebook/mbart-large-50-many-to-many-mmt"):
    """Predict hierarchical persuasion-technique labels for each text.

    Each text is tokenized and classified on its own. The raw logits are made
    consistent with the taxonomy (a node's logit becomes the max over itself
    and its descendants), the artificial "ROOT" node is forced off, and every
    node whose resulting logit is greater than 0 is reported.

    Args:
        model_dir: path or identifier passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        texts: iterable of strings to label.
        max_length: padding/truncation length used for tokenization.
            NOTE(review): the default of 128 is kept from the original code;
            confirm it matches the length the checkpoint was trained with.
        tokenizer_name: identifier passed to ``AutoTokenizer.from_pretrained``.
            NOTE(review): the tokenizer is loaded from the base checkpoint
            rather than from ``model_dir`` -- confirm this is intended.

    Returns:
        A list with one entry per input text; each entry is the list of
        predicted label names, ordered by node order in the taxonomy graph.
    """
    # Taxonomy, label <-> index mapping and the hierarchy mask.
    G = _build_persuasion_graph()
    nodes = list(G.nodes)
    n_map = {name: index for index, name in enumerate(nodes)}
    R = _build_descendant_mask(G, n_map)
    root_index = n_map["ROOT"]

    # Load tokenizer and classifier; run on the GPU when one is available.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # inference only: make sure dropout is disabled

    all_predicted_labels = []
    for text in tqdm(texts):
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        with torch.no_grad():
            outputs = model(
                input_ids=encoding["input_ids"].to(device),
                attention_mask=encoding["attention_mask"].to(device),
            )

        logits = _make_logits_consistent(outputs.logits, R)
        # ROOT is an ancestor of every node, so it would always inherit the
        # largest logit; it is not a real label and is switched off here.
        logits[:, root_index] = -1.0
        active = (logits[0] > 0.0).cpu().tolist()
        all_predicted_labels.append(
            [name for name, is_on in zip(nodes, active) if is_on]
        )

    return all_predicted_labels

# Checkpoint to load, plus a few sample texts for a quick smoke run.
best_model_dir = "../../what-do-you-meme/subtask1/models/mBART-with-hierarchy-redo/best_model_lr5e-05_maxlen256_batchsize64_epoch3"
texts_to_label = [
    "THIS IS WHY YOU NEED. A SHARPIE WITH YOU AT ALL TIMES.",
    "WHEN YOU'RE THE FBI, THEY LET YOU DO IT.",
    "Move your ships away!\n\noooook\n\nMove your ships away!\n\nNo, and I just added 10 more",
    "Let's Make America Great Again!",
]

# Classify the samples with the best model.
all_predicted_labels = generate_labels_for_text(best_model_dir, texts_to_label)

# One output line per sample, in input order.
for labels in all_predicted_labels:
    print("Generated Labels:", labels)
    
# Print the generated probabilities for each text
# for text, probabilities in zip(texts_to_label, all_predicted_probabilities):
#     print(f"Generated Probabilities:", probabilities)

100%|██████████| 4/4 [00:00<00:00,  9.26it/s]


Generated Labels: ['Logos', 'Reasoning', 'Simplification', 'Thought-terminating cliché']
Generated Labels: ['Logos', 'Reasoning', 'Simplification', 'Thought-terminating cliché']
Generated Labels: []
Generated Labels: ['Logos', 'Justification', 'Slogans', 'Flag-waving', 'Pathos']


In [2]:
# Read the UK tweets, one tweet per line.
# The encoding is pinned to UTF-8 because the tweets contain non-ASCII
# characters (e.g. curly quotes); without it the decoding would depend on
# the platform's locale default.
with open('../../march-of-nationalism/data/UK-R-migr-RA-twit-meta-2012-2022-tweets-only.txt', 'r', encoding='utf-8') as f:
    docs_UK = f.read().splitlines()

docs_UK[:5]

["RT John Denham can't bring himself to admit immigration is too high. If Labour can't do that, this 'apology' is meaningless.",
 'Just raised issue in HofC of support for Syrian refugees in Turkey, with Foreign Sec, encouraging response with financial support.',
 'We made a clear promise to the British public ... this gov would reduce and control immigration - Home Secretary ',
 '‘We’re delivering on our promise to control immigration’ – Home Secretary Theresa May ',
 'In her speech today the Home Secretary addressed some common misconceptions about immigration ']

In [3]:
"""Label the UK Tweets with the best classifier"""

import json

def tweet_labels(tweets, output_file_path, model_dir):
    """Label every tweet and write the results to a JSON file.

    Args:
        tweets: sequence of tweet strings to classify.
        output_file_path: destination of the UTF-8 encoded JSON output.
        model_dir: model location forwarded to ``generate_labels_for_text``.

    The file contains a list of ``{"tweet": ..., "labels": [...]}`` objects,
    one per tweet, in input order.
    """
    predicted = generate_labels_for_text(model_dir, tweets)

    # Pair each tweet with the labels predicted for it.
    records = [
        {"tweet": tweet, "labels": labels}
        for tweet, labels in zip(tweets, predicted)
    ]

    with open(output_file_path, 'w', encoding='utf-8') as sink:
        json.dump(records, sink, ensure_ascii=False, indent=2)

# Label the UK tweets and store the predictions next to the other data files.
output_file_path_UK = "../data/UK-migr-2012-2022-tweets-persuasion.json"

tweet_labels(
    tweets=docs_UK,
    output_file_path=output_file_path_UK,
    model_dir=best_model_dir,
)

print(f"Labels generated and saved to {output_file_path_UK}.")

100%|██████████| 6457/6457 [02:48<00:00, 38.24it/s]


Labels generated and saved to ../data/UK-migr-2012-2022-tweets-persuasion.json.


In [4]:
# Read the French tweets, one tweet per line.
# The encoding is pinned to UTF-8 because the text is full of accented
# characters; without it the decoding would depend on the platform's
# locale default.
with open('../../march-of-nationalism/data/FR-R-MIGR-TWIT-2011-2022_meta-tweets-only.txt', 'r', encoding='utf-8') as f:
    docs_French = f.read().splitlines()

docs_French[:5]

["Dans cette vidéo, je dénonçais l'arnaque des bons sentiments en matière d'immigration ",
 "Je démonte les idées reçues sur l'immigration ",
 "A 15H aux questions d'actualités, j'interogerai M. Guéant sur les faits de délinquance liés à l'immigration clandestine venue de Tunisie",
 'Nicolas Bay invité de « Objectif Elysée » en débat sur l’immigration | Front National: ',
 'Guéant reconnaît le bilan dramatique de la politique d’immigration de #Sarkozy | Front National: ']

In [5]:
# Label the French tweets and store the predictions next to the other data files.
output_file_path_French = "../data/FR-migr-2011-2022-tweets-persuasion.json"

tweet_labels(
    tweets=docs_French,
    output_file_path=output_file_path_French,
    model_dir=best_model_dir,
)

print(f"Labels generated and saved to {output_file_path_French}.")

100%|██████████| 11489/11489 [05:05<00:00, 37.61it/s]


Labels generated and saved to ../data/FR-migr-2011-2022-tweets-persuasion.json.


In [6]:
import json
import plotly.graph_objects as go
from collections import Counter
import networkx as nx

def preprocess_tweets(data):
    """Group tweet texts by the labels assigned to them.

    Args:
        data: iterable of mappings with a ``'tweet'`` entry and a ``'labels'``
            entry (an iterable of label names).

    Returns:
        A dict mapping each label to the list of tweets carrying it. Tweets
        appear in input order and labels in order of first appearance; a
        tweet with several labels is listed under each of them.
    """
    grouped = {}
    for record in data:
        text = record['tweet']
        for name in record['labels']:
            # setdefault creates the bucket on first sight of a label.
            grouped.setdefault(name, []).append(text)
    return grouped

def comparable_sorting_visualize_label_percentages_interactive(uk_json_file_path, fr_json_file_path, plot_title, save_path=None, width=1200):
    """Draw a grouped horizontal bar chart comparing label shares of two corpora.

    Both JSON files are expected to hold a list of objects with ``'tweet'``
    and ``'labels'`` entries. For each corpus the share of a label is its
    count divided by the total number of counted label assignments, in
    percent. Bars are ordered by a topological sort of the persuasion
    taxonomy, and each bar's hover text shows up to five example tweets.

    Args:
        uk_json_file_path: path of the labelled UK tweets.
        fr_json_file_path: path of the labelled French tweets.
        plot_title: title shown on the figure.
        save_path: if truthy, the figure is written there as HTML; otherwise
            the figure is shown.
        width: figure width in pixels.

    NOTE(review): the label "Pathos" is excluded from the counts (and thus
    from the chart), while the other top-level categories are kept -- confirm
    this is intentional.
    """

    def load_corpus(json_path):
        # Returns (example tweets per label, label counts) for one file.
        with open(json_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
        examples = preprocess_tweets(records)
        tally = Counter(
            name
            for record in records
            for name in record['labels']
            if name != "Pathos"
        )
        return examples, tally

    uk_tweets_by_label, uk_label_counts = load_corpus(uk_json_file_path)
    fr_tweets_by_label, fr_label_counts = load_corpus(fr_json_file_path)

    # Taxonomy used only to order the labels on the axis (parent -> child).
    taxonomy_edges = [
        ("ROOT", "Logos"),
        ("Logos", "Repetition"),
        ("Logos", "Obfuscation, Intentional vagueness, Confusion"),
        ("Logos", "Reasoning"),
        ("Logos", "Justification"),
        ("Justification", "Slogans"),
        ("Justification", "Bandwagon"),
        ("Justification", "Appeal to authority"),
        ("Justification", "Flag-waving"),
        ("Justification", "Appeal to fear/prejudice"),
        ("Reasoning", "Simplification"),
        ("Simplification", "Causal Oversimplification"),
        ("Simplification", "Black-and-white Fallacy/Dictatorship"),
        ("Simplification", "Thought-terminating cliché"),
        ("Reasoning", "Distraction"),
        ("Distraction", "Misrepresentation of Someone's Position (Straw Man)"),
        ("Distraction", "Presenting Irrelevant Data (Red Herring)"),
        ("Distraction", "Whataboutism"),
        ("ROOT", "Ethos"),
        ("Ethos", "Appeal to authority"),
        ("Ethos", "Glittering generalities (Virtue)"),
        ("Ethos", "Bandwagon"),
        ("Ethos", "Ad Hominem"),
        ("Ethos", "Transfer"),
        ("Ad Hominem", "Doubt"),
        ("Ad Hominem", "Name calling/Labeling"),
        ("Ad Hominem", "Smears"),
        ("Ad Hominem", "Reductio ad hitlerum"),
        ("Ad Hominem", "Whataboutism"),
        ("ROOT", "Pathos"),
        ("Pathos", "Exaggeration/Minimisation"),
        ("Pathos", "Loaded Language"),
        ("Pathos", "Appeal to (Strong) Emotions"),
        ("Pathos", "Appeal to fear/prejudice"),
        ("Pathos", "Flag-waving"),
        ("Pathos", "Transfer"),
    ]
    G = nx.DiGraph()
    for parent, child in taxonomy_edges:
        G.add_edge(parent, child)

    # Keep only labels that occur in at least one corpus, parents first.
    sorted_labels = [
        name
        for name in nx.topological_sort(G)
        if name in uk_label_counts or name in fr_label_counts
    ]

    def build_series(prefix, tally, examples):
        # Returns (percentages, hover texts), aligned with sorted_labels.
        total = sum(tally.values())
        shares = []
        hovers = []
        for name in sorted_labels:
            hits = tally.get(name, 0)
            shares.append(hits / total * 100 if total > 0 else 0)
            if name in examples:
                # Show at most five example tweets per label.
                hovers.append(f'{prefix} {name}:<br>' + '<br>'.join(examples[name][:5]))
            else:
                hovers.append('')
        return shares, hovers

    uk_percentages, uk_hover_texts = build_series('UK', uk_label_counts, uk_tweets_by_label)
    fr_percentages, fr_hover_texts = build_series('French', fr_label_counts, fr_tweets_by_label)

    # One horizontal bar trace per corpus, UK first.
    fig = go.Figure()
    traces = (
        ('UK', uk_percentages, uk_hover_texts, 'crimson'),
        ('French', fr_percentages, fr_hover_texts, 'darkcyan'),
    )
    for trace_name, shares, hovers, colour in traces:
        fig.add_trace(go.Bar(
            x=shares,
            y=sorted_labels,
            name=trace_name,
            orientation='h',
            hovertext=hovers,
            marker=dict(color=colour)
        ))

    fig.update_layout(
        title=plot_title,
        xaxis_title='Percentage',
        yaxis_title='Labels',
        hovermode='closest',
        height=800,
        width=width,
        barmode='group'  # bars of the two corpora side by side
    )

    # Either persist the figure as HTML or display it.
    if not save_path:
        fig.show()
    else:
        fig.write_html(save_path)

# Labelled tweet files produced earlier and the HTML output location.
uk_json_file_path = "../data/UK-migr-2012-2022-tweets-persuasion.json"
fr_json_file_path = "../data/FR-migr-2011-2022-tweets-persuasion.json"
save_path = "../plots/interactive-persuasion-comparison-with-examples.html"

# The first call writes the chart to disk; the second (no save_path) shows it.
comparable_sorting_visualize_label_percentages_interactive(
    uk_json_file_path,
    fr_json_file_path,
    plot_title='Type of Persuasion used in UK and French Tweets',
    save_path=save_path,
)
comparable_sorting_visualize_label_percentages_interactive(
    uk_json_file_path,
    fr_json_file_path,
    plot_title='Type of Persuasion used in UK and French Tweets',
)