In [25]:
import pandas as pd

# Load both seed datasets and preview their first rows.
ISIS_seed = pd.read_csv('Seed_MIWS/Seed_Dataset/ISIS_Seed_Complete.csv')
WS_seed = pd.read_csv('Seed_MIWS/Seed_Dataset/WS_Seed_Complete.csv')

# display() renders each positional argument in order, so one call shows both previews.
display(ISIS_seed.head(), WS_seed.head())

Unnamed: 0,Source,Type of Source,Text,Label,Geographical_Location,Author_Country_Affiliation,Unnamed: 6
0,"Chatfield et.al ""Tweeting propaganda, radicali...",Research Article,Coalition planes massacred these children in a...,Propaganda,Iraq,USA,
1,"Chatfield et.al ""Tweeting propaganda, radicali...",Research Article,these PKK fellas are exceptional liars.after t...,Propaganda,Iraq,USA,
2,"Chatfield et.al ""Tweeting propaganda, radicali...",Research Article,This is so awesome. US airstrikes also by mist...,Propaganda,Iraq,USA,
3,"Chatfield et.al ""Tweeting propaganda, radicali...",Research Article,RT @ImtiyazAzhar: Support &amp; love for #Isla...,Propaganda,India,USA,
4,"Chatfield et.al ""Tweeting propaganda, radicali...",Research Article,Ask the Americans how they liked fighting\nJTJ...,Propaganda,USA,USA,


Unnamed: 0,Source,Type_of_Source,Text,Ideology,Label,Geographical_Location,Author_Country_Affiliation,Unnamed: 7
0,"Ray and Marsh ""Recruitment by extremist groups...",Research Article,This is a deliberate choice of words. As we st...,White Supremacist,Propaganda,-,USA,
1,"Ray and Marsh ""Recruitment by extremist groups...",Research Article,Most victims of race crime - about 90 per cent...,White Supremacist,Propaganda,-,USA,
2,"Ray and Marsh ""Recruitment by extremist groups...",Research Article,WE BELIEVE that the Cananite Jew is the natura...,White Supremacist,Radicalization,-,USA,
3,"Ray and Marsh ""Recruitment by extremist groups...",Research Article,"The culture of a race, free of alien influence...",White Supremacist,Radicalization,-,USA,
4,"Ray and Marsh ""Recruitment by extremist groups...",Research Article,Influential organizations and much of the west...,White Supremacist,Propaganda,"Switzerland, Germany",USA,


In [94]:
import numpy as np

def format_eval_openai(path, system_message, labels_present=True, limit=None):
    """
    Read a .csv file and format it as evaluation data for OpenAI's Chat Completion API.

    The file is read with 'latin-1' encoding and must contain a 'Text' column
    (and a 'Label' column when `labels_present` is True). The result is a
    dictionary with the following structure:

        {
            'Inputs': [messages1, messages2, ...],
            'Labels': [label1, label2, ...]
        }

    where 'Inputs' holds one message list per row and 'Labels' holds one label
    per row. Each message list has the following format:

        [
            {
                'role': 'system',
                'content': system_message
            },
            {
                'role': 'user',
                'content': 'tweet text'
            }
        ]

    Args:
        path: The path to the .csv file.
        system_message: The system prompt placed at the start of every message list.
        labels_present: If True, labels are taken from the 'Label' column;
            otherwise every label is the string "None".
        limit: Maximum number of rows to use, counted from the top of the file.
            None (the default) uses every row; 0 or a negative value yields no rows.

    Returns:
        A dictionary with the keys 'Inputs' and 'Labels', both lists of equal length.
    """
    df = pd.read_csv(path, encoding='latin-1')
    if limit is not None:
        # Compare against None rather than truthiness so that limit=0 means
        # "no rows" instead of silently meaning "all rows".
        df = df.iloc[:max(limit, 0)]

    texts = df['Text'].tolist()
    inputs = [
        [
            {'role': 'system', 'content': system_message},
            {'role': 'user', 'content': tweet_text},
        ]
        for tweet_text in texts
    ]
    # Unlabeled data gets the placeholder label "None" for every row.
    labels = df['Label'].tolist() if labels_present else ["None"] * len(texts)
    return {
        'Inputs': inputs,
        'Labels': labels
    }

In [95]:
def format_eval_perspective(path, metrics, labels_present=True, limit=None):
    """
    Read a .csv file and format it as evaluation data for Perspective's comments().analyze API.

    The file is read with 'latin-1' encoding and must contain a 'Text' column
    (and a 'Label' column when `labels_present` is True). The result is a
    dictionary with the following structure:

        {
            'Inputs': [request1, request2, ...],
            'Labels': [label1, label2, ...]
        }

    where 'Inputs' holds one request dictionary per row and 'Labels' holds one
    label per row. Each request has the following format:

        {
            'comment': { 'text': 'tweet text' },
            'requestedAttributes': metrics
        }

    Args:
        path: The path to the .csv file.
        metrics: A dictionary of the metrics to request from the Perspective API
            (example: {'TOXICITY': {}, 'SEVERE_TOXICITY': {}}). The same
            dictionary object is referenced by every request, not copied.
        labels_present: If True, labels are taken from the 'Label' column;
            otherwise every label is the string "None".
        limit: Maximum number of rows to use, counted from the top of the file.
            None (the default) uses every row; 0 or a negative value yields no rows.

    Returns:
        A dictionary with the keys 'Inputs' and 'Labels', both lists of equal length.
    """
    df = pd.read_csv(path, encoding='latin-1')
    if limit is not None:
        # Compare against None rather than truthiness so that limit=0 means
        # "no rows" instead of silently meaning "all rows".
        df = df.iloc[:max(limit, 0)]

    texts = df['Text'].tolist()
    inputs = [
        {
            'comment': {'text': tweet_text},
            'requestedAttributes': metrics,
        }
        for tweet_text in texts
    ]
    # Unlabeled data gets the placeholder label "None" for every row.
    labels = df['Label'].tolist() if labels_present else ["None"] * len(texts)
    return {
        'Inputs': inputs,
        'Labels': labels
    }

In [96]:
# System prompt sent with every OpenAI request.
# NOTE(review): the prompt spells the label 'Propoganda', while the seed datasets'
# Label column uses 'Propaganda'. Confirm how downstream scoring compares model
# output against the labels before changing the prompt text.
system_message = "You are on a social media platform's moderation team. "
system_message += "Your task is to decide whether the following tweet contains extremist content or not.\n"
system_message += "Extremist content includes messages that are considered to be propoganda, radicalization, or recruitment. "
system_message += "Please label the tweet as 'Propoganda', 'Radicalization', 'Recruitment', or 'None'.\n"
system_message += "Your output must be in JSON format and with the following structure:\n{\"Label\": \"Propoganda\" or \"Radicalization\" or \"Recruitment\" or \"None\"}"

print(system_message)

# Input files are named once so the OpenAI and Perspective sets are always built
# from the same sources.
ISIS_SEED_PATH = 'Seed_MIWS/Seed_Dataset/ISIS_Seed_Complete.csv'
WS_SEED_PATH = 'Seed_MIWS/Seed_Dataset/WS_Seed_Complete.csv'
TWEETS_PATH = "tweets.csv"
TWEETS_LIMIT = 1000  # only the first 1000 unlabeled tweets are used

ISIS_dict_openai = format_eval_openai(ISIS_SEED_PATH, system_message)
WS_dict_openai = format_eval_openai(WS_SEED_PATH, system_message)
tweets_dict_openai = format_eval_openai(TWEETS_PATH, system_message, labels_present=False, limit=TWEETS_LIMIT)

# Perspective attributes requested for every comment.
metrics = {'IDENTITY_ATTACK': {}, 'SEVERE_TOXICITY': {}, 'THREAT': {}}
ISIS_dict_perspective = format_eval_perspective(ISIS_SEED_PATH, metrics)
WS_dict_perspective = format_eval_perspective(WS_SEED_PATH, metrics)
# Fix: this previously called format_eval_openai, which produced chat-message lists
# (with the metrics dict as the system content) instead of Perspective requests.
tweets_dict_perspective = format_eval_perspective(TWEETS_PATH, metrics, labels_present=False, limit=TWEETS_LIMIT)

You are on a social media platform's moderation team. Your task is to decide whether the following tweet contains extremist content or not.
Extremist content includes messages that are considered to be propoganda, radicalization, or recruitment. Please label the tweet as 'Propoganda', 'Radicalization', 'Recruitment', or 'None'.
Your output must be in JSON format and with the following structure:
{"Label": "Propoganda" or "Radicalization" or "Recruitment" or "None"}


In [98]:
# Concatenate the ISIS, WS, and tweet sets (in that order) for each API format.
# Both keys are merged the same way, so inputs and labels stay aligned.
combined_dict_openai = {
    key: ISIS_dict_openai[key] + WS_dict_openai[key] + tweets_dict_openai[key]
    for key in ('Inputs', 'Labels')
}

combined_dict_perspective = {
    key: ISIS_dict_perspective[key] + WS_dict_perspective[key] + tweets_dict_perspective[key]
    for key in ('Inputs', 'Labels')
}

In [99]:
# Sanity check: report how many inputs and labels each combined set holds.
# One print call with a newline separator emits the same lines as separate prints.
print(
    "Combined openAI dictionary:",
    f"Inputs shape: {len(combined_dict_openai['Inputs'])}",
    f"Labels shape: {len(combined_dict_openai['Labels'])}",
    "",
    "Combined Perspective dictionary:",
    f"Inputs shape: {len(combined_dict_perspective['Inputs'])}",
    f"Labels shape: {len(combined_dict_perspective['Labels'])}",
    sep="\n",
)

Combined openAI dictionary:
Inputs shape: 1398
Labels shape: 1398

Combined Perspective dictionary:
Inputs shape: 1398
Labels shape: 1398


In [100]:
# Serialize both combined dictionaries (4-space indent) and write each one to its
# own .json file in the current working directory.
import json

json_file_openai = json.dumps(combined_dict_openai, indent=4)
json_file_perspective = json.dumps(combined_dict_perspective, indent=4)

with open("eval_data_openai.json", "w") as openai_out:
    openai_out.write(json_file_openai)

with open("eval_data_perspective.json", "w") as perspective_out:
    perspective_out.write(json_file_perspective)

In [101]:
# Spot-check the first OpenAI input: a [system message, user message] pair.
print(combined_dict_openai['Inputs'][0])

[{'role': 'system', 'content': 'You are on a social media platform\'s moderation team. Your task is to decide whether the following tweet contains extremist content or not.\nExtremist content includes messages that are considered to be propoganda, radicalization, or recruitment. Please label the tweet as \'Propoganda\', \'Radicalization\', \'Recruitment\', or \'None\'.\nYour output must be in JSON format and with the following structure:\n{"Label": "Propoganda" or "Radicalization" or "Recruitment" or "None"}'}, {'role': 'user', 'content': 'Coalition planes massacred these children in airstrikes\nOn #Hit ,#Anbar\nhttp://t.co/yCsEgkwDY6#Iraq'}]


In [102]:
# Spot-check the first Perspective input: a request dict with 'comment' and 'requestedAttributes'.
print(combined_dict_perspective['Inputs'][0])

{'comment': {'text': 'Coalition planes massacred these children in airstrikes\nOn #Hit ,#Anbar\nhttp://t.co/yCsEgkwDY6#Iraq'}, 'requestedAttributes': {'IDENTITY_ATTACK': {}, 'SEVERE_TOXICITY': {}, 'THREAT': {}}}
