<a href="https://colab.research.google.com/github/marfrlv/gpt-propaganda-detection/blob/main/gpt_api_propaganda_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Libraries

In [None]:
! pip install openai



In [None]:
# necessary libraries
from openai import OpenAI
import pandas as pd
import os
from io import StringIO
import io
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# 2. Data preparation

In [None]:
# shared path prefixes
thesis_root = '/content/drive/MyDrive/thesis'
corpus_root = f'{thesis_root}/semeval_2020_task_11_corpus'

# CSV files with the annotations
train_df_path = f'{corpus_root}/train_labels_task2_with_spans.csv'
dev_df_path = f'{corpus_root}/dev_labels_df_with_spans.csv'

# folders holding the article files
train_art_fold = f'{corpus_root}/train-articles'
dev_art_fold = f'{corpus_root}/dev-articles'

# folder holding the prompts
prompts_fold_path = f'{thesis_root}/prompts'

# extra examples used in the few-shot condition
add_ex_path = f'{corpus_root}/additional_examples_for_chatgpt.csv'

In [None]:
# The few-shot examples were taken from articles of this dataset, so those
# articles have to be left out of the experiment.
add_ex_df = pd.read_csv(filepath_or_buffer=add_ex_path)
add_ex_df.head(n=5)

Unnamed: 0,id,technique,beginning_offset,ending_offset,text_span
0,776373795,Appeal_to_Authority,4205,4572,"In his tweets, Trump also quoted criticism of ..."
1,789370909,Appeal_to_Authority,9135,9210,I believe her because she is telling the truth...
2,765197039,Appeal_to_Authority,2881,2962,"Muhammad advises the same thing, according to ..."
3,728758697,Doubt,1512,1584,Would a Republican get a pass on meeting with ...
4,725824328,Doubt,4001,4406,"Reading Obama’s 1995 memoir, you might almost ..."


In [None]:
# sanity check: each technique is expected to have 3 additional examples
technique_column = add_ex_df['technique']
value_counts = technique_column.value_counts()
value_counts

technique
Appeal_to_Authority                   3
Doubt                                 3
Repetition                            3
Appeal_to_fear-prejudice              3
Slogans                               3
Black-and-White_Fallacy               3
Loaded_Language                       3
Flag-Waving                           3
Name_Calling,Labeling                 3
Whataboutism,Straw_Men,Red_Herring    3
Causal_Oversimplification             3
Exaggeration,Minimisation             3
Bandwagon,Reductio_ad_hitlerum        3
Thought-terminating_Cliches           3
Name: count, dtype: int64

In [None]:
# ids of the articles the few-shot examples were taken from
unique_ids = add_ex_df['id'].unique()
unique_ids_l = unique_ids.tolist()

# articles already used while tuning the procedure (exclusion currently disabled)
#trial_ids = [788056108, 769962328, 738207834, 786527921, 728972961]
#for id in trial_ids:
  #unique_ids_l.append(id)

# string versions of the ids, for comparison with ids taken from file names
unique_ids_l_str = list(map(str, unique_ids_l))

print(f"There are {len(unique_ids_l_str)} articles to ignore while conducting the experiment.")

There are 37 articles to ignore while conducting the experiment.


In [None]:
# training articles set
def extract_articles_with_ids(train_art_fold, unique_ids_l_str):
    """Read every ``.txt`` file of a folder except those with an excluded id.

    The id of a file is the part of its name before the first ``'.'``.

    Args:
        train_art_fold: path of the folder that holds the article files.
        unique_ids_l_str: ids (as strings) of the articles to leave out.

    Returns:
        A tuple ``(articles, article_ids)`` of two parallel lists: the text of
        each kept article and its id, ordered by file name.
    """
    excluded_ids = set(unique_ids_l_str)  # constant-time membership checks
    articles = []
    article_ids = []

    # os.listdir returns names in arbitrary order; sorting makes the position
    # of each article in the result reproducible between runs and machines.
    for file_name in sorted(os.listdir(train_art_fold)):
        if not file_name.endswith('.txt'):
            continue
        file_id = file_name.split('.')[0]
        if file_id in excluded_ids:
            continue
        file_path = os.path.join(train_art_fold, file_name)
        # explicit encoding: the default one depends on the platform locale
        with open(file_path, 'r', encoding='utf-8') as file:
            articles.append(file.read())
        article_ids.append(file_id)

    return articles, article_ids

# run the extraction on the training folder and unpack the two parallel lists
train_art_output = extract_articles_with_ids(train_art_fold, unique_ids_l_str)
articles, article_ids = train_art_output

print(f'{len(articles)} articles and {len(article_ids)} IDs have been extracted.')

334 articles and 334 IDs have been extracted.


In [None]:
# development articles set
# os.listdir returns names in arbitrary order; sort them so the order in which
# the articles are appended is reproducible
d_files = sorted(os.listdir(dev_art_fold))

# ids collected so far: lets this cell be re-run without appending the
# development articles a second time
known_ids = set(article_ids)

# text and id are appended in the same pass so both lists stay aligned
for file_name in d_files:
    if not file_name.endswith('.txt'):
        continue
    file_id = file_name.split('.')[0]  # part of the file name before the first '.'
    if file_id in known_ids:
        continue
    file_path = os.path.join(dev_art_fold, file_name)
    # explicit encoding: the default one depends on the platform locale
    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()
    articles.append(file_content)
    article_ids.append(file_id)
    known_ids.add(file_id)

In [None]:
# prefix every article text with its id ("<id>: <text>") so ChatGPT can extract the id during processing
ids_texts_combined = [
    f"{article_id}: {article_body}"
    for article_id, article_body in zip(article_ids, articles)
]

# 446 (371 training + 75 development) - 37 (training excluded) = 409
print(f'There are {len(ids_texts_combined)} items in the final dataset.')

There are 409 items in the final dataset.


# 3. ChatGPT API

## 3.1. One-shot condition


In [None]:
# sends the given prompt together with one article to the API and returns ChatGPT's answer
def detect_propaganda_with_chatgpt(prompt, article_text, temperature, model="***", client=None):
    """Ask a chat model to analyse one article and return its reply.

    The API key is no longer written in the source. When no client is passed,
    ``OpenAI()`` is created without arguments and reads the key from the
    ``OPENAI_API_KEY`` environment variable, so set that variable before
    calling (e.g. ``os.environ["OPENAI_API_KEY"] = getpass.getpass()``).

    Args:
        prompt: instructions for the model, sent as the system message.
        article_text: text of the article, sent as the user message after the
            prefix ``"Here is the article:"``.
        temperature: sampling temperature forwarded to the API.
        model: name of the model to query; replace the ``"***"`` placeholder
            with the model needed.
        client: optional, already constructed client exposing
            ``chat.completions.create``; lets callers reuse one client for
            many requests.

    Returns:
        The content of the first choice of the completion.
    """
    if client is None:
        client = OpenAI()  # key is taken from the OPENAI_API_KEY environment variable

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"""Here is the article:{article_text}"""},
        ],
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [None]:
# NOTE(review): the API-calling cells of this notebook pass `base_one_shot_prompt`,
# `cot_one_shot_prompt`, `base_few_shot_prompt` and `cot_few_shot_prompt`; none of these
# names is assigned in the visible cells, and `prompt` itself is not passed to any call —
# confirm where the four prompts are defined before running from a fresh kernel.
prompt = "..." # insert the prompt for the model (what you want it to do)

In [None]:
# Base prompt
# One-shot condition (one illustrating example)
# Temperature: 0
outputs_folder = "/content/drive/MyDrive/thesis/base_prompt_one_shot/chatgpt_outputs_temp0"

# open() does not create missing folders, so make sure the target exists
os.makedirs(outputs_folder, exist_ok=True)

for i, text in enumerate(tqdm(ids_texts_combined, desc="ChatGPT is detecting propaganda...")):
    file_name = os.path.join(outputs_folder, f"base_one-shot_output_{i+1}.txt")  # one file per article, numbered from 1
    # A full pass takes hours: answers already saved by an earlier (interrupted) run are kept
    # instead of being requested again. Empty the folder to force a fresh run, e.g. after
    # changing the prompt.
    if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
        continue
    output = detect_propaganda_with_chatgpt(base_one_shot_prompt, text, 0)
    # explicit encoding: the default one depends on the platform locale
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(str(output))

ChatGPT is detecting propaganda...: 100%|██████████| 409/409 [2:30:14<00:00, 22.04s/it]


In [None]:
# CoT prompt
# One-shot condition (one illustrating example)
# Temperature: 0
outputs_folder = "/content/drive/MyDrive/thesis/CoT_prompt_one_shot/temperature_0/chatgpt_outputs_temp0"

# open() does not create missing folders, so make sure the target exists
os.makedirs(outputs_folder, exist_ok=True)

for i, text in enumerate(tqdm(ids_texts_combined, desc="ChatGPT is detecting propaganda...")):
    # NOTE(review): there is no '_' between "temp0" and the running number here, unlike the
    # few-shot file names; left unchanged because files with this naming may already exist —
    # confirm before renaming.
    file_name = os.path.join(outputs_folder, f"cot_one-shot_output_temp0{i+1}.txt")  # one file per article, numbered from 1
    # A full pass takes hours: answers already saved by an earlier (interrupted) run are kept
    # instead of being requested again. Empty the folder to force a fresh run, e.g. after
    # changing the prompt.
    if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
        continue
    output = detect_propaganda_with_chatgpt(cot_one_shot_prompt, text, 0)
    # explicit encoding: the default one depends on the platform locale
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(str(output))

ChatGPT is detecting propaganda...: 100%|██████████| 409/409 [2:57:23<00:00, 26.02s/it]


## 3.2. Few-shot condition

In [None]:
# Base prompt
# Few-shot condition (four illustrating examples)
# Temperature: 0
outputs_folder = "/content/drive/MyDrive/thesis/base_prompt_few_shot/temperature_0/chatgpt_outputs_temp0"

# open() does not create missing folders, so make sure the target exists
os.makedirs(outputs_folder, exist_ok=True)

for i, text in enumerate(tqdm(ids_texts_combined, desc="ChatGPT is detecting propaganda...")):
    file_name = os.path.join(outputs_folder, f"base_few-shot_output_temp0_{i+1}.txt")  # one file per article, numbered from 1
    # A full pass takes hours: answers already saved by an earlier (interrupted) run are kept
    # instead of being requested again. Empty the folder to force a fresh run, e.g. after
    # changing the prompt.
    if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
        continue
    output = detect_propaganda_with_chatgpt(base_few_shot_prompt, text, 0)
    # explicit encoding: the default one depends on the platform locale
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(str(output))

ChatGPT is detecting propaganda...: 100%|██████████| 409/409 [2:41:11<00:00, 23.65s/it]


In [None]:
# CoT prompt
# Few-shot condition (four illustrating examples)
# Temperature: 0
outputs_folder = "/content/drive/MyDrive/thesis/CoT_prompt_few_shot/temperature_0/chatgpt_outputs_temp0"

# open() does not create missing folders, so make sure the target exists
os.makedirs(outputs_folder, exist_ok=True)

for i, text in enumerate(tqdm(ids_texts_combined, desc="ChatGPT is detecting propaganda...")):
    file_name = os.path.join(outputs_folder, f"cot_few-shot_output_temp0_{i+1}.txt")  # one file per article, numbered from 1
    # A full pass takes hours: answers already saved by an earlier (interrupted) run are kept
    # instead of being requested again. Empty the folder to force a fresh run, e.g. after
    # changing the prompt.
    if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
        continue
    output = detect_propaganda_with_chatgpt(cot_few_shot_prompt, text, 0)
    # explicit encoding: the default one depends on the platform locale
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(str(output))

ChatGPT is detecting propaganda...: 100%|██████████| 409/409 [2:34:03<00:00, 22.60s/it]
