# ChatGPT: Data collection via OpenAI API

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import os
from datetime import datetime
from tqdm import tqdm
import json

#### Prepare annotated data

In [None]:
# held-out part of the train/test split, converted to plain Python lists
# (allow_pickle=True lets numpy unpickle object arrays; only load files you trust)
X_test, y_test = (
    np.load(path, allow_pickle=True).tolist()
    for path in ("data/train_test/X_test.npy", "data/train_test/y_test.npy")
)

In [None]:
# lower-cased copy of every test query; stored as `q_match` in the next cell
X_test_c = [query.lower() for query in X_test]

In [None]:
# one row per test query: original text, its label and the lower-cased matching key
test_columns = {"X_test": X_test, "y_test": y_test, "q_match": X_test_c}
test_df = pd.DataFrame(test_columns)
test_df.shape

In [None]:
# all annotations
# (the merge further down only uses the `q_match` and `short_date` columns)
labels = pd.read_csv("data/annotations.csv")
#labels.head()

In [None]:
# add date
# Left join on the lower-cased query text, so every test row is kept even when
# no annotation date is found for it.
# NOTE(review): repeated `q_match` values in the annotations would duplicate test
# rows; compare the shape shown here with `test_df.shape`.
date_lookup = labels[['q_match', 'short_date']]
merged = test_df.merge(date_lookup, how='left', on='q_match')
merged.shape

In [None]:
# parse the annotation dates once, then derive the "<Month> <Year>" text used in the prompts
# (the month name produced by %B follows the active locale)
parsed_dates = pd.to_datetime(merged['short_date'])
merged['short_date'] = parsed_dates
merged['date_strings'] = parsed_dates.dt.strftime('%B %Y')

In [None]:
## ChatGPT knowledge cutoff = September 2021 (last day of that month, at midnight)
cutoff_date = pd.Timestamp('2021-09-30')

In [None]:
# Flag every query as dated on/before or after the knowledge cutoff.
# Rows without a date (NaT) compare as False and therefore end up in 'after'.
on_or_before_cutoff = merged['short_date'] <= cutoff_date
merged['cutoff'] = np.where(on_or_before_cutoff, 'before', 'after')
merged['cutoff'].value_counts(dropna=False)

In [None]:
# keep only the queries dated on or before the cutoff; the prompts that mention a date use this subset
before_mask = merged["cutoff"].eq('before')
merged_dates = merged.loc[before_mask]

## Predict using ChatGPT

In [None]:
import time
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential

In [None]:
# Take the key from the OPENAI_API_KEY environment variable when it is set, so the
# secret never has to be typed into (and saved with) the notebook. The placeholder
# stays as the fallback and can still be replaced by hand.
openai.api_key = os.environ.get("OPENAI_API_KEY", "INSERT_YOUR_API_KEY")

In [None]:
# adds a delay to Completion API call
def delayed_completion(delay_in_seconds: float = 1, **kwargs):
    """Wait ``delay_in_seconds``, then issue one chat-completion request.

    Every remaining keyword argument is forwarded unchanged to
    ``openai.ChatCompletion.create`` and its response is returned as-is.
    The pause happens before each call, which throttles the request rate.
    """
    time.sleep(delay_in_seconds)
    completion = openai.ChatCompletion.create(**kwargs)
    return completion

In [None]:
# retries the API call with exponential backoff
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    # Re-raise the last error once all attempts are used up. The previous
    # `retry_error_callback` lambda received tenacity's retry state (never an
    # exception), so the function silently returned False instead of a response.
    reraise=True,
)
def call_openai_api_with_backoff(prompt, delay):
    """Send ``prompt`` as a single user message and return the raw API response.

    The request goes to ``gpt-3.5-turbo`` with temperature 0.2 and is preceded by
    a pause of ``delay`` seconds (see ``delayed_completion``).

    Any exception triggers a retry (tenacity's default condition), with randomised
    exponential waits capped at 60 seconds and at most 6 attempts in total.

    Raises:
        Exception: whatever the final attempt raised, once retries are exhausted.
    """
    response = delayed_completion(
        delay_in_seconds=delay,
        model="gpt-3.5-turbo",
        temperature=0.2,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response

In [None]:
# answer extraction
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    # Re-raise the last error once all attempts are used up. The previous
    # `retry_error_callback` lambda received tenacity's retry state (never an
    # exception), so the function silently returned False instead of a response.
    reraise=True,
)
def call_openai_api_with_backoff2(prompt, answer, delay):
    """Ask the model to condense its earlier reasoning into a yes/no answer.

    The conversation replayed to ``gpt-3.5-turbo`` (temperature 0.2) consists of
    the original ``prompt`` (user), the model's earlier ``answer`` (assistant) and
    a fixed follow-up sentence (user). The request is preceded by a pause of
    ``delay`` seconds (see ``delayed_completion``).

    Any exception triggers a retry (tenacity's default condition), with randomised
    exponential waits capped at 60 seconds and at most 6 attempts in total.

    Raises:
        Exception: whatever the final attempt raised, once retries are exhausted.
    """
    followup = "Therefore, the answer (yes or no) is"
    conversation = [
        {'role': 'user', 'content': prompt},
        {'role': 'assistant', 'content': answer},
        {'role': 'user', 'content': followup},
    ]
    response = delayed_completion(
        delay_in_seconds=delay,
        model="gpt-3.5-turbo",
        temperature=0.2,
        messages=conversation,
    )
    return response

In [None]:
def gpt_labels_delayed(sq, date, delay, full_prompt):
    """Label one search query with two consecutive ChatGPT requests.

    The first request sends ``full_prompt`` and collects the model's reasoning;
    the second replays that exchange and asks for the final yes/no answer.

    Args:
        sq: the search query; stored in the result and shown in failure messages.
        date: date text belonging to the query; stored in the result unchanged.
        delay: seconds to pause before each API request.
        full_prompt: the complete prompt text for the first request.

    Returns:
        A dict with the keys ``sq``, ``date``, ``message``, ``finish_reason``,
        ``message2`` and ``finish_reason2``. If either request fails, the four
        response fields are ``None``. If the first answer is empty, ``message``
        and ``finish_reason`` are kept and the follow-up fields are ``None``.
    """
    # Row returned whenever something goes wrong, so every query yields a record
    # with the same keys (needed for the JSON lines file and the DataFrame).
    output = {'sq': sq,
              'date': date,
              'message': None,
              'finish_reason': None,
              'message2': None,
              'finish_reason2': None}

    # reasoning extraction
    try:
        response = call_openai_api_with_backoff(full_prompt, delay)
        message = response['choices'][0]['message']['content']
        finish_reason = response['choices'][0]['finish_reason']
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops a long labelling run.
        print("First API request failed after multiple retries or timed out for search query:", sq)
        return output

    if not message:
        # Nothing to follow up on. Return a complete record instead of falling
        # through and implicitly returning None.
        output['message'] = message
        output['finish_reason'] = finish_reason
        return output

    # answer extraction
    try:
        response2 = call_openai_api_with_backoff2(full_prompt, message, delay)
        message2 = response2['choices'][0]['message']['content']
        finish_reason2 = response2['choices'][0]['finish_reason']
    except Exception:
        print("Second API request failed after multiple retries or timed out for search query:", sq)
        # As before, the first answer is dropped too when the follow-up fails.
        return output

    output.update(message=message,
                  finish_reason=finish_reason,
                  message2=message2,
                  finish_reason2=finish_reason2)
    return output

In [None]:
## request budget per minute (max = 3000?)
SECONDS_PER_MINUTE = 60.0
rate_limit_per_minute = 3000
# pause (in seconds) placed before every single API request to stay within that budget
delay = SECONDS_PER_MINUTE / rate_limit_per_minute

## No dates

In [None]:
# long definition, no date

In [None]:
long_nodate = []

# make sure the output folder exists before any API call is spent
os.makedirs('data/chatgpt_labels', exist_ok=True)

# Long definition appended to every prompt. It is the same for all queries, so it
# is built once. The paragraph break before "A search term ..." is written as an
# explicit \n: a raw line break inside a quoted string is not valid Python.
explain = (
    "\nPolitical or news-related search queries are defined as seeking information contributing to opinion formation on political and societal topics. This includes search queries about (international and national) political actors (e.g., political parties, politicians), elections, policy, political events (e.g., statements from political actors), news media (e.g., nos.nl, talk shows or programs that focus on societal themes (e.g., Op1, Boos, Zembla) (e.g., RTL Nieuws, NOS journaal vandaag) or figures in these media (e.g., Tim Hofman)."
    "\nIt also includes search queries seeking out general information or news about societal themes (e.g., climate change, immigration, COVID-19, LGBT+, racism, crime, economy, war, etc.), but excludes those about practical information about these themes (e.g., checking pension benefits, getting vaccinated). In cases where it is unclear whether the search term is seeking general information or news about societal themes or practical information, follow the following rule: If the searcher's intention can be interpreted as interested in finding news or information about societal themes as well as practical, then answer yes (e.g., wait time for booster shot, easing of restrictions in France). If the search term can only be interpreted as seeking practical information, then answer no (e.g., vaccination line for Jansen, I want to get vaccinated)."
    "\nA political or news-related search query can also be related to current events about political or societal themes (e.g., COVID-19, train strikes, The Voice schandal)."
    "\nPolitical and news-related search queries are not about (natural) disasters (e.g., earthquakes, accidents), entertainment news (e.g., celebrities, fashion, gadgets, food), sports, culture (e.g., music radio, festivals), unless  when they concern policy related to these themes. "
    "\nA search term is never political or news-related when it concerns, for example, practical information (e.g., how long can you wear contact lenses, temperature tomorrow), shopping (e.g. IKEA Malm) or health."
)

# `total` gives tqdm a length (zip has none), so progress and ETA are shown
for sq, date in tqdm(zip(merged.X_test.tolist(), merged.date_strings.tolist()), total=len(merged), miniters=1):
    # prompt (the date is not mentioned in this variant; it is only stored with the result)
    prompt = f'Is the search query "{sq}" political or news-related when searched in the Netherlands (yes or no)? Answer yes or no for whether the search query is political or news-related. Give your reasoning.'
    full_prompt = prompt + explain

    # predict
    output = gpt_labels_delayed(sq, date, delay, full_prompt)

    # Write the result immediately as one JSON object per line, so finished queries
    # survive an interrupted run. Append mode also means re-running adds to the file.
    with open('data/chatgpt_labels/long_nodate.json', 'a') as file:
        json.dump(output, file)
        file.write('\n')

    # keep the result in memory for the CSV export
    long_nodate.append(output)

In [None]:
# store all collected results of this variant as one table (one row per query)
long_nodate_df = pd.DataFrame(long_nodate)
long_nodate_df.to_csv("data/chatgpt_labels/long_nodate.csv", index=False)

In [None]:
# show the prompt from the last loop iteration of the most recently run loop cell, as a check of the template
print(full_prompt)

In [None]:
# short definition, no date

In [None]:
short_nodate = []

# make sure the output folder exists before any API call is spent
os.makedirs('data/chatgpt_labels', exist_ok=True)

# `total` gives tqdm a length (zip has none), so progress and ETA are shown
for sq, date in tqdm(zip(merged.X_test.tolist(), merged.date_strings.tolist()), total=len(merged), miniters=1):
    # prompt: short definition only (the date is not mentioned; it is only stored with the result)
    prompt = f'Is the search query "{sq}" political or news-related when searched in the Netherlands (yes or no)? Political or news-related search queries are defined as seeking information contributing to opinion formation on political and societal topics. Give your reasoning.'

    # predict
    output = gpt_labels_delayed(sq, date, delay, prompt)

    # Write the result immediately as one JSON object per line, so finished queries
    # survive an interrupted run. Append mode also means re-running adds to the file.
    with open('data/chatgpt_labels/short_nodate.json', 'a') as file:
        json.dump(output, file)
        file.write('\n')

    # keep the result in memory for the CSV export
    short_nodate.append(output)

In [None]:
# store all collected results of this variant as one table (one row per query)
short_nodate_df = pd.DataFrame(short_nodate)
short_nodate_df.to_csv("data/chatgpt_labels/short_nodate.csv", index=False)

In [None]:
# show the prompt from the last loop iteration of the most recently run loop cell, as a check of the template
print(prompt)

## Dates

In [None]:
# long definition, with date

In [None]:
long_date = []

# make sure the output folder exists before any API call is spent
os.makedirs('data/chatgpt_labels', exist_ok=True)

# Long definition appended to every prompt. It is the same for all queries, so it
# is built once. The paragraph break before "A search term ..." is written as an
# explicit \n: a raw line break inside a quoted string is not valid Python.
explain = (
    "\nPolitical or news-related search queries are defined as seeking information contributing to opinion formation on political and societal topics. This includes search queries about (international and national) political actors (e.g., political parties, politicians), elections, policy, political events (e.g., statements from political actors), news media (e.g., nos.nl, talk shows or programs that focus on societal themes (e.g., Op1, Boos, Zembla) (e.g., RTL Nieuws, NOS journaal vandaag) or figures in these media (e.g., Tim Hofman)."
    "\nIt also includes search queries seeking out general information or news about societal themes (e.g., climate change, immigration, COVID-19, LGBT+, racism, crime, economy, war, etc.), but excludes those about practical information about these themes (e.g., checking pension benefits, getting vaccinated). In cases where it is unclear whether the search term is seeking general information or news about societal themes or practical information, follow the following rule: If the searcher's intention can be interpreted as interested in finding news or information about societal themes as well as practical, then answer yes (e.g., wait time for booster shot, easing of restrictions in France). If the search term can only be interpreted as seeking practical information, then answer no (e.g., vaccination line for Jansen, I want to get vaccinated)."
    "\nA political or news-related search query can also be related to current events about political or societal themes (e.g., COVID-19, train strikes, The Voice schandal)."
    "\nPolitical and news-related search queries are not about (natural) disasters (e.g., earthquakes, accidents), entertainment news (e.g., celebrities, fashion, gadgets, food), sports, culture (e.g., music radio, festivals), unless  when they concern policy related to these themes. "
    "\nA search term is never political or news-related when it concerns, for example, practical information (e.g., how long can you wear contact lenses, temperature tomorrow), shopping (e.g. IKEA Malm) or health."
)

# only queries dated on or before the cutoff; `total` gives tqdm a length (zip has none)
for sq, date in tqdm(zip(merged_dates.X_test.tolist(), merged_dates.date_strings.tolist()), total=len(merged_dates), miniters=1):
    # prompt: this variant tells the model when the query was searched
    prompt_date = f'Is the search query "{sq}" political or news-related when searched in {date} in the Netherlands (yes or no)? Answer yes or no for whether the search query is political or news-related. Give your reasoning.'
    full_prompt = prompt_date + explain

    # predict
    output = gpt_labels_delayed(sq, date, delay, full_prompt)

    # Write the result immediately as one JSON object per line, so finished queries
    # survive an interrupted run. Append mode also means re-running adds to the file.
    with open('data/chatgpt_labels/long_date.json', 'a') as file:
        json.dump(output, file)
        file.write('\n')

    # keep the result in memory for the CSV export
    long_date.append(output)

In [None]:
# store all collected results of this variant as one table (one row per query)
long_date_df = pd.DataFrame(long_date)
long_date_df.to_csv("data/chatgpt_labels/long_date.csv", index=False)

In [None]:
# show the prompt from the last loop iteration of the most recently run loop cell, as a check of the template
print(full_prompt)

In [None]:
# short definition, with date

In [None]:
short_date = []

# make sure the output folder exists before any API call is spent
os.makedirs('data/chatgpt_labels', exist_ok=True)

# only queries dated on or before the cutoff; `total` gives tqdm a length (zip has none)
for sq, date in tqdm(zip(merged_dates.X_test.tolist(), merged_dates.date_strings.tolist()), total=len(merged_dates), miniters=1):
    # prompt: short definition, and the model is told when the query was searched
    prompt = f'Is the search query "{sq}" political or news-related when searched in {date} in the Netherlands (yes or no)? Political or news-related search queries are defined as seeking information contributing to opinion formation on political and societal topics. Give your reasoning.'

    # predict
    output = gpt_labels_delayed(sq, date, delay, prompt)

    # Write the result immediately as one JSON object per line, so finished queries
    # survive an interrupted run. Append mode also means re-running adds to the file.
    with open('data/chatgpt_labels/short_date.json', 'a') as file:
        json.dump(output, file)
        file.write('\n')

    # keep the result in memory for the CSV export
    short_date.append(output)

In [None]:
# show the prompt from the last loop iteration of the most recently run loop cell, as a check of the template
print(prompt)

In [None]:
# store all collected results of this variant as one table (one row per query)
short_date_df = pd.DataFrame(short_date)
short_date_df.to_csv("data/chatgpt_labels/short_date.csv", index=False)