# Function to communicate with OpenAI API

In [17]:
import os
import openai
import csv
import json
import pandas as pd

def _parse_openai_content(content):
    """Parse the raw completion text into ``[tweet_nr, radicalness, analysis]`` rows.

    The text is cut at every "Analysis:" marker; a segment is kept only when it
    contains exactly one "Radicalness:" marker. In the text before that marker
    the second word is taken as the tweet number (surrounding punctuation
    removed) and the remaining words as the analysis. From the text after the
    marker the leading number is taken as the score; when there is no leading
    number the raw text is returned unchanged.

    Args:
        content: message text returned by the chat completion.

    Returns:
        A list of ``[tweet_nr, tweet_radicalness, tweet_analysis]`` lists; all
        three items are strings.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    rows = []
    # Join lines with a space (not "") so words on adjacent lines are not glued together.
    flat = content.replace("\n", " ")
    for part in flat.split("Analysis:"):
        subparts = part.split("Radicalness:")
        if len(subparts) != 2:
            continue
        words = subparts[0].split()
        if len(words) < 2:
            # No "Tweet <nr>" prefix to identify the tweet: skip instead of raising IndexError.
            continue
        # Drop punctuation so "7:" or "'7'" is still recognised as tweet number 7.
        tweet_nr = words[1].strip(":.,;#'\"()[]")
        tweet_analysis = " ".join(words[2:])
        score_text = subparts[1].strip()
        # Keep only the leading number, so trailing text such as an echoed
        # "Tweet 8: ..." line or a "/100" suffix does not spoil the score.
        leading_number = re.match(r"\d+(?:\.\d+)?", score_text)
        tweet_radicalness = leading_number.group(0) if leading_number else score_text
        rows.append([tweet_nr, tweet_radicalness, tweet_analysis])
    return rows


def get_openai_response(tweets, temperature=0.3, max_tokens=4000):
    """Ask the OpenAI chat model to rate the radical intent of a batch of tweets.

    Args:
        tweets: list of strings, one per tweet. The few-shot examples in the
            system prompt use the form "Tweet <nr>: '<text>'".
        temperature: sampling temperature passed to the API.
        max_tokens: ``max_tokens`` value passed to the API.

    Returns:
        A tuple ``(return_data, token_cnt)``. ``return_data`` is a list of
        ``[tweet_nr, tweet_radicalness, tweet_analysis]`` lists of strings
        parsed from the reply; ``token_cnt`` is ``usage.total_tokens`` from the
        API response. For an empty ``tweets`` list no request is made and
        ``([], 0)`` is returned.
    """
    # nothing to rate: avoid a pointless API call
    if not tweets:
        return [], 0

    # set api key: prefer the OPENAI_API_KEY environment variable, otherwise fall
    # back to the key that is for large part stored in a text file
    # NOTE(review): the key suffix below is hard-coded in source; consider rotating
    # the key and supplying it only through the environment variable.
    open_ai_api_key = os.environ.get("OPENAI_API_KEY")
    if not open_ai_api_key:
        with open('C:/openaiapikey.txt', 'r') as file:
            open_ai_api_key = file.readline().strip() + "9izgJtucT6"

    # build system prompt
    system_prompt = "I'm an AI trained to classify and rate the radical intent in social media posts.\n"
    system_prompt += "Given a set of tweets, I provide an analysis for each and rate the radical intent of the writer on a scale from 0 (no radical intent) to 100 (extreme radical intent).\n"
    system_prompt += "It's important to note that news reports, even about violent events, are not necessarily an indicator of radical intent of the writer.\n"
    system_prompt += "Here are some examples of how I do this:\n"

    system_prompt += "Example 1:\n"
    system_prompt += "Tweet 23: 'Kill all Americans, they are pigs and must all die'\n"
    system_prompt += "Analysis: Tweet 23 advocates violence and harm against a particular group of people, which is a key indicator of radical intent.\n"
    system_prompt += "Radicalness: 100\n"

    system_prompt += "Example 2:\n"
    system_prompt += "Tweet 48: 'я ненавижу всех европейцев'\n"
    system_prompt += "Analysis: Tweet 48 translates to 'I hate all Europeans'; it does not directly advocate for violence, but it does express a strong negative sentiment towards a broad group of people, which can be seen as a mild form of radical intent.\n"
    system_prompt += "Radicalness: 50\n"

    system_prompt += "Example 3:\n"
    system_prompt += "Tweet 88: 'breakingnews car bomb targeted military personnel near military barracks istanbul turkey pkk move twitterkurds'\n"
    system_prompt += "Analysis: Tweet 88 includes violent content, but its tone and presentation are similar to a news headline, and it seems to be meant to inform rather than to radicalize.\n"
    system_prompt += "Radicalness: 5\n"

    system_prompt += "Example 4:\n"
    system_prompt += "Tweet 832: 'isis claims responsibility ied attack west algeria resulted killing two soldiers wounding others'\n"
    system_prompt += "Analysis: Tweet 832 looks like a news report, indicating very low radical intent.\n"
    system_prompt += "Radicalness: 5\n"

    # NOTE(review): unlike the other examples, the tweet line of Example 5 carries no
    # number while its analysis refers to "Tweet 5"; left unchanged so the prompt stays
    # identical to the one used for earlier runs.
    system_prompt += "Example 5:\n"
    system_prompt += "Tweet: 'rt amaqagency islamic state fighters advance deirezzor city captured regimes last pos sina ah last month'\n"
    system_prompt += "Analysis: Tweet 5 reports on the Islamic State's advances, similar to a news headline. Although it deals with violent events, it does not advocate for violence or express radical intent, but merely reports events.\n"
    system_prompt += "Radicalness: 10\n"

    system_prompt += "Example 6:\n"
    system_prompt += "Tweet 12365: 'al battar engl leading destruction entirely killing wounding crew may allah praised caliphate news'\n"
    system_prompt += "Analysis: Tweet 12365 advocates for the destruction and killing of a group of people, and praises the actions of a particular group, indicating a high level of radical intent.\n"
    system_prompt += "Radicalness: 90\n"

    system_prompt += "Example 7:\n"
    system_prompt += "Tweet 12366: 'trump kill muslims bullets dipped pig blood video https co'\n"
    system_prompt += "Analysis: Tweet 12366 advocates for violence against a specific religious group, using derogatory language, indicating a high level of radical intent.\n"
    system_prompt += "Radicalness: 95\n"

    # build user prompt: one instruction line followed by one line per tweet
    user_prompt = "Now, analyze the " + str(len(tweets)) + " tweets below. Response with two lines per tweet, starting with 'Analysis:' and 'Radicalness:'.\n"
    user_prompt += "".join(tweet + "\n" for tweet in tweets)

    # build the message array for the request
    msg_array = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # perform the request
    openai.api_key = open_ai_api_key
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=msg_array,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # process the response
    token_cnt = response['usage']['total_tokens']
    return_data = _parse_openai_content(response['choices'][0]['message']['content'])

    # return
    return return_data, token_cnt



# Main script

In [None]:
import os
import pandas as pd
import time

# Main driver: read the tweets that still need a rating from the input .csv, send
# them to OpenAI in batches, and append the results to the output Excel file.
# The Excel file doubles as a checkpoint: tweet numbers already present in it are
# skipped, so an interrupted run can simply be restarted.

# set the maximum nr of tweets to process in this run (for testing)
max_cnt = 3305
batch_size = 10

# read the input .csv
filename_in = 'Twitter Group3.csv'
filename_out = 'twitter_group3.xlsx'
tweets_dict = {}
cnt = 0

# Check if the output file already exists
if os.path.exists(filename_out):
    df = pd.read_excel(filename_out)
    processed_tweets = df['tweet nr'].values.tolist()
    print(f"Skipping the first {len(processed_tweets)} tweets, because we already have results for them")
else:
    df = pd.DataFrame(columns=['rater', 'tweet nr', 'tweet', 'label', 'AI score', 'AI analysis'])
    processed_tweets = []

# set-based lookup: the membership test below runs once per csv row
processed_lookup = set(processed_tweets)

# newline='' is what the csv module asks for when opening files for csv.reader/DictReader
with open(filename_in, 'r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # stop reading as soon as the quota for this run is filled
        if cnt >= max_cnt:
            break
        # the tweet number lives in the column with the empty header
        tweet_number = int(row[''])
        if tweet_number not in processed_lookup:
            tweets_dict[tweet_number] = row['tweets']
            cnt += 1

# print info about the upcoming processing
print(f"Going to process {len(tweets_dict)} tweets in batches of {batch_size}")

# set who is going to rate which rows (the numbers specify the first and last tweet nr for each rater)
rater_dict = {'eleazar' : ( 9920, 10581),
              'md abd'  : (10582, 11242),
              'md ari'  : (11243, 11903),
              'pramod'  : (11904, 12564),
              'ronald'  : (12565, 13225)}

# loop over the tweets and feed them to OpenAI
tweet_numbers = list(tweets_dict.keys())
token_sum = 0
for i in range(0, len(tweet_numbers), batch_size):
    # build list with the next batch of tweets (or all remaining ones if fewer than batch_size left)
    tweet_subset = tweet_numbers[i:i+batch_size]
    tweets = [f"Tweet {tweet_number}: '{tweets_dict[tweet_number]}'" for tweet_number in tweet_subset]
    # get openai response
    results, token_cnt = get_openai_response(tweets, temperature=0.5, max_tokens=2000)
    token_sum += int(token_cnt)
    # index the results by tweet number once per batch (first result wins on duplicates)
    results_by_nr = {}
    for tup in results:
        if tup[0].isdigit():
            results_by_nr.setdefault(int(tup[0]), tup)
    # loop over the tweets of this batch and collect one output row per tweet
    batch_rows = []
    for tweet_nr in tweet_subset:
        # find the AI result for this tweet; fall back to a placeholder if the model skipped it
        result = results_by_nr.get(int(tweet_nr), [tweet_nr, -1, 'N/A'])
        # make sure the score is numeric (convert things like 'N/A' to -1)
        try:
            score = float(result[1])
        except (TypeError, ValueError):
            score = -1
        # assign rater based on tweet number
        rater = next((r for r, (start, end) in rater_dict.items() if start <= tweet_nr <= end), 'unknown')
        batch_rows.append({'rater': rater, 'tweet nr': tweet_nr, 'tweet': tweets_dict[tweet_nr], 'label': '', 'AI score': score, 'AI analysis': result[2]})
        # display tweets that were rated as high radicalness
        # (plain float comparison: int() would raise on NaN/inf scores)
        if score >= 65:
            print(f"\nT = {tweets_dict[tweet_nr]}")
            print(f"A = {result[2]}")
            print(f"R = {score}")
    # append the whole batch at once and checkpoint to disk once per batch instead of
    # rewriting the Excel file for every single tweet
    df = pd.concat([df, pd.DataFrame(batch_rows)], ignore_index=True)
    df.to_excel(filename_out, index=False)
    # report the real number of processed tweets (the last batch may be smaller than batch_size)
    processed_so_far = min(i + batch_size, len(tweet_numbers))
    print(f"=> processed {processed_so_far} tweets so far; total cost = {token_sum} tokens (${(token_sum/1000)*0.002:.2f})")
    # pause between requests; no need to wait after the final batch
    if processed_so_far < len(tweet_numbers):
        time.sleep(10)

Skipping the first 2460 tweets, because we already have results for them
Going to process 846 tweets in batches of 10
=> processed 10 tweets so far; total cost = 1188 tokens ($0.00)
=> processed 20 tweets so far; total cost = 2354 tokens ($0.00)

T = isis amaqagency damascus isis fighters take control arubah checkpoint cursedavidcameron alkhayr https co
A = advocates for the control of a checkpoint by ISIS fighters and includes derogatory language, indicating a high level of radical intent.
R = 80.0
=> processed 30 tweets so far; total cost = 3529 tokens ($0.01)
=> processed 40 tweets so far; total cost = 4678 tokens ($0.01)
=> processed 50 tweets so far; total cost = 5917 tokens ($0.01)

T = rt afp murders force bangladeshi gay rights activists underground https co
A = reports on violence against a particular group of people, which is a key indicator of radical intent.
R = 80.0
=> processed 60 tweets so far; total cost = 7132 tokens ($0.01)
=> processed 70 tweets so far; total cost = 