# Pipeline for labeling

First step: preprocess the DataFrame and decide which columns to keep.

- Provide the DataFrame (load it outside the function and pass it in as an argument). Name each test run using the date format (MM/DD). Provide the directory for storing the data.
- Provide background knowledge for the system and prompt for the user.
- Record all responses and add each response to the "chatgpt_output" column for each data entry in the original DataFrame after the "TYPE" column.
- Save this DataFrame locally, naming it "MM/DD" + testing number (1 if only one testing is done) + "raw".
- Extract the confidence number for each category and spread the categories into separate columns (one column per category). Create the "chatgpt_label" column holding the category with the highest value. Then create a "consistency" column (T/F) by comparing the values in the "TYPE" and "chatgpt_label" columns.
- Check for any missing values in the "chatgpt_label" column or any other transposed columns. Print the result.
- Save this processed DataFrame locally, naming it "MM/DD" + testing number (1 if only one testing is done) + "clean".
- Calculate the performance for each category and print it.

In [None]:
import pandas as pd
import re
import numpy as np
from datetime import datetime
from openai import OpenAI
import os
import ast

# Shared OpenAI client used by label_data(). The key is read from the
# OPENAI_API_KEY environment variable; if it is unset, the placeholder
# string "YOUR_API_KEY" is passed instead.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"))

In [None]:
# Load the author table from the current working directory and evaluate its
# (rows, columns) shape as the last expression of the cell.
# NOTE(review): a later cell reads 'data/unique_authors_latest.csv' instead —
# confirm which location is the current one.
data = pd.read_csv("unique_authors_latest.csv")
data.shape

In [14]:
def label_data(df, directory, background_knowledge, prompt_template, index):
    """Label each row of ``df`` with ChatGPT confidence scores and save the results.

    Every row is rendered as ``column: value`` lines, inserted into
    ``prompt_template`` through its ``{entry}`` placeholder and sent to the
    ``gpt-4o-mini`` chat model via the module-level ``client``. The raw reply
    is stored in a ``chatgpt_output`` column, the per-category confidences are
    parsed with ``extract_confidence`` and the category with the highest
    confidence becomes ``chatgpt_label``.

    Files written to ``directory`` (``MMDD`` is the current month and day):
      * ``MMDD_<index>_backup_<n>.csv`` after every 100 processed rows
      * ``MMDD_<index>_responses.txt`` with all raw replies
      * ``MMDD_<index>_raw.csv`` and ``MMDD_<index>_clean.csv``
      * ``process_log.txt`` (appended to)

    Args:
        df: DataFrame with one account per row. It is not modified; the
            function works on a copy. An existing ``chatgpt_output`` column
            is ignored and replaced.
        directory: Output directory; created if it does not exist.
        background_knowledge: Text sent as the system message.
        prompt_template: User-message template containing ``{entry}``.
        index: Identifier of the run/part, used in the output file names.

    Returns:
        A new DataFrame holding the input columns plus ``chatgpt_output``,
        one column per category and ``chatgpt_label`` (NaN for rows where no
        confidence could be parsed).
    """
    # Work on copies so the caller's frame (often a slice of a larger frame)
    # is never mutated, and so a stale 'chatgpt_output' column cannot leak
    # into the prompt text.
    features = df.drop(columns=['chatgpt_output'], errors='ignore')
    df = features.copy()
    df['chatgpt_output'] = None
    output_position = df.columns.get_loc('chatgpt_output')

    os.makedirs(directory, exist_ok=True)
    responses = []
    log_filename = os.path.join(directory, "process_log.txt")

    # Count processed rows by position: index labels are no longer 0..n-1
    # once the frame has been filtered or split into parts.
    for processed, (_, row) in enumerate(features.iterrows(), start=1):
        formatted_entry = "\n".join(f"{column}: {value}" for column, value in row.items())
        prompt = prompt_template.format(entry=formatted_entry)

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": background_knowledge},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=100,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # `content` may be None; store an empty string so that joining and
        # parsing the replies later cannot fail after all requests were paid for.
        chatgpt_output = response.choices[0].message.content or ""
        responses.append(chatgpt_output)
        df.iloc[processed - 1, output_position] = chatgpt_output

        # Backup dataframe every 100 requests
        if processed % 100 == 0:
            today = datetime.now().strftime("%m%d")
            backup_filename = os.path.join(directory, f"{today}_{index}_backup_{processed}.csv")
            df.to_csv(backup_filename, index=False)

            # Update log file
            with open(log_filename, "a", encoding="utf-8") as log_file:
                log_file.write(f"Processed {processed} entries as of {datetime.now()}\n")

    today = datetime.now().strftime("%m%d")
    responses_filename = os.path.join(directory, f"{today}_{index}_responses.txt")
    # Replies can contain emoji and other non-ASCII text, so fix the encoding.
    with open(responses_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(responses))

    raw_filename = os.path.join(directory, f"{today}_{index}_raw.csv")
    df.to_csv(raw_filename, index=False)

    # Process responses: one numeric column per category. Building the frame
    # from a list keeps the category columns present even when df is empty.
    categories = ['news organization and news worker', 'academia', 'political elite', 'others']
    score_rows = [extract_confidence(text, categories) for text in df['chatgpt_output']]
    scores = pd.DataFrame(score_rows, columns=categories)
    scores.index = df.index
    for category in categories:
        scores[category] = pd.to_numeric(scores[category], errors='coerce')
    df = pd.concat([df, scores], axis=1)

    # Pick the highest-scoring category per row (first one wins on ties).
    # Rows without any parsed score get NaN instead of tripping idxmax.
    score_matrix = scores.to_numpy(dtype=float)
    has_score = ~np.isnan(score_matrix).all(axis=1)
    best_labels = np.full(len(df), np.nan, dtype=object)
    if has_score.any():
        winners = np.nanargmax(score_matrix[has_score], axis=1)
        best_labels[has_score] = np.asarray(categories, dtype=object)[winners]
    df['chatgpt_label'] = best_labels

    clean_filename = os.path.join(directory, f"{today}_{index}_clean.csv")
    df.to_csv(clean_filename, index=False)

    # Final log update
    with open(log_filename, "a", encoding="utf-8") as log_file:
        log_file.write(f"Labeling complete. Processed a total of {len(df)} entries as of {datetime.now()}\n")

    return df

def extract_confidence(text, categories):
    """Parse per-category confidence percentages out of a model reply.

    For each category the first occurrence of ``<category>: <digits>`` in
    ``text`` is looked up case-insensitively; an optional ``%`` after the
    digits and optional whitespace around the colon are accepted.

    Args:
        text: Raw reply text. Anything that is not a string (``None``, NaN)
            is treated as a reply without scores.
        categories: Iterable of category names to look for. The names are
            matched literally, so regex metacharacters in them are safe.

    Returns:
        A ``pd.Series`` indexed by category holding the parsed integer
        confidence, or NaN for categories that were not found.
    """
    # A missing reply carries no scores; searching an empty string yields
    # "not found" for every category instead of a TypeError from re.search.
    searchable = text if isinstance(text, str) else ""
    extracted_values = {}
    for category in categories:
        # re.escape keeps characters such as '+', '(' or '.' in a category
        # name from being interpreted as regex syntax.
        pattern = rf"{re.escape(category)}\s*:\s*(\d+)%?"
        match = re.search(pattern, searchable, re.IGNORECASE)
        extracted_values[category] = int(match.group(1)) if match else float("nan")
    return pd.Series(extracted_values)

In [19]:
# Load the author table and keep only the profile fields used for labelling.
file_path = 'data/unique_authors_latest.csv'

df = pd.read_csv(file_path)

columns_to_keep = ['author.verified', 'author.public_metrics.followers_count',
                   'author.username', 'author.name', 'author.created_at',
                   'author.public_metrics.following_count',
                   'author.public_metrics.tweet_count',
                   'author.public_metrics.listed_count', 'author.description',
                   'author.profile_image_url', 'author.entities.url.urls']

# Take an explicit copy: new columns are added to df_clean below, and
# assigning into a column selection of df is a chained assignment
# (SettingWithCopyWarning, with no guarantee about which object is modified).
df_clean = df[columns_to_keep].copy()

default_profile_url = "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png"

# 0 = Twitter's default avatar, 1 = anything else (including a missing URL,
# because NaN never compares equal to the default URL).
df_clean['profile_image_type'] = (
    df_clean['author.profile_image_url'].ne(default_profile_url).astype('int64')
)

def extract_expanded_url(urls):
    """Return the ``expanded_url`` of the first entry of a profile URL list.

    Args:
        urls: Either the string representation of a list of dicts (as stored
            in the CSV), an already parsed list/tuple of dicts, or a missing
            value (NaN/None).

    Returns:
        The ``expanded_url`` value of the first dict, or ``np.nan`` when the
        input is missing, cannot be parsed, is empty, or the first entry has
        no ``expanded_url`` key.
    """
    if isinstance(urls, str):
        try:
            # literal_eval only evaluates Python literals, never arbitrary code.
            urls = ast.literal_eval(urls)
        except (ValueError, SyntaxError):
            # Malformed cell: treat it like a missing URL instead of aborting
            # the whole .apply() over the column.
            return np.nan

    # NaN, None and any other non-sequence value mean "no URL"; an empty list
    # has no first entry to read.
    if not isinstance(urls, (list, tuple)) or not urls:
        return np.nan

    first_entry = urls[0]
    if not isinstance(first_entry, dict):
        return np.nan
    return first_entry.get('expanded_url', np.nan)

# Resolve the profile URL list of every account to its first expanded URL.
expanded_urls = df_clean['author.entities.url.urls'].apply(extract_expanded_url)
df_clean['author.expanded_url'] = expanded_urls

# Drop the accounts that serve as few-shot examples in the prompt, so the
# model is not asked to label accounts it has already been shown.
usernames_to_exclude = [
    'anders_aslund', 'BeckyCNN', 'AaronArdoin2', 'olex_scherba',
    'acgrayling', 'ABC7', 'kiraincongress', 'FukuyamaFrancis',
    'KatrinaNation', 'EricMMatheny', 'AtlanticCouncil', 'ThreshedThought',
]
is_few_shot_example = df_clean['author.username'].isin(usernames_to_exclude)
df_clean = df_clean[~is_few_shot_example]

# Recode boolean values as 1/0 across the whole frame.
df_clean = df_clean.replace({True: 1, False: 0})

  df = pd.read_csv(file_path)
  df_clean = df_clean.replace({True: 1, False: 0})


In [None]:
# Filter the dataframe to get the entries with the specified usernames
# Define the list of usernames to filter out
# usernames_to_exclude = ['anders_aslund', 'BeckyCNN', 'AaronArdoin2', 'olex_scherba', 'acgrayling', 'ABC7','kiraincongress', 'EricMMatheny']
# usernames_to_filter = ['anders_aslund', 'BeckyCNN', 'AaronArdoin2', 'olex_scherba', 'acgrayling', 'ABC7','kiraincongress', 'FukuyamaFrancis', 'KatrinaNation', 'EricMMatheny', 'AtlanticCouncil', 'ThreshedThought']


In [20]:
# 2. Example of labelling samples with sampling trials (09/09).

# Run/test number; label_data() puts it into the names of the files it writes.
# The split loop in the following cell passes its own part number instead.
index = 0

# Directory where the labeled data will be stored
output_directory = "data/test"

# Background knowledge: sent to the model as the system message.
background_knowledge = """
You are a social media expert. 
"""

# Template for the user prompt. label_data() fills the single {entry}
# placeholder through str.format, so any literal curly brace added to this
# text would have to be doubled ({{ / }}).
# The few-shot example accounts embedded below (three per category) are the
# usernames listed in usernames_to_exclude, i.e. they are removed from the
# data that gets labelled.
# NOTE(review): the string starts with a line holding two double quotes ("")
# - presumably a leftover; it is sent to the model as part of the prompt.
prompt_template = """
""
Here is a dictionary of variables with their meanings of a twitter account information. Learn them first and use them in the following categorization of a tweet account:
author.verified: Whether or not the account is verified.
author.username: The Twitter screen name, handle, or alias that this user identifies themselves with. Usernames are unique but subject to change. Typically a maximum of 15 characters long, but some historical accounts may exist with longer names. (on the website, it is seen with the "@" symbol).
author.name: Profile name. NOT the handle which contains "@".
author.created_at: Date and time at which the author account was registered.
author.public_metrics.followers_count: Number of accounts following the author account.
author.public_metrics.following_count: Number of accounts the author is following.
author.public_metrics.tweet_count: Number of tweets an author has published including retweets.
author.public_metrics.listed_count: The number of public lists that this user is a member of.
author.description: User-added description found below the username on the profile screen.
profile_image_type: if this account used customized profile image instead of default setting. 
author.expanded_url: The URL specified in the user's profile, if present. A URL provided by a Twitter user in their profile (not the URLs in the profile description text).

Here is a data entry of a twitter post. Recall the previous dictionary then classify the tweet account into ['news organization and news worker', 'academia', 'political elite', 'others']. Only return your confidence level from 0 (not likely) to 100% (very likely) for each category. You have to decide which category is more dominant if there is a tie. You can refer the definitions of each category here:
news organization and news worker: A “news organization and news worker” user account is operated by an individual or organizational entity involved in news reporting, media production, or journalism-related practices. These accounts focus on disseminating news, providing commentary, and engaging in journalistic activities.
academia: an “academia” user account is operated by an individual or institutional entity affiliated with an academic institution, such as a university or research center. These accounts primarily engage in scholarly communication, research dissemination, academic networking, and public outreach.
political elite: A “political elite” user account is operated by an individual who holds or is running for a public office, engages in political activities, or represents governmental bodies, military, or defense organizations. These accounts focus on political communication, campaigning, and public engagement on policy issues.
others: This category includes accounts not classified as “News Organization and News Worker,” “Academia,” or “Political Elite.” It encompasses average social media users, influencers,  podcasters, industry practitioners, and other types of users.

Here are 3 examples for each category:
academia: [author.verified: 1
author.public_metrics.followers_count: 308905
author.username: anders_aslund
author.name: Anders Åslund
author.created_at: 2014-12-19T19:15:17.000Z
author.public_metrics.following_count: 4215
author.public_metrics.tweet_count: 40711
author.public_metrics.listed_count: 4432
author.description: Economist & author. Russia, Ukraine & Eastern Europe. Read my latest book: "Russia's Crony Capitalism" https://t.co/ZqmWMRSMf9
profile_image_type: 1
author.expanded_url: nan

author.verified: 0
author.public_metrics.followers_count: 148086
author.username: acgrayling
author.name: A C Grayling #FBPE #Reform #Rejoin #FBPR
author.created_at: 2009-06-05T08:15:09.000Z
author.public_metrics.following_count: 24455
author.public_metrics.tweet_count: 169895
author.public_metrics.listed_count: 1233
author.description: Philosopher, Author @NortheasternLdn
profile_image_type: 1
author.expanded_url: http://acgrayling.com

author.verified: 1
author.public_metrics.followers_count: 252559
author.username: FukuyamaFrancis
author.name: Francis Fukuyama
author.created_at: 2011-12-06T23:09:59.000Z
author.public_metrics.following_count: 728
author.public_metrics.tweet_count: 7919
author.public_metrics.listed_count: 2926
author.description: Senior Fellow at Stanford's Freeman Spogli Institute, Director, Ford Dorsey Masters in Intl Policy @StanfordCDDRL Instagram: francis.fukuyama
profile_image_type: 1
author.expanded_url: http://francisfukuyama.com
]

news organization and news worker:[
author.verified: 1
author.public_metrics.followers_count: 138450
author.username: BeckyCNN
author.name: Becky Anderson
author.created_at: 2010-03-04T16:06:24.000Z
author.public_metrics.following_count: 3696
author.public_metrics.tweet_count: 5977
author.public_metrics.listed_count: 905
author.description: Host of @CNNConnect & CNN Abu Dhabi Managing Editor. Join us Mon-Fri for our Emmy-award winning show @CNNi 19:00 UAE | 15:00 London | 10:00 New York
profile_image_type: 1
author.expanded_url: http://www.CNN.com/Connect

author.verified: 1
author.public_metrics.followers_count: 1254946
author.username: ABC7
author.name: ABC7 Eyewitness News
author.created_at: 2008-09-20T04:22:30.000Z
author.public_metrics.following_count: 496
author.public_metrics.tweet_count: 269800
author.public_metrics.listed_count: 6344
author.description: Your #1 breaking news and local news source in Southern California and the greater Los Angeles area. When you witness news, share it with #abc7eyewitness.
profile_image_type: 1
author.expanded_url: http://abc7.com

author.verified: 1
author.public_metrics.followers_count: 178211
author.username: KatrinaNation
author.name: Katrina vandenHeuvel
author.created_at: 2009-07-22T15:47:11.000Z
author.public_metrics.following_count: 44480
author.public_metrics.tweet_count: 108324
author.public_metrics.listed_count: 5241
author.description: The Nation's Editorial Director and Publisher
profile_image_type: 1
author.expanded_url: http://www.thenation.com
]

political elite:[
author.verified: 1
author.public_metrics.followers_count: 264358
author.username: olex_scherba
author.name: olexander scherba🇺🇦
author.created_at: 2010-07-05T09:44:27.000Z
author.public_metrics.following_count: 1178
author.public_metrics.tweet_count: 34418
author.public_metrics.listed_count: 4172
author.description: 26 years in diplomacy. 🇺🇦 Ambassador to Austria (2014-2021). Author of “Ukraine vs Darkness. Undiplomatic Thoughts”. Tweets in 🇬🇧,🇩🇪,🇺🇦.
profile_image_type: 1
author.expanded_url: https://www.amazon.com/Ukraine-vs-Darkness-Undiplomatic-Ukrainian-ebook/dp/B08WKF3B2K

author.verified: 1
author.public_metrics.followers_count: 190182
author.username: kiraincongress
author.name: Kira Rudik
author.created_at: 2019-09-03T07:51:48.000Z
author.public_metrics.following_count: 318
author.public_metrics.tweet_count: 3381
author.public_metrics.listed_count: 1758
author.description: Member of Ukrainian Parliament, Leader of Political party Golos, Vice President of ALDE. #warinukraine #ukraine #ukrainetoday #congresswoman
profile_image_type: 1
author.expanded_url: https://kirarudik.com

author.verified: 1
author.public_metrics.followers_count: 147035
author.username: ThreshedThought
author.name: Mike Martin 🔶
author.created_at: 2011-04-26T14:43:44.000Z
author.public_metrics.following_count: 2364
author.public_metrics.tweet_count: 16747
author.public_metrics.listed_count: 2871
author.description: Parliamentary Candidate for Tunbridge Wells @libdems; Senior Visiting Fellow @warstudies (https://t.co/Tb71X8Cl7t); pre-order book https://t.co/S98EBf3vvN
profile_image_type: 1
author.expanded_url: https://www.mike-martin.co.uk
]

others:[
author.verified: 0
author.public_metrics.followers_count: 590
author.username: AaronArdoin2
author.name: 🇺🇲🗽Aaron Ardoin♿️🇺🇸
author.created_at: 2021-09-20T01:33:18.000Z
author.public_metrics.following_count: 1220
author.public_metrics.tweet_count: 12990
author.public_metrics.listed_count: 0
author.description: Digital Soldier, Wounded Warrior, Freedom-loving Red-blooded Patriot,
Cannabis Connoisseur,  Former Grower, Daily Smoker
profile_image_type: 1
author.expanded_url: nan

author.verified: 0
author.public_metrics.followers_count: 298749
author.username: EricMMatheny
author.name: Eric Matheny 🎙
author.created_at: 2014-08-13T20:18:03.000Z
author.public_metrics.following_count: 146020
author.public_metrics.tweet_count: 221230
author.public_metrics.listed_count: 461
author.description: Attorney • Podcaster • Social Commentator • Cohost of “Bob & Eric Save America” • iTunes: https://t.co/Bvfie93Fnt | Google: https://t.co/Ly748Jbv54
profile_image_type: 1
author.expanded_url: http://Patreon.com/BobAndEric

author.verified: 1
author.public_metrics.followers_count: 194082
author.username: AtlanticCouncil
author.name: Atlantic Council
author.created_at: 2008-12-20T15:34:07.000Z
author.public_metrics.following_count: 2637
author.public_metrics.tweet_count: 71750
author.public_metrics.listed_count: 3496
author.description: Shaping the global future together
profile_image_type: 1
author.expanded_url: https://atlanticcouncil.org/
]

Your response must be strictly formatted as "category: %", such as "academia: 50%".

Entry:
{entry}
""" 

In [21]:
# Split the dataframe into 5 sets
split_dataframes = np.array_split(df_clean, 5)
for i, split_df in enumerate(split_dataframes):
    df_labeled = label_data(split_df, output_directory, background_knowledge, prompt_template, i)
    output_file = f"{output_directory}/labeled_dataset_part_{i + 1}.csv"
    df_labeled.to_csv(output_file, index=False)
    print(f"Labeling complete for part {i + 1}. Labeled dataset saved.")

  return bound(*args, **kwds)


Labeling complete for part 1. Labeled dataset saved.
Labeling complete for part 2. Labeled dataset saved.
Labeling complete for part 3. Labeled dataset saved.
Labeling complete for part 4. Labeled dataset saved.
Labeling complete for part 5. Labeled dataset saved.


In [None]:
# 2. Example of labelling samples (08/04).

# Run/test number; label_data() puts it into the names of the files it writes.
index = 0

# Directory where the labeled data will be stored
output_directory = "data"

# Background knowledge: sent to the model as the system message.
background_knowledge = """
You are a social media expert. 
"""

# Template for the user prompt, with one few-shot example per category.
# label_data() fills the single {entry} placeholder through str.format, so
# any literal curly brace added to this text would have to be doubled
# ({{ / }}).
# NOTE(review): the string starts with a line holding two double quotes ("")
# - presumably a leftover; it is sent to the model as part of the prompt.
prompt_template = """
""
Here is a dictionary of variables with their meanings of a twitter account information. Learn them first and use them in the following categorization of a tweet account:
author.verified: Whether or not the account is verified.
author.username: The Twitter screen name, handle, or alias that this user identifies themselves with. Usernames are unique but subject to change. Typically a maximum of 15 characters long, but some historical accounts may exist with longer names. (on the website, it is seen with the "@" symbol).
author.name: Profile name. NOT the handle which contains "@".
author.created_at: Date and time at which the author account was registered.
author.public_metrics.followers_count: Number of accounts following the author account.
author.public_metrics.following_count: Number of accounts the author is following.
author.public_metrics.tweet_count: Number of tweets an author has published including retweets.
author.public_metrics.listed_count: The number of public lists that this user is a member of.
author.description: User-added description found below the username on the profile screen.
profile_image_type: if this account used customized profile image instead of default setting. 
author.expanded_url: The URL specified in the user's profile, if present. A URL provided by a Twitter user in their profile (not the URLs in the profile description text).

Here is a data entry of a twitter post. Recall the previous dictionary then classify the tweet account into ['news organization and news worker', 'academia', 'political elite', 'others']. Only return your confidence level from 0 (not likely) to 100% (very likely) for each category. You have to decide which category is more dominant if there is a tie. You can refer the definitions of each category here:
news organization and news worker: A “news organization and news worker” user account is operated by an individual or organizational entity involved in news reporting, media production, or journalism-related practices. These accounts focus on disseminating news, providing commentary, and engaging in journalistic activities.
academia: an “academia” user account is operated by an individual or institutional entity affiliated with an academic institution, such as a university or research center. These accounts primarily engage in scholarly communication, research dissemination, academic networking, and public outreach.
political elite: A “political elite” user account is operated by an individual who holds or is running for a public office, engages in political activities, or represents governmental bodies, military, or defense organizations. These accounts focus on political communication, campaigning, and public engagement on policy issues.
others: This category includes accounts not classified as “News Organization and News Worker,” “Academia,” or “Political Elite.” It encompasses average social media users, influencers,  podcasters, industry practitioners, and other types of users.

Here are examples for all the categories:
Academia: [
author.verified: 1
author.public_metrics.followers_count: 308905
author.username: anders_aslund
author.name: Anders Åslund
author.created_at: 2014-12-19T19:15:17.000Z
author.public_metrics.following_count: 4215
author.public_metrics.tweet_count: 40711
author.public_metrics.listed_count: 4432
author.description: Economist & author. Russia, Ukraine & Eastern Europe. Read my latest book: "Russia's Crony Capitalism" https://t.co/ZqmWMRSMf9
profile_image_type: 1
author.expanded_url: nan
]

news organization and news worker:[
author.verified: 1
author.public_metrics.followers_count: 138450
author.username: BeckyCNN
author.name: Becky Anderson
author.created_at: 2010-03-04T16:06:24.000Z
author.public_metrics.following_count: 3696
author.public_metrics.tweet_count: 5977
author.public_metrics.listed_count: 905
author.description: Host of @CNNConnect & CNN Abu Dhabi Managing Editor. Join us Mon-Fri for our Emmy-award winning show @CNNi 19:00 UAE | 15:00 London | 10:00 New York
profile_image_type: 1
author.expanded_url: http://www.CNN.com/Connect
]

political elite:[
author.verified: 1
author.public_metrics.followers_count: 264358
author.username: olex_scherba
author.name: olexander scherba🇺🇦
author.created_at: 2010-07-05T09:44:27.000Z
author.public_metrics.following_count: 1178
author.public_metrics.tweet_count: 34418
author.public_metrics.listed_count: 4172
author.description: 26 years in diplomacy. 🇺🇦 Ambassador to Austria (2014-2021). Author of “Ukraine vs Darkness. Undiplomatic Thoughts”. Tweets in 🇬🇧,🇩🇪,🇺🇦.
profile_image_type: 1
author.expanded_url: https://www.amazon.com/Ukraine-vs-Darkness-Undiplomatic-Ukrainian-ebook/dp/B08WKF3B2K
]

others:[
author.verified: 0
author.public_metrics.followers_count: 590
author.username: AaronArdoin2
author.name: 🇺🇲🗽Aaron Ardoin♿️🇺🇸
author.created_at: 2021-09-20T01:33:18.000Z
author.public_metrics.following_count: 1220
author.public_metrics.tweet_count: 12990
author.public_metrics.listed_count: 0
author.description: Digital Soldier, Wounded Warrior, Freedom-loving Red-blooded Patriot,
Cannabis Connoisseur,  Former Grower, Daily Smoker
profile_image_type: 1
author.expanded_url: nan
]

Your response must be strictly formatted as "category: %", such as "academia: 50%".

Entry:
{entry}
""" 

# Run the labelling for this example; as the last expression of the cell the
# returned DataFrame is not assigned to a variable.
# NOTE(review): df_clean_filtered is not defined in the visible part of this
# notebook — confirm where it is created before running this cell.
label_data(df_clean_filtered, output_directory, background_knowledge, prompt_template, index)

# Disabled debugging call, kept for reference:
#print_data(df_clean_smi, background_knowledge, prompt_template)