# Pipeline for labeling

Very first step — process the DataFrame: decide which columns to keep.

- Provide the DataFrame as an argument (do not create it inside the function). Name each test run using the date in MMDD format (e.g. 0412; no slash, since the date is used in file names). Provide the directory for storing the data.
- Provide background knowledge for the system and prompt for the user.
- Record all responses and add each response to the "chatgpt_output" column for each data entry in the original DataFrame after the "TYPE" column.
- Save this DataFrame locally, naming it MMDD + "_" + testing number (1 if only one test is run) + "_raw.csv".
- Extract the confidence number for each category from the response and store each category in its own column. Create the "chatgpt_label" column containing the category with the highest value. Then, create a "consistency" column (T/F) by comparing the values in the "TYPE" and "chatgpt_label" columns.
- Check for any missing values in the "chatgpt_label" column or any other transposed columns. Print the result.
- Save this processed DataFrame locally, naming it MMDD + "_" + testing number (1 if only one test is run) + "_clean.csv".
- Calculate the performance for each category and print it.

In [2]:
import pandas as pd
import re
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, f1_score
from openai import OpenAI
import os

# Shared OpenAI client used by label_data. The key is read from the OPENAI_API_KEY
# environment variable; if it is unset the literal "YOUROPENAIKEY" is passed instead
# (presumably a placeholder to be replaced with a real key — requests will not
# authenticate with it).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUROPENAIKEY"))

In [None]:
def label_data(df, directory, background_knowledge, prompt_template, index):
    """
    Label every row of a DataFrame with the OpenAI chat API, save the results, and print per-category metrics.

    For each row a prompt is built from all columns except 'TYPE', sent to the model, and the raw reply is
    stored in a 'chatgpt_output' column. Three files are written to ``directory`` (MMDD is today's date):

        MMDD_<index>_responses.txt  - the raw replies, one after another
        MMDD_<index>_raw.csv        - the input data plus 'chatgpt_output'
        MMDD_<index>_clean.csv      - the raw data plus one confidence column per category,
                                      'chatgpt_label' (highest-confidence category; the first one wins a tie)
                                      and 'consistency' ('TYPE' == 'chatgpt_label')

    Args:
        df (pd.DataFrame): Data to categorize, one profile/post per row. Must contain a 'TYPE' column holding
                           the ground-truth category. An existing 'chatgpt_output' column is replaced.
        directory (str): Directory where the output files are saved. It is created if it does not exist.
        background_knowledge (str): Text sent as the system message of every request.
        prompt_template (str): Template for the user message. It is filled with
                               ``prompt_template.format(entry=...)``, so it must contain an ``{entry}`` placeholder.
        index: Test-run number; it is embedded in the output file names.

    Returns:
        None: The 'chatgpt_output' column is added to the input DataFrame in place; the parsed columns only
              appear in the saved "clean" CSV. Nothing is returned.

    Raises:
        OSError: If there are issues with writing files to the specified directory.
        ValueError: If the DataFrame has no 'TYPE' column.
    """
    # Fail fast: 'TYPE' is only needed at the very end, so check it before paying for any API call.
    if 'TYPE' not in df.columns:
        raise ValueError("DataFrame must contain a 'TYPE' column.")

    if 'chatgpt_output' in df.columns:
        df.drop(columns=['chatgpt_output'], inplace=True)

    os.makedirs(directory, exist_ok=True)

    # Build every prompt up front so a malformed template is reported before the first request is sent.
    prompts = [
        prompt_template.format(
            entry="\n".join(f"{column}: {value}" for column, value in row.items() if column != 'TYPE')
        )
        for _, row in df.iterrows()
    ]

    # Results are written by position rather than by index label, so a DataFrame with a
    # non-unique index (e.g. after a concat) still gets exactly one reply per row.
    df['chatgpt_output'] = None
    output_col = df.columns.get_loc('chatgpt_output')

    responses = []
    for position, prompt in enumerate(prompts):
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": background_knowledge},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=100,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # The message content may be None; normalise it so joining and parsing below cannot fail.
        chatgpt_output = response.choices[0].message.content or ""
        responses.append(chatgpt_output)
        df.iat[position, output_col] = chatgpt_output

    today = datetime.now().strftime("%m%d")
    responses_filename = os.path.join(directory, f"{today}_{index}_responses.txt")
    # Replies can contain non-ASCII characters; do not depend on the platform default encoding.
    with open(responses_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(responses))

    raw_filename = os.path.join(directory, f"{today}_{index}_raw.csv")
    df.to_csv(raw_filename, index=False)

    # Process responses
    categories = ['academia', 'layperson', 'activist', 'politician', 'think tank', 'social media influencer', 'news organizations and news workers'] # CHANGE HERE IF WE HAVE NEW CATEGORIES
    parsed = [extract_confidence(text, categories) for text in df['chatgpt_output']]
    confidences = pd.DataFrame(parsed, columns=categories)
    confidences.index = df.index
    # A category that was never parsed comes back as an all-None object column; make every
    # column numeric so the row-wise maximum below is well defined.
    for category in categories:
        confidences[category] = pd.to_numeric(confidences[category], errors='coerce')
    df = pd.concat([df, confidences], axis=1)

    # Missing confidences are filled with -1 (below any real 0-100 value) so idxmax never sees an
    # all-NaN row; rows where nothing at all was parsed are then left without a label.
    scores = df[categories]
    has_confidence = scores.notna().any(axis=1)
    df['chatgpt_label'] = scores.fillna(-1).idxmax(axis=1).where(has_confidence)
    df['consistency'] = df['TYPE'] == df['chatgpt_label']
    clean_filename = os.path.join(directory, f"{today}_{index}_clean.csv")
    df.to_csv(clean_filename, index=False)

    # Check for missing values
    missing_data = df[categories + ['chatgpt_label']].isnull().any()
    print("Missing values in each category:", missing_data)

    # Calculate performance. Rows without a label are scored as a separate "unlabeled" class:
    # they count as misses for their true category instead of making the metrics fail on NaN.
    predicted = df['chatgpt_label'].astype(object).fillna('unlabeled')
    precision = precision_score(df['TYPE'], predicted, labels=categories, average=None)
    recall = recall_score(df['TYPE'], predicted, labels=categories, average=None)
    f1 = f1_score(df['TYPE'], predicted, labels=categories, average=None)

    results_df = pd.DataFrame({
        'Category': categories,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

    print(results_df)

def extract_confidence(text, categories):
    """
    Pull the confidence number reported for each category out of a model reply.

    A category is recognised as ``<category>: <digits>`` (case-insensitive); whitespace around the
    colon is optional and a trailing ``%`` is ignored. Only the first occurrence of each category is used.

    Args:
        text: The model reply. Anything that is not a string (e.g. None, or NaN after a CSV
              round trip) is treated as a reply in which no category was found.
        categories (list[str]): Category names to look for; they are matched literally.

    Returns:
        pd.Series: Indexed by category, holding the parsed integer, or None where the category
                   does not appear in ``text``.
    """
    # A missing reply has nothing to parse; report every category as not found instead of raising.
    if not isinstance(text, str):
        return pd.Series({category: None for category in categories})

    extracted_values = {}
    for category in categories:
        # re.escape keeps category names containing regex metacharacters from changing the pattern.
        pattern = rf"{re.escape(category)}\s*:\s*(\d+)"
        match = re.search(pattern, text, re.IGNORECASE)
        extracted_values[category] = int(match.group(1)) if match else None
    return pd.Series(extracted_values)

def re_evaluate_performance(file_path):
    """
    Reloads a processed DataFrame and computes precision, recall, and F1 score for the classification categories.

    Rows whose 'chatgpt_label' is missing are scored as a separate "unlabeled" class, so they count
    as misses for their true category instead of making the metric computation fail.

    Args:
    file_path (str): Path to the CSV file containing the processed DataFrame.

    Returns:
    pd.DataFrame: A DataFrame containing the performance metrics, one row per category with the
                  columns 'Category', 'Precision', 'Recall' and 'F1 Score'.

    Raises:
    ValueError: If the 'TYPE' or 'chatgpt_label' column, or any category column, is missing.
    """
    # Load the DataFrame
    df = pd.read_csv(file_path)

    # Ensure 'TYPE' and 'chatgpt_label' columns are present for calculation
    if 'TYPE' not in df.columns or 'chatgpt_label' not in df.columns:
        raise ValueError("DataFrame must contain 'TYPE' and 'chatgpt_label' columns.")

    # Define the categories
    categories = ['academia', 'layperson', 'activist', 'politician', 'think tank', 'social media influencer', 'news organizations and news workers'] # CHANGE HERE IF ANY

    # Check if all category columns exist
    missing_columns = [col for col in categories if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns for categories: {missing_columns}")

    # An empty label is read back from CSV as NaN; mixing NaN with string labels breaks the
    # sklearn metrics, so replace it with an explicit placeholder class first.
    predicted = df['chatgpt_label'].astype(object).fillna('unlabeled')

    # Calculate performance metrics
    precision = precision_score(df['TYPE'], predicted, labels=categories, average=None)
    recall = recall_score(df['TYPE'], predicted, labels=categories, average=None)
    f1 = f1_score(df['TYPE'], predicted, labels=categories, average=None)

    # Prepare the results DataFrame
    results_df = pd.DataFrame({
        'Category': categories,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

    print(results_df)
    # Return the metrics table, as documented (the reloaded data was returned here by mistake).
    return results_df

def print_data(df, background_knowledge, prompt_template):
    """
    Print, for every row of ``df``, the text that would be sent to the model, without calling the API.

    Each row is rendered as "column: value" lines (the 'TYPE' and 'chatgpt_output' columns are
    left out), inserted into ``prompt_template`` and printed after ``background_knowledge``.

    Args:
        df (pd.DataFrame): Data to preview, one entry per row. It is not modified.
        background_knowledge (str): Text printed directly in front of each prompt.
        prompt_template (str): Template filled with ``prompt_template.format(entry=...)``; it must
                               contain an ``{entry}`` placeholder.

    Returns:
        None
    """
    # 'chatgpt_output' is skipped while formatting rather than dropped from df: a preview
    # must not destroy replies that were already collected for this DataFrame.
    excluded = {'TYPE', 'chatgpt_output'}

    for _, row in df.iterrows():
        formatted_entry = "\n".join(
            f"{column}: {value}" for column, value in row.items() if column not in excluded
        )
        prompt = prompt_template.format(entry=formatted_entry)
        print(background_knowledge + prompt)


In [None]:
# Example of labelling samples.
# NOTE(review): df_clean_tk and df_clean_smi are not defined in the visible part of this file;
# presumably they are prepared in an earlier cell — confirm before running this one.

# Test-run number; label_data embeds it in the names of the files it writes.
index = 1

# Directory where the labeled data will be stored
output_directory = "data"

# Background knowledge: label_data sends this as the system message of every request.
background_knowledge = """
You are a social media expert. 
"""

# Template for the prompts. label_data and print_data fill it with
# prompt_template.format(entry=...), so it must keep the {entry} placeholder.
# NOTE(review): the text below starts with a stray "" line, refers to a variable dictionary
# that is not actually included, and repeats the "Note one account may fulfill..." sentence —
# confirm whether this is intended before relying on the results.
prompt_template = """
""
Here is a dictionary of a series of variables with their meaning, learn then first and use them in the following categorization of a tweet account:

Here is a data entry of a twitter post. Recall the previous dictionary then classify the tweet account into [“academia”, “activist”, “layperson”, “politician", “think tank”, “social media influencer”, “news organizations and news workers”]. Note one account may fulfill multiple categories and only return your confidence level (from 0-100) for each category. You have to decide which category  is more dominant if there is a tie.

Note one account may fulfill multiple categories and only return your confidence level (from 0-100) for each category. Your response must be strictly formatted as "category: %", such as "academia: 50%".

Entry:
{entry}
""" 

# Calls the API for every row of df_clean_tk and writes the responses/raw/clean files to output_directory.
label_data(df_clean_tk, output_directory, background_knowledge, prompt_template, index)

# Only prints the prompts that would be sent for df_clean_smi; no API call is made.
print_data(df_clean_smi, background_knowledge, prompt_template)