# Setup

In [None]:
import pandas as pd
import json
import ast
from openai import OpenAI

# Extract separate questions and answers

In [None]:
# Specify personal OpenAI key.
# Leave the string empty to fall back to the OPENAI_API_KEY environment
# variable (the SDK reads it when api_key is None), so the secret does not
# have to be stored in the notebook.
OPENAI_API_KEY = ""
client = OpenAI(
  api_key=OPENAI_API_KEY or None
)

In [None]:
# Function to extract questions from text possibly containing multiple questions
def extract_questions_answers(questions, answers, model="gpt-5-mini"):
  """Ask the chat model to split a Q&A block into scored [question, answer] pairs.

  Args:
    questions: Question transcript text; interpolated verbatim into the prompt.
    answers: Answer transcript text; interpolated verbatim into the prompt.
    model: Name of the chat-completions model to call.

  Returns:
    The raw text of the first completion choice. The prompt requests a Python
    list literal of [question, answer, score] items, but the reply is returned
    unparsed and unvalidated. An empty string is returned when the reply
    carries no text content, so callers can always treat the result as a str.
  """
  # Prompt to instruct the model
  prompt = f"""
  Extract each distinct question from the following question transcript.
  Keep multi-sentence questions grouped together so that the supporting context remains intact.

  Then extract the answers given to the extracted questions from the following answer transcript.
  Keep multi-sentence answers grouped together so that the supporting context remains intact.
  If a question is not answered, state 'NOT ANSWERED'.

  For each [Question, Answer] pair, also assign a numerical score between 0 and 1 with 1 decimal for how directly the answer addresses the question where a score of 1 means the question was fully answered and a score of 0 means the question was fully avoided.

  Return your response in **strict Python list format**:

  [
    ['question 1', 'answer 1', answer_score],
    ['question 2', 'answer 2', answer_score],
    ...
  ]

  Question transcript:
  {questions}

  Answer transcript:
  {answers}
  """

  # Call the OpenAI chat model (uses the module-level `client`)
  response = client.chat.completions.create(
      model=model,
      messages=[
          {"role": "system", "content": "You are an assistant that extracts and matches questions and answers from Q&A transcripts, and checks to which extent the question was answered or avoided."},
          {"role": "user", "content": prompt}
      ]
  )

  # The message content may be None; normalise it so that callers which
  # immediately use string methods on the result do not crash.
  content = response.choices[0].message.content
  return content if content is not None else ""

In [None]:
# Function to check question orientation and theme
def check_question_properties(question, model="gpt-5-nano"):
  """Ask the chat model to classify a question by orientation and theme.

  Args:
    question: Question text; interpolated verbatim into the prompt.
    model: Name of the chat-completions model to call.

  Returns:
    The raw text of the first completion choice. The prompt requests a Python
    list literal of the form ['orientation', 'theme'], but the reply is
    returned unparsed and unvalidated. An empty string is returned when the
    reply carries no text content, so the result is always a str.
  """
  prompt = f"""
  Classify the following question along two dimensions:

  1. Orientation:
      - 'Past' → asking about past or current performance
      - 'Future' → asking about guidance, expectations, or outlook
      - 'Mixed' → contains both past and future elements

  2. Theme:
      Choose one from the following:
      - 'Profitability'
      - 'Capital & Liquidity'
      - 'Macro & Geopolitical influences'
      - 'Regulatory & Legal'
      - 'Risk management'
      - 'Technology & Innovation'
      - 'Sustainability'
      - 'Strategy & Management'
      - 'Other' (if none apply)

  Return your response in **strict Python list format**:
  ['orientation', 'theme']

  Question:
  {question}
  """

  # Call the OpenAI chat model (uses the module-level `client`)
  response = client.chat.completions.create(
      model=model,
      messages=[
          {"role": "system", "content": "You are an assistant that classifies financial Q&A questions."},
          {"role": "user", "content": prompt}
      ]
  )

  # The message content may be None; normalise it to an empty string so the
  # return type is always str.
  content = response.choices[0].message.content
  return content if content is not None else ""

In [None]:
# Load the Q&A blocks CSV into a DataFrame
qa_blocks_path = "citi_Q&A_blocks_2013_2025_FINAL.csv"
all_qas = pd.read_csv(qa_blocks_path)

In [None]:
# Select the current batch: rows whose year is greater than 2022 and at most 2023
year_values = all_qas["year"]
in_batch = (year_values <= 2023) & (year_values > 2022)
filtered_qas = all_qas[in_batch]

# The batch to process downstream
qa_pairs = filtered_qas

print(qa_pairs)

      bank  year quarter  tag question_speaker  analyst_firm  \
1888  Citi  2023      Q1  NaN     Glenn Schorr           NaN   
1889  Citi  2023      Q1  NaN     Glenn Schorr           NaN   
1890  Citi  2023      Q1  NaN        Mike Mayo           NaN   
1891  Citi  2023      Q1  NaN        Mike Mayo           NaN   
1892  Citi  2023      Q1  NaN    Betsy Graseck           NaN   
...    ...   ...     ...  ...              ...           ...   
1969  Citi  2023      Q4   FY     Vivek Juneja           NaN   
1970  Citi  2023      Q4   FY     Vivek Juneja           NaN   
1971  Citi  2023      Q4   FY    Steven Chubak           NaN   
1972  Citi  2023      Q4   FY    Steven Chubak           NaN   
1973  Citi  2023      Q4   FY        Mike Mayo           NaN   

                                               question  \
1888  Hi, thank you, a simple one. I appreciate the ...   
1889  I appreciate that. Maybe if I could follow-up ...   
1890  Hi, Jane, I challenged you a couple earnings c..

In [None]:
# Extract all questions and answers
def _clean_llm_output(raw_text):
  """Normalise a raw model reply into a one-line string for ast.literal_eval.

  Handles a missing reply (None) and strips a surrounding Markdown code fence.
  Text starting with backticks can never be a valid Python literal, so removing
  the fence only affects replies that would otherwise fail to parse.
  """
  text = (raw_text or "").strip()

  if text.startswith("```"):
    opening, newline, body = text.partition("\n")
    # With a newline, the first line is the fence (plus an optional language
    # tag) and is dropped entirely; without one, only the backticks can go.
    text = body if newline else opening
    text = text.strip().strip("`")

  # Collapse newlines into spaces and trim surrounding whitespace
  return " ".join(text.splitlines()).strip()


result_list = []

# Columns copied unchanged from the source row into every extracted record
metadata_columns = ["bank", "year", "quarter", "tag", "question_speaker", "analyst_firm"]
total_blocks = len(qa_pairs)

# For all extracted pairs
for i, (_, row) in enumerate(qa_pairs.iterrows()):  # iterrows yields (index, Series)
  question_text = row['question']
  answer_text = row['answer']

  LLM_output_str = _clean_llm_output(extract_questions_answers(question_text, answer_text))

  try:
    # Safely evaluate the string to a Python object (no code execution)
    LLM_output_list = ast.literal_eval(LLM_output_str)
  except (ValueError, SyntaxError, TypeError) as e:
    # literal_eval may also raise TypeError on malformed input; catching it
    # keeps one bad reply from aborting the batch before the CSV is written.
    print(f"Could not parse LLM output string: {LLM_output_str} - Error: {e}")
    continue

  if not isinstance(LLM_output_list, list):
    print(f"LLM output is not a list: {LLM_output_str}")
    continue

  extracted_count = 0
  for item in LLM_output_list:
    # Each item must be a sequence holding at least question, answer and score
    if isinstance(item, (list, tuple)) and len(item) >= 3:
      record = {column: row[column] for column in metadata_columns}
      record.update({
          "extracted_question": item[0],
          "original_question_text": question_text,
          "answer_speaker": row['answer_speaker'],
          "extracted_answer": item[1],
          "original_answer_text": answer_text,
          "answer_score": item[2],
      })
      result_list.append(record)
      extracted_count += 1
    else:
      print(f"Skipping invalid item in LLM output: {item}")

  # Report only the pairs that were actually stored, not the skipped ones
  print(f"Processed block: {i+1} of {total_blocks}, extracted {extracted_count} questions and answers.")

  if not LLM_output_list:
    print("No questions or answers found for following texts:")
    print(question_text)
    print(answer_text)

# Create DataFrame from the list of results
result_df = pd.DataFrame(result_list)

# Save as csv
result_df.to_csv("qna_data.csv", index=False, sep=";")

Processed block: 1 of 86, extracted 1 questions and answers.
Processed block: 2 of 86, extracted 1 questions and answers.
Processed block: 3 of 86, extracted 1 questions and answers.
Processed block: 4 of 86, extracted 3 questions and answers.
Processed block: 5 of 86, extracted 1 questions and answers.
Processed block: 6 of 86, extracted 1 questions and answers.
Processed block: 7 of 86, extracted 1 questions and answers.
Processed block: 8 of 86, extracted 1 questions and answers.
Processed block: 9 of 86, extracted 1 questions and answers.
Processed block: 10 of 86, extracted 1 questions and answers.
Processed block: 11 of 86, extracted 2 questions and answers.
Processed block: 12 of 86, extracted 1 questions and answers.
Processed block: 13 of 86, extracted 1 questions and answers.
Processed block: 14 of 86, extracted 2 questions and answers.
Processed block: 15 of 86, extracted 1 questions and answers.
Processed block: 16 of 86, extracted 1 questions and answers.
Processed block: 

In [None]:
# Label every extracted question with its orientation and theme, storing the
# labels in new columns of result_df
for row_label, record in result_df.iterrows():
  raw_reply = check_question_properties(record["extracted_question"])

  try:
    labels = ast.literal_eval(raw_reply)
    is_label_pair = isinstance(labels, list) and len(labels) == 2

    if not is_label_pair:
      print(f"Skipping invalid item in LLM output: {labels}")
    else:
      # First element is the orientation, second the theme
      orientation, theme = labels
      result_df.loc[row_label, "orientation"] = orientation
      result_df.loc[row_label, "theme"] = theme
      print(f"Processed question {row_label+1}")

  except (ValueError, SyntaxError) as e:
    print(f"Could not parse LLM output string: {raw_reply} - Error: {e}")

# Write the labelled data to the same CSV file
result_df.to_csv("qna_data.csv", sep=";", index=False)

Processed question 1
Processed question 2
Processed question 3
Processed question 4
Processed question 5
Processed question 6
Processed question 7
Processed question 8
Processed question 9
Processed question 10
Processed question 11
Processed question 12
Processed question 13
Processed question 14
Processed question 15
Processed question 16
Processed question 17
Processed question 18
Processed question 19
Processed question 20
Processed question 21
Processed question 22
Processed question 23
Processed question 24
Processed question 25
Processed question 26
Processed question 27
Processed question 28
Processed question 29
Processed question 30
Processed question 31
Processed question 32
Processed question 33
Processed question 34
Processed question 35
Processed question 36
Processed question 37
Processed question 38
Processed question 39
Processed question 40
Processed question 41
Processed question 42
Processed question 43
Processed question 44
Processed question 45
Processed question 