# Setup and installs

In [None]:
!pip -q install pypdf nltk spacy scikit-learn matplotlib wordcloud
!pip -q install torch
!pip install transformers==4.49.0 # version needs to be pinned

In [None]:
import nltk, os, re, math, string, json
# Download the NLTK data packages, one after the other
for nltk_resource in ('punkt', 'vader_lexicon', 'punkt_tab'):
    nltk.download(nltk_resource)

In [20]:
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoProcessor
import pandas as pd

# Load PDF

In [2]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import requests
import pdfplumber

# URL for PDF and local file name for the downloaded copy
url = "https://home.barclays/content/dam/home-barclays/documents/investor-relations/ResultAnnouncements/H12025Results/Q225-Q&A-Transcript.pdf"
pdf_file = "barclays_q2_2025_qa.pdf"

# Download PDF. The timeout keeps the cell from waiting forever on a stalled
# connection, and raise_for_status() stops on an HTTP error instead of saving
# the error response under a .pdf name.
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(pdf_file, "wb") as f:
    f.write(response.content)

# Read and extract: collect the text of every page, one trailing newline per
# page; pages for which extract_text() returns nothing are skipped
page_texts = []
with pdfplumber.open(pdf_file) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            page_texts.append(text + "\n")
all_text = "".join(page_texts)

# Save to txt
with open("barclays_q2_2025_qa.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

# Extract separate questions and answers

In [4]:
# Regex to detect a speaker-name line such as "Anna Cross", "C.S. Venkatakrishnan"
# or "Alvaro Serrano, Morgan Stanley".
#
# A matching line consists of:
#   * a first name (capital letter followed by letters, apostrophes or hyphens)
#     or a run of dotted initials,
#   * one to three further capitalised words,
#   * optionally a comma followed by any text (organization/function),
# and must not end with a period.
#
# Only ASCII letters are accepted in the name words, so names containing
# accented characters will not match.
# The pattern is compiled with re.VERBOSE, so the whitespace and "#" comments
# inside the literal are ignored by the regex engine.
name_pattern = re.compile(
    r"""
    ^                                   # start of line
    (?:                                 # either initials OR normal first name
        (?:[A-Z]\.)+                    # one or more initials with dots, e.g., C.S.
        |                               # OR
        [A-Z][a-zA-Z'-]+                # normal first name
    )
    (?:\s+[A-Z][a-zA-Z'-]+){1,3}        # at least one last name, optional 1-2 extra names
    (?:,\s*.+)?                          # optional comma + organization/function
    (?<!\.)$                             # line must NOT end with a period
    """,
    re.VERBOSE
)

# Function to split questions and answers
def split_q_a(text):
    """Split a transcript into a list of (speaker, text) turns.

    Every line is stripped first. Blank lines, lines made up only of digits
    (page numbers) and lines starting with "investor relations"
    (case-insensitive) are dropped. A line that matches ``name_pattern`` and
    has at most 10 whitespace-separated words opens a new turn; any other line
    is added to the turn that is currently open. Lines that appear before the
    first speaker line are discarded.

    Returns:
        List of (speaker_line, text) tuples in transcript order, where text is
        the turn's lines joined with single spaces. A speaker line that is not
        followed by any text produces no entry.
    """
    turns = []
    speaker = None
    spoken = []

    def flush():
        # Store the open turn, unless there is no speaker or no text yet
        if speaker and spoken:
            turns.append((speaker, " ".join(spoken)))

    # Text for Barclays is formatted per line
    for raw_line in text.splitlines():
        line = raw_line.strip()

        is_noise = (
            not line
            or re.match(r'^\d+$', line)  # page numbers
            or line.lower().startswith("investor relations")
        )
        if is_noise:
            continue

        # A name-like line of more than 10 words is treated as ordinary text
        is_speaker_line = name_pattern.match(line) and len(line.split()) <= 10
        if is_speaker_line:
            flush()
            speaker = line
            spoken = []
        elif speaker:
            spoken.append(line)

    # Close the turn that is still open at the end of the text
    flush()
    return turns

speaker_text_list = split_q_a(all_text)

# Preview the first ten turns, showing at most 100 characters of each text
for speaker_name, spoken_text in speaker_text_list[:10]:
    print(f"Speaker: {speaker_name}\nText: {spoken_text[:100]}...\n")

Speaker: Alvaro Serrano, Morgan Stanley
Text: Hi, good morning. A couple of questions, please. First of all, on capital and then one on the Invest...

Speaker: Anna Cross
Text: Good morning, Alvaro, thank you very much for kicking off the call for us. I'll take the first quest...

Speaker: C.S. Venkatakrishnan
Text: Yes. Thanks for the question, Alvaro. On the Investment Bank, let me give you a longer term view and...

Speaker: Anna Cross
Text: Thank you, Alvaro. Perhaps we can go to the next question, please....

Speaker: Guy Stebbings, BNP Paribas Exane
Text: Hi, morning there. Two questions, if I may. The first one was on Barclays UK, I was wondering if you...

Speaker: Anna Cross
Text: Thanks, Guy. Why don't I take both of those and I'm sure Venkat may add on the second. Let me start ...

Speaker: C.S. Venkatakrishnan
Text: Look, I'll just add at a broader macroeconomic level, it has been remarkable really, how resilient t...

Speaker: Anna Cross
Text: Okay, thank you, next questio

In [5]:
# Function to determine if speaker is an analyst or executive
def determine_analyst_exec(speaker_text_list, questionmark_threshold=0.8, top_turns_percent=0.3):
    """Heuristically classify speakers as analysts or executives.

    For every speaker the number of turns and the average number of '?'
    characters per turn are computed. A turn threshold is then taken from the
    turn counts sorted in descending order: the count found at position
    ``int(number_of_speakers * top_turns_percent)`` (1-based). Speakers with
    strictly more turns than that threshold are executives; speakers at or
    below it whose average '?' count exceeds ``questionmark_threshold`` are
    analysts. Speakers meeting neither condition are left out of both lists.

    Args:
        speaker_text_list: iterable of (speaker, text) tuples.
        questionmark_threshold: minimum (exclusive) average number of '?' per
            turn for a speaker to count as an analyst.
        top_turns_percent: fraction of speakers used to locate the turn
            threshold.

    Returns:
        Tuple ``(analysts, executives)`` of speaker-name lists, each in order
        of first appearance. Both lists are empty when there are no speakers.
    """
    # Count ? per speaker per block
    speaker_questionmark_counts = {}
    for speaker, text in speaker_text_list:
        speaker_questionmark_counts.setdefault(speaker, []).append(text.count('?'))

    # Nothing to classify; without this guard max() below raises on an empty list
    if not speaker_questionmark_counts:
        return [], []

    # Compute average question marks and number of turns
    speaker_stats = {
        speaker: {"avg_qm": sum(counts) / len(counts), "turns": len(counts)}
        for speaker, counts in speaker_questionmark_counts.items()
    }

    # Determine turn threshold for top N%
    all_turns = sorted((stats["turns"] for stats in speaker_stats.values()), reverse=True)
    top_n_index = int(len(all_turns) * top_turns_percent)
    if top_n_index == 0:
        top_turns_threshold = max(all_turns) + 1  # If few speakers, no one excluded
    else:
        top_turns_threshold = all_turns[top_n_index - 1]

    analysts = []
    executives = []

    # Classify speakers
    for speaker, stats in speaker_stats.items():
        avg_qm = stats["avg_qm"]
        turns = stats["turns"]
        if avg_qm > questionmark_threshold and turns <= top_turns_threshold:
            analysts.append(speaker)
        elif turns > top_turns_threshold:
            executives.append(speaker)

    return analysts, executives

analysts, executives = determine_analyst_exec(speaker_text_list)

# Show both groups, one per line
for role_label, role_members in (("Analysts:", analysts), ("Executives:", executives)):
    print(role_label, role_members)

Analysts: ['Alvaro Serrano, Morgan Stanley', 'Guy Stebbings, BNP Paribas Exane', 'Jason Napier, UBS', 'Rob Noble, Deutsche Bank', 'Chris Cant, Autonomous', 'Jonathan Pierce, Jefferies', 'Jonathan Pierce', 'Chris Hallam, Goldman Sachs', 'Andrew Coombs, Citigroup', 'Amit Goel, Mediobanca', 'Perlie Mong, Bank of America']
Executives: ['Anna Cross', 'C.S. Venkatakrishnan']


In [6]:
# Function to couple questions and answers
def pair_questions_answers(speaker_text_list, analysts, executives):
  """Group analyst questions with the executive answers that follow them.

  Turns are read in order. A pair stays open until a different analyst starts
  speaking. While it is open, further turns of the same analyst are appended
  to the question text (so a follow-up does not replace the original
  question) and every executive turn is appended to the answer text. Turns of
  speakers that are in neither collection are ignored.

  Args:
    speaker_text_list: iterable of (speaker, text) tuples in transcript order.
    analysts: collection of speaker names treated as questioners.
    executives: collection of speaker names treated as answerers.

  Returns:
    List of (questioner, questions, answerers, answers) tuples, where
    answerers lists each answering executive once in order of first answer
    and answers is the space-joined answer text, or None when no executive
    answered. Executive turns that come before the first analyst turn are
    attached to the first pair.
  """
  qa_pairs = []
  questioner = None
  questions = None
  answerers = []
  answers = None

  # For all pairs
  for speaker, text in speaker_text_list:
    if speaker in analysts:
      if questioner is None:
        # First question of the transcript
        questioner = speaker
        questions = text
      elif speaker == questioner:
        # Follow-up by the same analyst: keep the earlier question text
        questions += " " + text
      else:
        # A new analyst takes over: close the previous pair and start a new one
        qa_pairs.append((questioner, questions, answerers, answers))
        questioner = speaker
        questions = text
        answerers = []
        answers = None

    elif speaker in executives:
      if speaker not in answerers:
        answerers.append(speaker)
      if answers is None:
        answers = text
      else:
        answers += " " + text

  # Add the last Q&A pair if it exists
  if questioner is not None:
    qa_pairs.append((questioner, questions, answerers, answers))

  return qa_pairs

qa_pairs = pair_questions_answers(speaker_text_list, analysts, executives)

# Preview the first five pairs
for questioner, question_text, answerers, answer_text in qa_pairs[:5]:
  print(f"Questioner: {questioner}\nQuestions: {question_text}\nAnswerers: {answerers}\nAnswers: {answer_text}\n")

Questioner: Alvaro Serrano, Morgan Stanley
Questions: Hi, good morning. A couple of questions, please. First of all, on capital and then one on the Investment Bank. On capital, you are at 13.7% after the buyback. That’s a pretty comfortable position given [your 13-14% range]. [Fundamental Review of the Trading Book (FRTB)] has been delayed, when you think about your capital position versus the over £10 billion distribution, and obviously M&A options that have appeared in the press, how do you see upside to distribution versus additional firepower for M&A? Whether that's a portfolio in the US or something else, if you can maybe walk us through your thinking. And second, on the Investment Bank, obviously Trading, you have explained Anna, very strong. Clearly doing better than US peers and that's worked very well, but Investment Banking fees not so well. So in a world where we seem to be heading, famous last words, to a low volatility environment, I know you've touched on it at the end of

In [7]:
# Report how many Q&A pairs were built
pair_count = len(qa_pairs)
print(f"Number of Q&A pairs: {pair_count}")

Number of Q&A pairs: 11


In [8]:
!pip install openai



In [9]:
from openai import OpenAI

# Read the personal key from the OPENAI_API_KEY environment variable so that
# the secret is never stored in the notebook itself
client = OpenAI(
  api_key=os.environ.get("OPENAI_API_KEY")
)

In [33]:
# Function to extract questions from text possibly containing multiple questions
def extract_questions_answers(questions, answers):
  """Ask the "gpt-5-mini" chat model to split and match questions and answers.

  The question and answer transcripts are inserted into one prompt that asks
  for every distinct question, the answer given to it, and a 0-1 score for
  how directly it was answered.

  Args:
    questions: question transcript text; inserted into the prompt as-is.
    answers: answer transcript text; inserted into the prompt as-is (a value
      of None ends up in the prompt as the text "None").

  Returns:
    The message content of the first choice returned by the model. The prompt
    requests a Python list literal of [question, answer, score] lists, but
    the reply is returned unparsed.
  """
  # Prompt to instruct the model
  user_prompt = f"""
  Extract each distinct question from the following question transcript.
  Keep multi-sentence questions grouped together so that the supporting context remains intact.

  Then extract the answers given to the extracted questions from the following answer transcript.
  Keep multi-sentence answers grouped together so that the supporting context remains intact.
  If a question is not answered, state 'NOT ANSWERED'.

  For each [Question, Answer] pair, also assign a numerical score between 0 and 1 with 1 decimal for how directly the answer addresses the question where a score of 1 means the question was fully answered and a score of 0 means the question was fully avoided.

  Return your response in **strict Python list format**:

  [
    ['question 1', 'answer 1', answer_score],
    ['question 2', 'answer 2', answer_score],
    ...
  ]

  Question transcript:
  {questions}

  Answer transcript:
  {answers}
  """

  system_message = {
      "role": "system",
      "content": "You are an assistant that extracts and matches questions and answers from Q&A transcripts, and checks to which extent the question was answered or avoided.",
  }
  user_message = {"role": "user", "content": user_prompt}

  # Call Open AI model
  completion = client.chat.completions.create(
      model="gpt-5-mini",
      messages=[system_message, user_message],
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

In [34]:
# Function to check question orientation and theme
def check_question_properties(question):
  """Ask the "gpt-5-nano" chat model for a question's orientation and theme.

  Args:
    question: question text; inserted into the prompt as-is.

  Returns:
    The message content of the first choice returned by the model. The prompt
    requests a Python list literal ['orientation', 'theme'], but the reply is
    returned unparsed.
  """
  user_prompt = f"""
  Classify the following question along two dimensions:

  1. Orientation:
      - 'Past' → asking about past or current performance
      - 'Future' → asking about guidance, expectations, or outlook
      - 'Mixed' → contains both past and future elements

  2. Theme:
      Choose one from the following:
      - 'Profitability'
      - 'Capital & Liquidity'
      - 'Macro & Geopolitical influences'
      - 'Regulatory & Legal'
      - 'Risk management'
      - 'Technology & Innovation'
      - 'Sustainability'
      - 'Strategy & Management'
      - 'Other' (if none apply)

  Return your response in **strict Python list format**:
  ['orientation', 'theme']

  Question:
  {question}
  """

  system_message = {
      "role": "system",
      "content": "You are an assistant that classifies financial Q&A questions.",
  }
  user_message = {"role": "user", "content": user_prompt}

  # Call Open AI model
  completion = client.chat.completions.create(
      model="gpt-5-nano",
      messages=[system_message, user_message],
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

In [14]:
import json
import ast

In [35]:
# Extract all questions and answers
questions = []
answers = []
answer_scores = []

# For all extracted pairs: pair[1] is the question text, pair[3] the answer text
for i, pair in enumerate(qa_pairs):
  LLM_output_str = extract_questions_answers(pair[1], pair[3])

  try:
    # Safely evaluate the string to a Python list
    LLM_output_list = ast.literal_eval(LLM_output_str)
  except (ValueError, SyntaxError) as e:
    print(f"Could not parse LLM output string: {LLM_output_str} - Error: {e}")
    continue

  # Check if the evaluated output is a list and process it
  if not isinstance(LLM_output_list, list):
    print(f"LLM output is not a list: {LLM_output_str}")
    continue

  # Count only the items that are actually stored, so that the progress
  # message does not include skipped items
  extracted_count = 0
  for item in LLM_output_list:
    # Check if each item is a list with at least three elements
    if isinstance(item, list) and len(item) >= 3:
      questions.append(item[0])
      answers.append(item[1])
      answer_scores.append(item[2])
      extracted_count += 1
    else:
      print(f"Skipping invalid item in LLM output: {item}")
  print(f"Processed block: {i+1} of {len(qa_pairs)}, extracted {extracted_count} questions and answers.")


# Store questions and answers and scores in file and combine into list of dicts
qna_data = [{"question": q, "answer": a, "answer_score": s} for q, a, s in zip(questions, answers, answer_scores)]

# Save
with open("qna_data.json", "w", encoding="utf-8") as f:
    json.dump(qna_data, f, ensure_ascii=False, indent=2)

Processed block: 1 of 11, extracted 2 questions and answers.
Processed block: 2 of 11, extracted 2 questions and answers.
Processed block: 3 of 11, extracted 2 questions and answers.
Processed block: 4 of 11, extracted 4 questions and answers.
Processed block: 5 of 11, extracted 3 questions and answers.
Processed block: 6 of 11, extracted 2 questions and answers.
Processed block: 7 of 11, extracted 1 questions and answers.
Processed block: 8 of 11, extracted 2 questions and answers.
Processed block: 9 of 11, extracted 2 questions and answers.
Processed block: 10 of 11, extracted 2 questions and answers.
Processed block: 11 of 11, extracted 2 questions and answers.


In [36]:
# Read the stored records back from disk
with open("qna_data.json", "r", encoding="utf-8") as qna_file:
    loaded_qna = json.load(qna_file)

# Show the three fields of the first record, one per line
first_record = loaded_qna[0]
for field_name in ("question", "answer", "answer_score"):
    print(first_record[field_name])

Hi, good morning. A couple of questions, please. First of all, on capital and then one on the Investment Bank. On capital, you are at 13.7% after the buyback. That’s a pretty comfortable position given [your 13-14% range]. [Fundamental Review of the Trading Book (FRTB)] has been delayed, when you think about your capital position versus the over £10 billion distribution, and obviously M&A options that have appeared in the press, how do you see upside to distribution versus additional firepower for M&A? Whether that's a portfolio in the US or something else, if you can maybe walk us through your thinking.
Good morning, Alvaro, thank you very much for kicking off the call for us. I'll take the first question and then I'm going to hand to Venkat. Our capital position just reflects the execution of the strategy. The strategy is designed to create higher and more consistent returns, which in turn allows us to return more to shareholders and indeed invest in the business, and that's really w

In [37]:
# Classify every stored question and add the result to its record in place
for record in loaded_qna:
  raw_properties = check_question_properties(record["question"])

  # The reply must parse as a Python literal
  try:
    parsed_properties = ast.literal_eval(raw_properties)
  except (ValueError, SyntaxError) as e:
    print(f"Could not parse LLM output string: {raw_properties} - Error: {e}")
    continue

  # Only a two-element list [orientation, theme] is accepted
  if not (isinstance(parsed_properties, list) and len(parsed_properties) == 2):
    print(f"Skipping invalid item in LLM output: {parsed_properties}")
    continue

  record["orientation"], record["theme"] = parsed_properties
  print(f"Processed question: {record['question']}")


# Write the records, including the new fields, back to the same file
with open("qna_data.json", "w", encoding="utf-8") as qna_file:
    json.dump(loaded_qna, qna_file, ensure_ascii=False, indent=2)

Processed question: Hi, good morning. A couple of questions, please. First of all, on capital and then one on the Investment Bank. On capital, you are at 13.7% after the buyback. That’s a pretty comfortable position given [your 13-14% range]. [Fundamental Review of the Trading Book (FRTB)] has been delayed, when you think about your capital position versus the over £10 billion distribution, and obviously M&A options that have appeared in the press, how do you see upside to distribution versus additional firepower for M&A? Whether that's a portfolio in the US or something else, if you can maybe walk us through your thinking.
Processed question: And second, on the Investment Bank, obviously Trading, you have explained Anna, very strong. Clearly doing better than US peers and that's worked very well, but Investment Banking fees not so well. So in a world where we seem to be heading, famous last words, to a low volatility environment, I know you've touched on it at the end of the quarter, 

In [38]:
# Read the stored records back from disk
with open("qna_data.json", "r", encoding="utf-8") as qna_file:
    loaded_qna = json.loads(qna_file.read())

# One row per record, one column per key
df = pd.DataFrame(loaded_qna)
print(df)

                                             question  \
0   Hi, good morning. A couple of questions, pleas...   
1   And second, on the Investment Bank, obviously ...   
2   The first one was on Barclays UK, I was wonder...   
3   The second question was on the US Consumer bus...   
4   Good morning, Venkat and Anna, thanks for taki...   
5   Secondly, perhaps just as a follow on, consens...   
6   Can I just ask on the promotional cards, aren’...   
7   If you could just elaborate on the size of the...   
8   There's been sort of one quarter’s growth in t...   
9   Lastly, how big is the Kensington book now and...   
10  So firstly, thinking about what's happening in...   
11  With regards to [Comprehensive Capital Analysi...   
12  On the UK side of the fence, if I could just i...   
13  Hello both, two questions please. The first is...   
14  The second question, at the moment, I understa...   
15  Understood. I don't suppose any chance you cou...   
16  Good morning everybody. Two

In [39]:
# Export the records as a semicolon-separated CSV file without the index column
qna_frame = pd.DataFrame(loaded_qna)
qna_frame.to_csv("qna_data.csv", index=False, sep=";")