**In this notebook we use a local LLM to extract the remaining sub-score labels from the prospectus text**

In [1]:
from langchain_ollama import OllamaLLM
from tqdm import tqdm

import json
import numpy as np
import os
import pandas as pd
import re

**Code to extract unique score combinations**

In [None]:
!pip install --upgrade numpy

In [None]:
# Load the pickled frame that the next cell reduces to unique
# (CategoryGroup, Category, TaggedCharacteristics) combinations.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling — only load
# pickle files that come from a trusted source.
rms_with_fundamental_score = pd.read_pickle('./rms_with_fundamental_score.pkl')
# Show the first row as a quick look at the loaded columns.
rms_with_fundamental_score.head(1)

In [None]:
# Keep one row per distinct (CategoryGroup, Category, TaggedCharacteristics) triple, then
# collapse every run of carriage returns / line feeds in TaggedCharacteristics to one space.
unique_score_combinations = (
    rms_with_fundamental_score[['CategoryGroup', 'Category', 'TaggedCharacteristics']]
    .drop_duplicates()
    .assign(
        TaggedCharacteristics=lambda frame: frame['TaggedCharacteristics'].str.replace(
            r'[\r\n]+', ' ', regex=True
        )
    )
)

# Function to expand TaggedCharacteristics if it's a JSON string with multiple items
def expand_tagged_characteristics(row):
    """Expand one row's ``TaggedCharacteristics`` into one output row per characteristic.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pd.Series``)
        Must provide ``'CategoryGroup'``, ``'Category'`` and ``'TaggedCharacteristics'``
        via ``row[...]`` and support ``row.get``; ``'CharacteristicInfluence'`` is optional.

    Returns
    -------
    pd.DataFrame
        Columns ``CategoryGroup``, ``Category``, ``TaggedCharacteristics`` and
        ``CharacteristicInfluence``.

        * If ``TaggedCharacteristics`` is a JSON list of objects, one row per object, using
          its ``CharacteristicText`` (newlines replaced by spaces) and its
          ``CharacteristicInfluence`` (``None`` when absent).
        * If it is any other JSON value, a single row holding the decoded value.
        * If it is not valid JSON, is not a string at all (``None``/NaN), or is a JSON list
          whose items do not have the expected shape, a single row holding the original
          value with newlines replaced by spaces when it is a string.
    """
    def _flatten_newlines(value):
        # Only strings can carry newlines; None/NaN/numbers/dicts pass through unchanged.
        # This is what keeps the fallback path from crashing on non-string cells.
        if isinstance(value, str):
            return value.replace('\r', ' ').replace('\n', ' ')
        return value

    def _single_row(text):
        # One-row frame that inherits the influence (if any) from the input row itself.
        return pd.DataFrame([{
            'CategoryGroup': row['CategoryGroup'],
            'Category': row['Category'],
            'TaggedCharacteristics': _flatten_newlines(text),
            'CharacteristicInfluence': row.get('CharacteristicInfluence', None)
        }])

    raw_value = row['TaggedCharacteristics']

    try:
        characteristics = json.loads(raw_value)
    except (json.JSONDecodeError, TypeError):
        # Plain text (not JSON) or a non-string cell such as None/NaN.
        return _single_row(raw_value)

    if not isinstance(characteristics, list):
        return _single_row(characteristics)

    try:
        return pd.DataFrame([{
            'CategoryGroup': row['CategoryGroup'],
            'Category': row['Category'],
            'TaggedCharacteristics': _flatten_newlines(char['CharacteristicText']),
            'CharacteristicInfluence': char.get('CharacteristicInfluence', None)  # Handle missing keys
        } for char in characteristics])
    except (TypeError, KeyError):
        # List items are not objects with a 'CharacteristicText' entry: keep the raw text.
        return _single_row(raw_value)

# Expand every row into its own small frame, then stack all of them into one table
per_row_frames = unique_score_combinations.apply(expand_tagged_characteristics, axis=1).to_list()
expanded_unique_score_combinations = pd.concat(per_row_frames, ignore_index=True)

# Order by group / category / influence, remove exact duplicate rows, and write the CSV
sort_columns = ['CategoryGroup', 'Category', 'CharacteristicInfluence']
unique_score_combinations = (
    expanded_unique_score_combinations
    .sort_values(by=sort_columns)
    .drop_duplicates()
)
unique_score_combinations.to_csv('unique_score_combinations.csv', index=False)

**Defining the questions and instantiating the LLM**

In [2]:
# Define the questions corresponding to each column.
# Each dictionary maps an answer-column name to the yes/no style question sent to the LLM.
questions_market_dynamics = {
    "Market Dynamics - a": "Does the text mention that the company is exposed to risks associated with cyclical products?",
    "Market Dynamics - b": "Does the text mention risks related to demographic or structural trends affecting the market?",
    "Market Dynamics - c": "Does the text mention risks due to seasonal volatility in the industry?"
}
questions_intra_industry_competition = {
    "Intra-Industry Competition - a": "Does the text mention that market pricing for the company's products or services is irrational or not based on fundamental factors?",
    "Intra-Industry Competition - b": "Does the text mention that the market is highly fragmented with no clear leader or that there is only one dominant leader?",
    "Intra-Industry Competition - c": "Does the text mention low barriers to entry in the industry, making it easy for new competitors to enter the market?"
}
questions_regulatory_framework = {
    "Regulatory Framework - a": "Does the text mention that the industry is subject to a high degree of regulatory scrutiny?",
    "Regulatory Framework - b": "Does the text mention a high dependency on regulation or being a beneficiary from regulation in an unstable regulatory environment?"
}
questions_technology_risk = {
    "Technology Risk - a": "Does the text mention that the industry is susceptible to rapid technological advances or innovations?",
    # The original sub-score is about the company being a *disruptee* (the one being
    # disrupted); the earlier wording asked about a "disruptor", which is the opposite.
    # NOTE(review): answers already cached for this column were produced with the old
    # wording and may need to be regenerated.
    "Technology Risk - b": "Does the text mention that the company is perceived as a disruptee (i.e. at risk of being disrupted) or is threatened by emerging technological changes?"
}

# Order in which the question groups are processed
all_question_dicts = [
    questions_market_dynamics,
    questions_intra_industry_competition,
    questions_regulatory_framework,
    questions_technology_risk
]

# Original sub-score labels, keyed by the same column names as the question dictionaries above
questions_market_dynamics_original = {
    "Market Dynamics - a": "Exposure to cyclical products",
    "Market Dynamics - b": "Impact of demographic and structural trends",
    "Market Dynamics - c": "Seasonal industry volatility"
}
questions_intra_industry_competition_original = {
    "Intra-Industry Competition - a": "Market pricing has not shown to be rational",
    "Intra-Industry Competition - b": "Highly fragmented market with no clear leader or only one leader",
    "Intra-Industry Competition - c": "Low barriers to entry"
}
questions_regulatory_framework_original = {
    "Regulatory Framework - a": "Industry has high degree of regulatory scrutiny",
    "Regulatory Framework - b": "High dependency on regulation or is a beneficiary from regulation in an unstable regulatory environment"
}
questions_technology_risk_original = {
    "Technology Risk - a": "Industry susceptibility to technological advances",
    "Technology Risk - b": "Company viewed as a disruptee/threatened by technological change"
}


In [3]:
# Initialize the language model
llm = OllamaLLM(model="llama3.2")

# Locations of the partially/fully answered data and of the raw input
processed_file_path = '../data/prospectuses_data_processed.csv'
raw_file_path = '../data/prospectuses_data.csv'

# Resume from the processed file when it exists; otherwise start from the raw data
if os.path.exists(processed_file_path):
    df = pd.read_csv(processed_file_path)
else:
    print("Processed file not found. Processing raw data...")
    df = pd.read_csv(raw_file_path)
    # Filter out rows that have "failed parsing" in the Section ID column.
    # .copy() makes the result an independent frame, so the column assignments below
    # do not operate on a slice of the frame that was read from disk.
    df = df[df['Section ID'] != "failed parsing"].copy()

# Make sure every answer column exists and uses the same nullable string dtype,
# whether it was loaded from the processed file or is created here for the first time
for question_dict in all_question_dicts:
    for column_name in question_dict:
        if column_name not in df.columns:
            df[column_name] = ""
        df[column_name] = df[column_name].astype('string')

df.head(2)

Unnamed: 0,Prospectus ID,Original Filename,Section ID,Section Title,Subsection ID,Subsection Title,Subsubsection ID,Subsubsection Title,Subsubsection Text,Market Dynamics - a,Market Dynamics - b,Market Dynamics - c,Parsing Error,Intra-Industry Competition - a,Intra-Industry Competition - b,Intra-Industry Competition - c,Regulatory Framework - a,Regulatory Framework - b,Technology Risk - a,Technology Risk - b
0,235,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,,1.1.1,,_An investment in the Notes involves a high de...,Not Relevant,Highly Relevant: the risks described below,Not Relevant,,Not Relevant,Not Relevant,Not Relevant,Highly Relevant: Subsubsection Title: ... and ...,Highly Relevant,Not Relevant,Not Relevant
1,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,"Risks Relating to the Group’s Business, Techno...",1.1.1,The Group faces significant competition in eac...,The French telecommunications market is a matu...,Highly Relevant: Various evidence throughout t...,Highly Relevant,Highly Relevant,,Highly Relevant,Highly Relevant: ...the Group also competes wi...,Highly Relevant: The exact phrases or sentence...,Highly Relevant: Several evidence are presente...,Highly Relevant,Highly Relevant: This is a highly relevant ans...,Highly Relevant: The Group also faces competit...


In [4]:
def _evidence_items(value):
    """Turn a decoded "Evidence" JSON value into a list of non-empty strings."""
    if isinstance(value, str):
        return [value] if value else []
    if isinstance(value, (list, tuple)):
        return [item for item in value if isinstance(item, str) and item]
    return []


def extract_fields(response):
    """Pull the "Relevance" label and the "Evidence" strings out of a model response.

    The response is first whitespace-normalised (newlines and repeated blanks become a
    single space). The text between the first ``{`` and the last ``}`` is then decoded as
    JSON; this copes with code fences or stray text around the object and with escaped
    quotes inside the evidence. If that fails (for example because the model echoed the
    non-JSON answer template), the fields are extracted with regular expressions instead.

    Parameters
    ----------
    response : str
        Raw text returned by the model. Any non-string value yields a parsing error.

    Returns
    -------
    tuple[str, list[str]]
        ``(relevance, evidence)`` where ``relevance`` is the stripped label, or
        ``"Parsing Error"`` when no label could be found, and ``evidence`` is a possibly
        empty list of evidence strings.
    """
    if not isinstance(response, str):
        return "Parsing Error", []

    # Remove any newlines and extra spaces
    response = ' '.join(response.strip().split())

    # First attempt: decode the outermost {...} span as real JSON
    start, end = response.find('{'), response.rfind('}')
    if 0 <= start < end:
        try:
            parsed = json.loads(response[start:end + 1])
        except ValueError:
            parsed = None
        if isinstance(parsed, dict):
            label = parsed.get("Relevance")
            if isinstance(label, str) and label.strip():
                return label.strip(), _evidence_items(parsed.get("Evidence"))

    # Fallback: tolerant regex extraction for responses that are not valid JSON
    relevance_match = re.search(r'"Relevance"\s*:\s*"([^"]+)"', response)
    if relevance_match:
        relevance = relevance_match.group(1).strip()
    else:
        relevance = "Parsing Error"

    # The evidence runs until the next `"key":` or until the last closing brace;
    # `[^}]*$` allows trailing non-brace text (e.g. a closing code fence) after that brace.
    evidence_match = re.search(
        r'"Evidence"\s*:\s*(.+?)(?:,?\s*"[^"]+"\s*:|\s*}[^}]*$)', response
    )
    if evidence_match:
        # Remove any trailing commas or braces
        evidence_str = evidence_match.group(1).strip().rstrip(', }')
        # Evidence items are strings enclosed in double quotes
        evidence = re.findall(r'"([^"]+)"', evidence_str)
    else:
        evidence = []

    return relevance, evidence


def analyze_prospectus_row_single_question(row, question):
    """Ask the LLM whether one prospectus subsubsection is relevant to one question.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pd.Series``)
        Must provide ``'Subsubsection Title'`` and ``'Subsubsection Text'``.
        Missing values (None/NaN/pd.NA) are sent to the model as empty text.
    question : str
        The question the text is judged against.

    Returns
    -------
    str
        ``"Highly Relevant"`` or ``"Somewhat Relevant"``, followed by ``": <evidence>"``
        when evidence was extracted; ``"Not Relevant"``; or ``"Parsing Error"`` when no
        recognised relevance label could be extracted from the response.

    Notes
    -----
    Uses the module-level ``llm`` and ``extract_fields``. Errors raised by the model call
    itself are not caught here and propagate to the caller.
    """
    def _as_text(value):
        # Missing cells would otherwise be formatted into the prompt as the literal "nan".
        try:
            if pd.isna(value):
                return ""
        except (TypeError, ValueError):
            pass  # non-scalar value: fall through to its string form
        return str(value)

    # System and user prompts
    system_prompt = "You are an expert in analyzing bond prospectuses and identifying specific risk factors."
    subsubsection_title = _as_text(row['Subsubsection Title'])
    subsubsection_text = _as_text(row['Subsubsection Text'])

    # Format the user prompt using the row's data
    prompt = f"""
{system_prompt}

For the following question and text, judge whether the text is "Highly Relevant", "Somewhat Relevant", or "Not Relevant".

Question:
{question}

Text:
Subsubsection Title: {subsubsection_title}
Subsubsection Text: {subsubsection_text}


Please provide your answer in the following JSON format:

{{
  "Relevance": "Highly Relevant", "Somewhat Relevant", or "Not Relevant",
  "Evidence": "The exact phrases or sentences from the document that support your assessment; otherwise, leave blank."
}}

Note: Only provide the JSON response without any additional text.
"""
    # Run the prompt through the model
    response = llm.invoke(input=prompt)

    # Parse the response; any failure while parsing is reported as a parsing error
    try:
        relevance, evidence_list = extract_fields(response)
        # Join multiple evidence items into a single string
        evidence = '; '.join(evidence_list)
    except Exception:
        relevance = "Parsing Error"
        evidence = ""

    # Accept harmless variations of the three labels (case, surrounding blanks, final period)
    canonical_labels = {
        "highly relevant": "Highly Relevant",
        "somewhat relevant": "Somewhat Relevant",
        "not relevant": "Not Relevant",
    }
    relevance = canonical_labels.get(str(relevance).strip().rstrip('.').strip().lower(), relevance)

    # Combine relevance and evidence
    if relevance in ("Highly Relevant", "Somewhat Relevant"):
        combined_answer = f"{relevance}: {evidence}" if evidence else relevance
    elif relevance == "Not Relevant":
        combined_answer = "Not Relevant"
    else:
        combined_answer = "Parsing Error"

    # For debugging
    if combined_answer == "Parsing Error":
        print("Parsing Error encountered. Response was:")
        print(response)

    return combined_answer

**Run the LLM Processing**

In [5]:
import time  # NOTE(review): not used in this cell now that the pause is disabled; kept in case other cells rely on it

# A checkpoint is written after this many newly processed rows
CHECKPOINT_EVERY_NEW_ROWS = 10
# Pending changes are also flushed whenever (row index + 1) is a multiple of this value
CHECKPOINT_EVERY_ROWS = 50

# Rows that received at least one new answer since the last progress message
new_rows_processed = 0
# True while df holds answers that have not been written to disk yet
unsaved_changes = False

try:
    # Iterate over each row in the DataFrame with a progress bar
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Rows"):
        row_processed = False  # Flag to check if we processed any new data in this row

        for question_dict in all_question_dicts:
            for column_name, question in question_dict.items():
                # Skip questions that already have an answer for this row
                current_answer = df.at[index, column_name]
                if pd.notnull(current_answer) and current_answer != "":
                    continue
                df.at[index, column_name] = analyze_prospectus_row_single_question(row, question)
                row_processed = True  # We processed new data in this row

        if row_processed:
            new_rows_processed += 1
            unsaved_changes = True

        # Periodic checkpoint; skipped when nothing changed, so resuming over rows that
        # are already answered does not rewrite the whole CSV again and again
        if unsaved_changes and (index + 1) % CHECKPOINT_EVERY_ROWS == 0:
            df.to_csv(processed_file_path, index=False)
            unsaved_changes = False

        # Checkpoint after a batch of newly processed rows
        if new_rows_processed >= CHECKPOINT_EVERY_NEW_ROWS:
            if unsaved_changes:
                df.to_csv(processed_file_path, index=False)
                unsaved_changes = False
            print(f"Processed {CHECKPOINT_EVERY_NEW_ROWS} new rows. Progress saved.")
            new_rows_processed = 0  # Reset counter
finally:
    # Always write the current state: this is the final save after a complete run and
    # also preserves the answers gathered so far if the loop is interrupted or fails
    df.to_csv(processed_file_path, index=False)

print("All rows have been processed and saved.")

Processing Rows:   0%|          | 0/7952 [00:00<?, ?it/s]

Processing Rows:  92%|█████████▏| 7329/7952 [03:30<1:06:10,  6.37s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  92%|█████████▏| 7339/7952 [06:38<3:06:12, 18.23s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  92%|█████████▏| 7349/7952 [10:01<3:45:28, 22.44s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  93%|█████████▎| 7359/7952 [11:55<2:06:28, 12.80s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  93%|█████████▎| 7369/7952 [14:38<2:58:52, 18.41s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  93%|█████████▎| 7379/7952 [18:30<3:08:27, 19.73s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  93%|█████████▎| 7389/7952 [21:13<2:50:37, 18.18s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  93%|█████████▎| 7399/7952 [23:57<2:25:32, 15.79s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  93%|█████████▎| 7409/7952 [26:27<2:39:26, 17.62s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  93%|█████████▎| 7419/7952 [29:50<3:10:53, 21.49s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  93%|█████████▎| 7429/7952 [33:25<3:55:50, 27.06s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  94%|█████████▎| 7439/7952 [35:33<2:02:23, 14.31s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  94%|█████████▎| 7449/7952 [37:58<2:14:05, 16.00s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  94%|█████████▍| 7459/7952 [40:16<1:44:55, 12.77s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  94%|█████████▍| 7469/7952 [42:49<2:13:54, 16.63s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  94%|█████████▍| 7479/7952 [45:09<1:50:58, 14.08s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  94%|█████████▍| 7489/7952 [47:58<1:47:52, 13.98s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  94%|█████████▍| 7499/7952 [51:07<3:02:12, 24.13s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  94%|█████████▍| 7509/7952 [54:01<2:14:58, 18.28s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  95%|█████████▍| 7519/7952 [56:32<2:14:34, 18.65s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  95%|█████████▍| 7529/7952 [59:11<1:45:54, 15.02s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  95%|█████████▍| 7539/7952 [1:01:29<1:32:24, 13.42s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  95%|█████████▍| 7549/7952 [1:03:30<1:30:55, 13.54s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  95%|█████████▌| 7559/7952 [1:05:55<1:29:06, 13.61s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  95%|█████████▌| 7569/7952 [1:08:42<1:30:23, 14.16s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  95%|█████████▌| 7579/7952 [1:11:08<1:25:01, 13.68s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  95%|█████████▌| 7589/7952 [1:14:11<2:14:02, 22.16s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  96%|█████████▌| 7599/7952 [1:19:01<2:26:33, 24.91s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  96%|█████████▌| 7609/7952 [1:22:36<2:16:50, 23.94s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  96%|█████████▌| 7619/7952 [1:26:29<2:24:32, 26.04s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  96%|█████████▌| 7629/7952 [1:30:21<1:58:34, 22.03s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  96%|█████████▌| 7639/7952 [1:33:44<1:28:30, 16.97s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  96%|█████████▌| 7649/7952 [1:38:37<2:26:51, 29.08s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  96%|█████████▋| 7659/7952 [1:41:30<1:41:55, 20.87s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  96%|█████████▋| 7669/7952 [1:45:49<1:28:53, 18.85s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  97%|█████████▋| 7679/7952 [1:49:07<1:24:18, 18.53s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  97%|█████████▋| 7689/7952 [1:52:56<1:11:11, 16.24s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  97%|█████████▋| 7699/7952 [1:56:45<2:05:14, 29.70s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  97%|█████████▋| 7709/7952 [2:01:19<1:46:53, 26.39s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  97%|█████████▋| 7719/7952 [2:06:02<2:01:47, 31.36s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  97%|█████████▋| 7729/7952 [2:10:04<1:45:50, 28.48s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  97%|█████████▋| 7739/7952 [2:13:32<1:31:48, 25.86s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  97%|█████████▋| 7749/7952 [2:16:30<56:11, 16.61s/it]  

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  98%|█████████▊| 7759/7952 [2:19:32<42:47, 13.30s/it]  

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  98%|█████████▊| 7769/7952 [2:23:20<1:13:06, 23.97s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  98%|█████████▊| 7779/7952 [2:26:42<53:26, 18.53s/it]  

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  98%|█████████▊| 7789/7952 [2:29:38<45:47, 16.86s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  98%|█████████▊| 7799/7952 [2:32:28<49:41, 19.49s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  98%|█████████▊| 7809/7952 [2:34:58<38:20, 16.09s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  98%|█████████▊| 7819/7952 [2:37:53<42:21, 19.11s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  98%|█████████▊| 7829/7952 [2:41:17<56:23, 27.51s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  99%|█████████▊| 7839/7952 [2:43:56<37:10, 19.74s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  99%|█████████▊| 7849/7952 [2:46:54<26:48, 15.62s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  99%|█████████▉| 7859/7952 [2:49:52<32:33, 21.01s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  99%|█████████▉| 7869/7952 [2:53:47<27:47, 20.09s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  99%|█████████▉| 7879/7952 [2:56:42<20:06, 16.53s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  99%|█████████▉| 7889/7952 [2:59:25<15:29, 14.76s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  99%|█████████▉| 7899/7952 [3:02:21<15:00, 17.00s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows:  99%|█████████▉| 7909/7952 [3:06:27<15:48, 22.05s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows: 100%|█████████▉| 7919/7952 [3:10:40<12:33, 22.84s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows: 100%|█████████▉| 7929/7952 [3:13:32<07:20, 19.15s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows: 100%|█████████▉| 7939/7952 [3:16:06<02:54, 13.43s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows: 100%|█████████▉| 7949/7952 [3:18:27<00:39, 13.28s/it]

Processed 10 new rows. Pausing for 30 seconds.


Processing Rows: 100%|██████████| 7952/7952 [3:19:09<00:00,  1.50s/it]


All rows have been processed and saved.
