**In this Notebook we extract the remaining SubScore Labels**

In [None]:
from langchain_ollama import OllamaLLM
from tqdm import tqdm

import json
import numpy as np
import os
import pandas as pd
import re

**Code to extract unique score combinations**

In [None]:
!pip install --upgrade numpy

In [None]:
rms_with_fundamental_score = pd.read_pickle('./rms_with_fundamental_score.pkl')
rms_with_fundamental_score.head(1)

In [None]:
unique_score_combinations = rms_with_fundamental_score[['CategoryGroup', 'Category', 'TaggedCharacteristics']].drop_duplicates()
# Replace all types of newlines and excessive whitespace in TaggedCharacteristics
unique_score_combinations['TaggedCharacteristics'] = unique_score_combinations['TaggedCharacteristics'].str.replace(r'[\r\n]+', ' ', regex=True)

# Function to expand TaggedCharacteristics if it's a JSON string with multiple items
def expand_tagged_characteristics(row):
    try:
        characteristics = json.loads(row['TaggedCharacteristics'])
        if isinstance(characteristics, list):
            # Replace newlines within each CharacteristicText
            return pd.DataFrame([{
                'CategoryGroup': row['CategoryGroup'],
                'Category': row['Category'],
                'TaggedCharacteristics': char['CharacteristicText'].replace('\r', ' ').replace('\n', ' '),
                'CharacteristicInfluence': char.get('CharacteristicInfluence', None)  # Handle missing keys
            } for char in characteristics])
        else:
            # If it's a single item or not a list, replace newlines if it's a string
            if isinstance(characteristics, str):
                characteristics = characteristics.replace('\r', ' ').replace('\n', ' ')
            return pd.DataFrame([{
                'CategoryGroup': row['CategoryGroup'],
                'Category': row['Category'],
                'TaggedCharacteristics': characteristics,
                'CharacteristicInfluence': row.get('CharacteristicInfluence', None)
            }])
    except (json.JSONDecodeError, TypeError):
        # If parsing fails, replace newlines in the original TaggedCharacteristics
        cleaned_text = row['TaggedCharacteristics'].replace('\r', ' ').replace('\n', ' ')
        return pd.DataFrame([{
            'CategoryGroup': row['CategoryGroup'],
            'Category': row['Category'],
            'TaggedCharacteristics': cleaned_text,
            'CharacteristicInfluence': row.get('CharacteristicInfluence', None)
        }])

# Applying the function to each row and combining results
expanded_unique_score_combinations = pd.concat(
    unique_score_combinations.apply(expand_tagged_characteristics, axis=1).to_list(),
    ignore_index=True
)

# Sort, drop duplicates, and save to CSV
unique_score_combinations = expanded_unique_score_combinations.sort_values(by=['CategoryGroup', 'Category', 'CharacteristicInfluence']).drop_duplicates()
unique_score_combinations.to_csv('unique_score_combinations.csv', index=False)

**Defining the questions and instantiating the LLM**

In [11]:
# Define the questions corresponding to each column
questions_market_dynamics = {
    "Market Dynamics - a": "Does the text mention that the company is exposed to risks associated with cyclical products?",
    "Market Dynamics - b": "Does the text mention risks related to demographic or structural trends affecting the market?",
    "Market Dynamics - c": "Does the text mention risks due to seasonal volatility in the industry?"
}
questions_intra_industry_competition = {
    "Intra-Industry Competition - a": "Does the text mention that market pricing for the company's products or services is irrational or not based on fundamental factors?",
    "Intra-Industry Competition - b": "Does the text mention that the market is highly fragmented with no clear leader or that there is only one dominant leader?",
    "Intra-Industry Competition - c": "Does the text mention low barriers to entry in the industry, making it easy for new competitors to enter the market?"
}
questions_regulatory_framework = {
    "Regulatory Framework - a": "Does the text mention that the industry is subject to a high degree of regulatory scrutiny?",
    "Regulatory Framework - b": "Does the text mention a high dependency on regulation or being a beneficiary from regulation in an unstable regulatory environment?"
}
questions_technology_risk = {
    "Technology Risk - a": "Does the text mention that the industry is susceptible to rapid technological advances or innovations?",
    "Technology Risk - b": "Does the text mention that the company is perceived as a disruptor or is threatened by emerging technological changes?"
}

all_question_dicts = [
    questions_market_dynamics,
    questions_intra_industry_competition,
    questions_regulatory_framework,
    questions_technology_risk
]

# Original questions
questions_market_dynamics_original = {
    "Market Dynamics - a": "Exposure to cyclical products",
    "Market Dynamics - b": "Impact of demographic and structural trends",
    "Market Dynamics - c": "Seasonal industry volatility"
}
questions_intra_industry_competition_original = {
    "Intra-Industry Competition - a": "Market pricing has not shown to be rational",
    "Intra-Industry Competition - b": "Highly fragmented market with no clear leader or only one leader",
    "Intra-Industry Competition - c": "Low barriers to entry"
}
questions_regulatory_framework_original = {
    "Regulatory Framework - a": "Industry has high degree of regulatory scrutiny",
    "Regulatory Framework - b": "High dependency on regulation or is a beneficiary from regulation in an unstable regulatory environment"
}
questions_technology_risk_original = {
    "Technology Risk - a": "Industry susceptibility to technological advances",
    "Technology Risk - b": "Company viewed as a disruptee/threatened by technological change"
}


In [12]:
# Initialize the language model
llm = OllamaLLM(model="llama3.2")

# Check if the processed file exists; if not, process the raw data
processed_file_path = '../data/prospectuses_data_processed.csv'
raw_file_path = '../data/prospectuses_data.csv'

# Check if processed file exists
if os.path.exists(processed_file_path):
    df = pd.read_csv(processed_file_path)
else:
    print("Processed file not found. Processing raw data...")
    df = pd.read_csv(raw_file_path)
    # Filter out rows that have "failed parsing" in the Section ID column
    df = df[df['Section ID'] != "failed parsing"]

# Ensure the relevance and evidence columns are created with a compatible data type
for question_dict in all_question_dicts:
    # Iterate through each question key in the current dictionary
    for column_name in question_dict.keys():
        if column_name in df.columns:
            df[column_name] = df[column_name].astype('string')
        else:
            df[column_name] = ""

df.head(2)
# print(df.shape)

Unnamed: 0,Prospectus ID,Original Filename,Section ID,Section Title,Subsection ID,Subsection Title,Subsubsection ID,Subsubsection Title,Subsubsection Text,Market Dynamics - a,Market Dynamics - b,Market Dynamics - c,Parsing Error,Intra-Industry Competition - a,Intra-Industry Competition - b,Intra-Industry Competition - c,Regulatory Framework - a,Regulatory Framework - b,Technology Risk - a,Technology Risk - b
0,235,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,,1.1.1,,_An investment in the Notes involves a high de...,Not Relevant,Highly Relevant: the risks described below,Not Relevant,,Not Relevant,Not Relevant,Not Relevant,Highly Relevant: Subsubsection Title: ... and ...,Highly Relevant,Not Relevant,Not Relevant
1,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,"Risks Relating to the Group’s Business, Techno...",1.1.1,The Group faces significant competition in eac...,The French telecommunications market is a matu...,Highly Relevant: Various evidence throughout t...,Highly Relevant,Highly Relevant,,Highly Relevant,Highly Relevant: ...the Group also competes wi...,Highly Relevant: The exact phrases or sentence...,Highly Relevant: Several evidence are presente...,Highly Relevant,Highly Relevant: This is a highly relevant ans...,Highly Relevant: The Group also faces competit...


In [13]:
def extract_fields(response):
    # Remove any newlines and extra spaces
    response = ' '.join(response.strip().split())

    # Extract the Relevance field
    relevance_match = re.search(r'"Relevance"\s*:\s*"([^"]+)"', response)
    if relevance_match:
        relevance = relevance_match.group(1).strip()
    else:
        relevance = "Parsing Error"

    # Extract the Evidence field(s)
    evidence_match = re.search(r'"Evidence"\s*:\s*(.+?)(?:,?\s*"[^"]+"\s*:|\s*}$)', response)
    if evidence_match:
        evidence_str = evidence_match.group(1).strip()
        # Remove any trailing commas or braces
        evidence_str = evidence_str.rstrip(', }')
        # Split the evidence_str into individual evidence items
        # Evidence items are strings enclosed in double quotes
        evidence_items = re.findall(r'"([^"]+)"', evidence_str)
        evidence = evidence_items
    else:
        evidence = []

    return relevance, evidence


def analyze_prospectus_row_single_question(row, question):
    # System and user prompts
    system_prompt = "You are an expert in analyzing bond prospectuses and identifying specific risk factors."

    # Format the user prompt using the row's data
    prompt = f"""
{system_prompt}

For the following question and text, judge whether the text is "Highly Relevant", "Somewhat Relevant", or "Not Relevant".

Question:
{question}

Text:
Subsubsection Title: {row['Subsubsection Title']}
Subsubsection Text: {row['Subsubsection Text']}


Please provide your answer in the following JSON format:

{{
  "Relevance": "Highly Relevant", "Somewhat Relevant", or "Not Relevant",
  "Evidence": "The exact phrases or sentences from the document that support your assessment; otherwise, leave blank."
}}

Note: Only provide the JSON response without any additional text.
"""
    # Run the prompt through the model
    response = llm.invoke(input=prompt)

    # Parse the response
    try:
        # Extract the Relevance and Evidence fields
        relevance, evidence_list = extract_fields(response)
        # Join multiple evidence items into a single string
        evidence = '; '.join(evidence_list)
    except Exception as e:
        relevance = "Parsing Error"
        evidence = ""

    # Combine relevance and evidence
    if relevance in ["Highly Relevant", "Somewhat Relevant"] and evidence:
        combined_answer = f"{relevance}: {evidence}"
    elif relevance in ["Highly Relevant", "Somewhat Relevant"]:
        combined_answer = relevance
    elif relevance == "Not Relevant":
        combined_answer = "Not Relevant"
    else:
        combined_answer = "Parsing Error"

    # For debugging
    if combined_answer == "Parsing Error":
        print("Parsing Error encountered. Response was:")
        print(response)

    return combined_answer

**Run the LLM Processing**

In [None]:
# Iterate over each row in the DataFrame with a progress bar
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Rows"):
    for question_dict in all_question_dicts:
        for column_name, question in question_dict.items():
            # Check if the answer column is already filled
            if pd.notnull(df.at[index, column_name]) and df.at[index, column_name] != "":
                # Skip processing this row for this question
                continue
            combined_answer = analyze_prospectus_row_single_question(row, question)
            df.at[index, column_name] = combined_answer

    # Save progress every 50 rows (adjusted from 35 to match the comment)
    if (index + 1) % 2 == 0:
        df.to_csv(processed_file_path, index=False)
        #print(f"Progress saved at row {index + 1}")
        if (index + 1) % 10 == 0:
            print(f"That was 10 more. Currently: {index + 1}")


# Save the final DataFrame after processing all rows
df.to_csv(processed_file_path, index=False)
print("All rows have been processed and saved.")

Processing Rows:   0%|          | 10/7952 [00:01<17:34,  7.53it/s]

That was 10 more. Currently: 10


Processing Rows:   0%|          | 20/7952 [01:30<33:32:14, 15.22s/it]

That was 10 more. Currently: 20


Processing Rows:   0%|          | 30/7952 [04:50<55:18:22, 25.13s/it]

That was 10 more. Currently: 30


Processing Rows:   1%|          | 40/7952 [07:56<42:39:27, 19.41s/it]

That was 10 more. Currently: 40


Processing Rows:   1%|          | 50/7952 [12:06<63:02:04, 28.72s/it]

That was 10 more. Currently: 50


Processing Rows:   1%|          | 58/7952 [14:31<36:21:16, 16.58s/it]

Parsing Error encountered. Response was:
{"Relevance": "Very Relevant", "Evidence": "net financial charges, which are defined as the portion of financial charges exceeding financial income, accrued by companies that are subject to French corporate income tax..."}


Processing Rows:   1%|          | 60/7952 [15:44<54:22:32, 24.80s/it]

That was 10 more. Currently: 60


Processing Rows:   1%|          | 70/7952 [18:59<42:59:16, 19.63s/it]

That was 10 more. Currently: 70


Processing Rows:   1%|          | 80/7952 [22:28<44:07:55, 20.18s/it]

That was 10 more. Currently: 80


Processing Rows:   1%|          | 90/7952 [25:00<43:21:54, 19.86s/it]

That was 10 more. Currently: 90


Processing Rows:   1%|▏         | 100/7952 [28:37<50:15:56, 23.05s/it]

That was 10 more. Currently: 100


Processing Rows:   1%|▏         | 110/7952 [30:42<27:01:24, 12.41s/it]

That was 10 more. Currently: 110


Processing Rows:   2%|▏         | 120/7952 [34:07<37:04:54, 17.04s/it]

That was 10 more. Currently: 120


Processing Rows:   2%|▏         | 130/7952 [37:32<66:15:28, 30.49s/it]

That was 10 more. Currently: 130


Processing Rows:   2%|▏         | 140/7952 [41:02<47:12:42, 21.76s/it]

That was 10 more. Currently: 140


Processing Rows:   2%|▏         | 150/7952 [44:09<47:58:20, 22.14s/it]

That was 10 more. Currently: 150


Processing Rows:   2%|▏         | 159/7952 [46:27<35:15:52, 16.29s/it]

Parsing Error encountered. Response was:
{
  "Relevance": "Very High",
  "Evidence": "The entire text"
}


Processing Rows:   2%|▏         | 160/7952 [47:31<66:03:16, 30.52s/it]

That was 10 more. Currently: 160


Processing Rows:   2%|▏         | 170/7952 [51:19<51:17:12, 23.73s/it]

That was 10 more. Currently: 170


Processing Rows:   2%|▏         | 180/7952 [54:32<39:24:57, 18.26s/it]

That was 10 more. Currently: 180


Processing Rows:   2%|▏         | 190/7952 [56:44<25:34:20, 11.86s/it]

That was 10 more. Currently: 190


Processing Rows:   3%|▎         | 200/7952 [1:00:07<47:16:57, 21.96s/it]

That was 10 more. Currently: 200


Processing Rows:   3%|▎         | 210/7952 [1:03:55<44:22:38, 20.64s/it]

That was 10 more. Currently: 210


Processing Rows:   3%|▎         | 220/7952 [1:06:23<40:35:39, 18.90s/it]

That was 10 more. Currently: 220


Processing Rows:   3%|▎         | 230/7952 [1:09:24<38:45:41, 18.07s/it]

That was 10 more. Currently: 230


Processing Rows:   3%|▎         | 240/7952 [1:12:27<40:15:24, 18.79s/it]

That was 10 more. Currently: 240


Processing Rows:   3%|▎         | 250/7952 [1:16:20<53:05:53, 24.82s/it]

That was 10 more. Currently: 250


Processing Rows:   3%|▎         | 260/7952 [1:19:24<45:35:57, 21.34s/it]

That was 10 more. Currently: 260


Processing Rows:   3%|▎         | 270/7952 [1:22:53<39:45:00, 18.63s/it]

That was 10 more. Currently: 270


Processing Rows:   4%|▎         | 280/7952 [1:25:01<28:53:35, 13.56s/it]

That was 10 more. Currently: 280


Processing Rows:   4%|▎         | 290/7952 [1:28:49<44:18:04, 20.81s/it]

That was 10 more. Currently: 290


Processing Rows:   4%|▍         | 300/7952 [1:32:03<36:58:22, 17.39s/it]

That was 10 more. Currently: 300


Processing Rows:   4%|▍         | 310/7952 [1:35:55<37:47:29, 17.80s/it]

That was 10 more. Currently: 310


Processing Rows:   4%|▍         | 320/7952 [1:38:16<30:09:45, 14.23s/it]

That was 10 more. Currently: 320


Processing Rows:   4%|▍         | 330/7952 [1:41:53<36:51:24, 17.41s/it]

That was 10 more. Currently: 330


Processing Rows:   4%|▍         | 340/7952 [1:46:34<67:19:01, 31.84s/it]

That was 10 more. Currently: 340
Parsing Error encountered. Response was:
{"Relevance": "Very Highly Relevant", "Evidence": "The phrases and sentences explaining the limitations on the validity and enforceability of the Note Guarantees and Security Interests under Italian law."}


Processing Rows:   4%|▍         | 350/7952 [1:50:07<34:36:22, 16.39s/it]

That was 10 more. Currently: 350


Processing Rows:   5%|▍         | 360/7952 [1:54:05<55:07:10, 26.14s/it]

That was 10 more. Currently: 360


Processing Rows:   5%|▍         | 370/7952 [1:57:20<35:12:23, 16.72s/it]

That was 10 more. Currently: 370


Processing Rows:   5%|▍         | 380/7952 [2:00:14<29:50:01, 14.18s/it]

That was 10 more. Currently: 380


Processing Rows:   5%|▍         | 390/7952 [2:03:19<42:42:59, 20.34s/it]

That was 10 more. Currently: 390


Processing Rows:   5%|▌         | 400/7952 [2:07:12<43:40:11, 20.82s/it]

That was 10 more. Currently: 400


Processing Rows:   5%|▌         | 410/7952 [2:10:25<43:26:15, 20.73s/it]

That was 10 more. Currently: 410


Processing Rows:   5%|▌         | 420/7952 [2:13:38<39:36:02, 18.93s/it]

That was 10 more. Currently: 420


Processing Rows:   5%|▌         | 430/7952 [2:16:24<36:47:48, 17.61s/it]

That was 10 more. Currently: 430


Processing Rows:   6%|▌         | 440/7952 [2:20:43<47:33:36, 22.79s/it]

That was 10 more. Currently: 440


Processing Rows:   6%|▌         | 450/7952 [2:23:46<36:10:33, 17.36s/it]

That was 10 more. Currently: 450


Processing Rows:   6%|▌         | 460/7952 [2:27:26<48:22:10, 23.24s/it]

That was 10 more. Currently: 460


Processing Rows:   6%|▌         | 470/7952 [2:31:20<45:53:50, 22.08s/it]

That was 10 more. Currently: 470


Processing Rows:   6%|▌         | 480/7952 [2:35:10<60:49:09, 29.30s/it]

That was 10 more. Currently: 480


Processing Rows:   6%|▌         | 490/7952 [2:38:57<50:27:21, 24.34s/it]

That was 10 more. Currently: 490


Processing Rows:   6%|▋         | 500/7952 [2:42:00<40:25:55, 19.53s/it]

That was 10 more. Currently: 500


Processing Rows:   6%|▋         | 510/7952 [2:44:53<30:15:33, 14.64s/it]

That was 10 more. Currently: 510


Processing Rows:   7%|▋         | 520/7952 [2:46:38<18:38:50,  9.03s/it]

That was 10 more. Currently: 520


Processing Rows:   7%|▋         | 530/7952 [2:48:10<21:06:21, 10.24s/it]

That was 10 more. Currently: 530


Processing Rows:   7%|▋         | 540/7952 [2:50:18<29:42:10, 14.43s/it]

That was 10 more. Currently: 540


Processing Rows:   7%|▋         | 550/7952 [2:54:12<46:07:41, 22.43s/it]

That was 10 more. Currently: 550


Processing Rows:   7%|▋         | 560/7952 [2:56:38<27:36:20, 13.44s/it]

That was 10 more. Currently: 560


Processing Rows:   7%|▋         | 570/7952 [3:00:27<41:11:25, 20.09s/it]

That was 10 more. Currently: 570


Processing Rows:   7%|▋         | 580/7952 [3:03:07<35:00:58, 17.10s/it]

That was 10 more. Currently: 580


Processing Rows:   7%|▋         | 590/7952 [3:06:49<43:01:01, 21.04s/it]

That was 10 more. Currently: 590


Processing Rows:   8%|▊         | 600/7952 [3:10:21<46:31:22, 22.78s/it]

That was 10 more. Currently: 600


Processing Rows:   8%|▊         | 610/7952 [3:13:37<27:00:29, 13.24s/it]

That was 10 more. Currently: 610


Processing Rows:   8%|▊         | 620/7952 [3:16:10<33:53:39, 16.64s/it]

That was 10 more. Currently: 620


Processing Rows:   8%|▊         | 630/7952 [3:18:19<23:22:31, 11.49s/it]

That was 10 more. Currently: 630


Processing Rows:   8%|▊         | 640/7952 [3:21:29<37:33:30, 18.49s/it]

That was 10 more. Currently: 640


Processing Rows:   8%|▊         | 650/7952 [3:23:40<18:15:23,  9.00s/it]

That was 10 more. Currently: 650


Processing Rows:   8%|▊         | 659/7952 [3:26:25<45:33:15, 22.49s/it]

Parsing Error encountered. Response was:
{
  "Relevance": "Very Relevant",
  "Evidence": "Financial income increased by €1.0 million, or 81.6%, to €2.2 million for the year ended December 31, 2019 as compared to €1.2 million for the year ended December 31, 2018..."
}


Processing Rows:   8%|▊         | 660/7952 [3:27:28<70:05:15, 34.60s/it]

That was 10 more. Currently: 660


Processing Rows:   8%|▊         | 670/7952 [3:29:49<32:25:33, 16.03s/it]

That was 10 more. Currently: 670


Processing Rows:   9%|▊         | 680/7952 [3:31:37<19:39:30,  9.73s/it]

That was 10 more. Currently: 680


Processing Rows:   9%|▊         | 690/7952 [3:33:17<19:02:06,  9.44s/it]

That was 10 more. Currently: 690


Processing Rows:   9%|▉         | 700/7952 [3:34:56<20:58:18, 10.41s/it]

That was 10 more. Currently: 700


Processing Rows:   9%|▉         | 710/7952 [3:36:48<23:51:44, 11.86s/it]

That was 10 more. Currently: 710


Processing Rows:   9%|▉         | 720/7952 [3:40:05<37:20:16, 18.59s/it]

That was 10 more. Currently: 720


Processing Rows:   9%|▉         | 730/7952 [3:44:08<48:54:33, 24.38s/it]

That was 10 more. Currently: 730


Processing Rows:   9%|▉         | 740/7952 [3:48:48<43:11:38, 21.56s/it]

That was 10 more. Currently: 740


Processing Rows:   9%|▉         | 747/7952 [3:51:21<40:11:43, 20.08s/it]