**In this Notebook we extract the remaining SubScore Labels**

In [54]:
from langchain_ollama import OllamaLLM
from tqdm import tqdm

import json
import numpy as np
import os
import pandas as pd
import re

**Code to extract unique score combinations**

In [55]:
# Load the analyst scoring table from a local pickle and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
rms_with_fundamental_score = pd.read_pickle('./rms_with_fundamental_score.pkl')
rms_with_fundamental_score.head(15)

Unnamed: 0,RmsId,ScoringDate,CategoryGroup,Category,Score,TaggedCharacteristics,CompanyName,Status,SharePointLink,SharePointLinkTruncated
0,194,2021-04-15,Industry,Market Dynamics,2.0,"[{""CharacteristicText"":""Positive demographic, ...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194
1,194,2021-04-15,Industry,Intra-Industry Competition,2.0,"[{""CharacteristicText"":""Market share is consol...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194
2,194,2021-04-15,Industry/Company,Technology Risk,3.0,"[{""CharacteristicText"":""Company is viewed as a...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194
3,194,2021-04-15,Industry/Company,Regulatory Framework,3.0,"[{""CharacteristicText"":""Low dependecy on regul...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194
4,194,2021-04-15,Company,Business Model,2.0,"[{""CharacteristicText"":""Well diversified (prod...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194
5,194,2021-04-15,Company,Management & Ownership,3.0,"[{""CharacteristicText"":""Industry experiened PE...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194
6,194,2021-04-15,Company,Competitive Positioning,3.0,"[{""CharacteristicText"":""Leading position or ga...",Nexi,Active,https://c4.sharepoint.com/sites/194/,https://c4.sharepoint.com/sites/194
7,235,2020-11-27,Industry,Market Dynamics,2.0,"[{""CharacteristicText"":""Low cyclicality of pro...",TalkTalk,Active,https://c4.sharepoint.com/sites/235/,https://c4.sharepoint.com/sites/235
8,235,2020-11-27,Industry,Intra-Industry Competition,4.0,"[{""CharacteristicText"":""Market pricing has not...",TalkTalk,Active,https://c4.sharepoint.com/sites/235/,https://c4.sharepoint.com/sites/235
9,235,2020-11-27,Industry/Company,Regulatory Framework,2.0,"[{""CharacteristicText"":""High dependecy on regu...",TalkTalk,Active,https://c4.sharepoint.com/sites/235/,https://c4.sharepoint.com/sites/235


In [56]:
# Keep one row per distinct (CategoryGroup, Category, TaggedCharacteristics) triple
score_key_columns = ['CategoryGroup', 'Category', 'TaggedCharacteristics']
unique_score_combinations = rms_with_fundamental_score[score_key_columns].drop_duplicates()

# Collapse every run of CR/LF characters in the raw TaggedCharacteristics string to one space
unique_score_combinations = unique_score_combinations.assign(
    TaggedCharacteristics=unique_score_combinations['TaggedCharacteristics'].str.replace(
        r'[\r\n]+', ' ', regex=True
    )
)

# Function to expand TaggedCharacteristics if it's a JSON string with multiple items
def expand_tagged_characteristics(row):
    """Expand one scoring row into one record per tagged characteristic.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'CategoryGroup', 'Category' and 'TaggedCharacteristics'.
        'TaggedCharacteristics' is normally a JSON string holding a list of
        objects with 'CharacteristicText' and 'CharacteristicInfluence' keys.

    Returns
    -------
    pandas.DataFrame
        Columns 'CategoryGroup', 'Category', 'TaggedCharacteristics' (the
        characteristic text with CR/LF replaced by spaces) and
        'CharacteristicInfluence'. One row per list item; a single row when
        the value is not a JSON list (unparseable text, missing value, or a
        single JSON object).
    """
    output_columns = ['CategoryGroup', 'Category', 'TaggedCharacteristics', 'CharacteristicInfluence']

    def _flatten(value):
        # Only strings can carry newlines; None/NaN/numbers pass through untouched
        # instead of raising AttributeError on `.replace`.
        if isinstance(value, str):
            return value.replace('\r', ' ').replace('\n', ' ')
        return value

    def _record(text, influence):
        return {
            'CategoryGroup': row['CategoryGroup'],
            'Category': row['Category'],
            'TaggedCharacteristics': _flatten(text),
            'CharacteristicInfluence': influence,
        }

    raw_value = row['TaggedCharacteristics']
    row_level_influence = row.get('CharacteristicInfluence', None)

    try:
        characteristics = json.loads(raw_value)
    except (json.JSONDecodeError, TypeError):
        # Not JSON (or not even a string): keep the original value as a single record
        return pd.DataFrame([_record(raw_value, row_level_influence)], columns=output_columns)

    # A single JSON object is treated like a one-element list so that a raw dict
    # never ends up in the frame (dicts are unhashable and break drop_duplicates).
    if isinstance(characteristics, dict):
        characteristics = [characteristics]

    if isinstance(characteristics, list):
        records = []
        for char in characteristics:
            if isinstance(char, dict):
                # .get() tolerates items that lack either key
                records.append(_record(char.get('CharacteristicText', None),
                                       char.get('CharacteristicInfluence', None)))
            else:
                # Plain values inside the list are kept as their own record
                records.append(_record(char, row_level_influence))
        return pd.DataFrame(records, columns=output_columns)

    # Scalar JSON value (string, number, ...)
    return pd.DataFrame([_record(characteristics, row_level_influence)], columns=output_columns)

# Expand every row into its own small frame, then stack the frames into one table
expanded_frames = unique_score_combinations.apply(expand_tagged_characteristics, axis=1).to_list()
expanded_unique_score_combinations = pd.concat(expanded_frames, ignore_index=True)

# Order by category and influence, remove exact duplicates, and persist as CSV
sort_keys = ['CategoryGroup', 'Category', 'CharacteristicInfluence']
unique_score_combinations = expanded_unique_score_combinations.sort_values(by=sort_keys)
unique_score_combinations = unique_score_combinations.drop_duplicates()
unique_score_combinations.to_csv('unique_score_combinations.csv', index=False)

**Defining the questions and instantiating the LLM**

In [57]:
# Define the questions corresponding to each column.
# Each key is the df_LLM column that stores the answer; each value is the
# yes/no style question sent to the LLM for that column.
questions_market_dynamics = {
    "Market Dynamics - a": "Does the text mention that the company is exposed to risks associated with cyclical products?",
    "Market Dynamics - b": "Does the text mention risks related to demographic or structural trends affecting the market?",
    "Market Dynamics - c": "Does the text mention risks due to seasonal volatility in the industry?"
}
questions_intra_industry_competition = {
    "Intra-Industry Competition - a": "Does the text mention that market pricing for the company's products or services is irrational or not based on fundamental factors?",
    "Intra-Industry Competition - b": "Does the text mention that the market is highly fragmented with no clear leader or that there is only one dominant leader?",
    "Intra-Industry Competition - c": "Does the text mention low barriers to entry in the industry, making it easy for new competitors to enter the market?"
}
questions_regulatory_framework = {
    "Regulatory Framework - a": "Does the text mention that the industry is subject to a high degree of regulatory scrutiny?",
    "Regulatory Framework - b": "Does the text mention a high dependency on regulation or being a beneficiary from regulation in an unstable regulatory environment?"
}
questions_technology_risk = {
    "Technology Risk - a": "Does the text mention that the industry is susceptible to rapid technological advances or innovations?",
    # The analyst characteristic is "disruptee/threatened by technological change";
    # the previous wording asked about a "disruptor", which is the opposite case.
    # Answers cached under the old wording are not regenerated automatically.
    "Technology Risk - b": "Does the text mention that the company is perceived as a disruptee (at risk of being disrupted) or is threatened by emerging technological changes?"
}

# Iteration order here determines the order in which questions are asked per row
all_question_dicts = [
    questions_market_dynamics,
    questions_intra_industry_competition,
    questions_regulatory_framework,
    questions_technology_risk
]

# Original questions
# Short characteristic wording for each sub-score, keyed exactly like the
# question dictionaries above. These dictionaries are not referenced by any
# other code visible in this notebook; they serve as a reference for how the
# longer LLM questions were phrased.
questions_market_dynamics_original = {
    "Market Dynamics - a": "Exposure to cyclical products",
    "Market Dynamics - b": "Impact of demographic and structural trends",
    "Market Dynamics - c": "Seasonal industry volatility"
}
questions_intra_industry_competition_original = {
    "Intra-Industry Competition - a": "Market pricing has not shown to be rational",
    "Intra-Industry Competition - b": "Highly fragmented market with no clear leader or only one leader",
    "Intra-Industry Competition - c": "Low barriers to entry"
}
questions_regulatory_framework_original = {
    "Regulatory Framework - a": "Industry has high degree of regulatory scrutiny",
    "Regulatory Framework - b": "High dependency on regulation or is a beneficiary from regulation in an unstable regulatory environment"
}
questions_technology_risk_original = {
    "Technology Risk - a": "Industry susceptibility to technological advances",
    "Technology Risk - b": "Company viewed as a disruptee/threatened by technological change"
}


In [58]:
# Initialize the language model
llm = OllamaLLM(model="llama3.2")

# The processed file doubles as a resume cache: it holds the answers produced so far
processed_file_path = '../data/prospectuses_data_processed.csv'
raw_file_path = '../data/prospectuses_data.csv'

# Prospectus IDs such as '25_1' are identifiers, not numbers; read them as strings
# so comparisons like == "16" do not depend on pandas' type inference.
id_dtypes = {'Prospectus ID': str}

if os.path.exists(processed_file_path):
    df_LLM = pd.read_csv(processed_file_path, dtype=id_dtypes)
else:
    print("Processed file not found. Processing raw data...")
    df_LLM = pd.read_csv(raw_file_path, dtype=id_dtypes)
    # Drop rows whose Section ID is the "failed parsing" marker; .copy() gives an
    # independent frame so the column assignments below do not target a slice.
    df_LLM = df_LLM[df_LLM['Section ID'] != "failed parsing"].copy()

# Make sure every answer column exists and holds text
for question_dict in all_question_dicts:
    for column_name in question_dict:
        if column_name in df_LLM.columns:
            df_LLM[column_name] = df_LLM[column_name].astype('string')
        else:
            # Empty string marks "not answered yet" for the processing loop
            df_LLM[column_name] = ""

df_LLM["Prospectus ID"].unique()

array(['235', '16', '25_1', '31', '36', '43', '44', '44_1', '57', '79',
       '107', '124', '127', '130', '130_1', '131', '131_1', '133_1',
       '136', '137', '139', '151', '151_2', '155', '155_1', '156', '162',
       '164', '166', '172', '177', '187', '199', '201', '204', '207',
       '211', '213', '219', '221', '221_1', '226', '235_1', '250', '251',
       '252', '253_1', '253_2', '258', '261', '267_1', '270', '281',
       '284', '287', '297', '299', '303_1', '306', '308', '317', '322',
       '328', '350', '352', '357', '361', '364', '367', '377', '398',
       '407', '429', '430_1', '433', '440', '441', '476_1', '477', '504',
       '505', '509', '519', '587', '587_2', '623', '624', '625', '625_1',
       '629', '630_1', '640', '641', '642', '643', '657', '661', '661_1',
       '666', '668', '887', '952_2', '977', '989', '999', '1044', '1050',
       '1052', '1053', '1065', '1074', '1096', '1108', '1122_3', '1136',
       '1145', '1162', '1162_1', '1211', '1235', '1240', '127

In [59]:
df_LLM[df_LLM["Prospectus ID"]=="16"]

Unnamed: 0,Prospectus ID,Original Filename,Section ID,Section Title,Subsection ID,Subsection Title,Subsubsection ID,Subsubsection Title,Subsubsection Text,Market Dynamics - a,Market Dynamics - b,Market Dynamics - c,Parsing Error,Intra-Industry Competition - a,Intra-Industry Competition - b,Intra-Industry Competition - c,Regulatory Framework - a,Regulatory Framework - b,Technology Risk - a,Technology Risk - b
1,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,"Risks Relating to the Group’s Business, Techno...",1.1.1,The Group faces significant competition in eac...,The French telecommunications market is a matu...,Highly Relevant: Various evidence throughout t...,Highly Relevant,Highly Relevant,,Highly Relevant,Highly Relevant: ...the Group also competes wi...,Highly Relevant: The exact phrases or sentence...,Highly Relevant: Several evidence are presente...,Highly Relevant,Highly Relevant: This is a highly relevant ans...,Highly Relevant: The Group also faces competit...
2,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,"Risks Relating to the Group’s Business, Techno...",1.1.2,The deployment of fiber optic networks and/or ...,The Group believes that one of its major compe...,Highly Relevant: Although the Group is prepari...,Highly Relevant: deployment of fiber optic net...,Highly Relevant: The mention of 'deployment of...,,Highly Relevant: The Group believes that one o...,Highly Relevant: The deployment of fiber optic...,Highly Relevant: Subsubsection Title: The depl...,Highly Relevant: FTTH deployment by the Group'...,Highly Relevant: The Group believes that one o...,Highly Relevant: The deployment of fiber optic...,Highly Relevant: The deployment of fiber optic...
3,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,"Risks Relating to the Group’s Business, Techno...",1.1.3,"Changes in competitive offerings for content, ...",The market for content is intensely competitiv...,Highly Relevant: The market for content is int...,Highly Relevant: Changes in competitive offeri...,Not Relevant,,"Highly Relevant: piracy-based video offerings,...",Highly Relevant: The market for content is int...,Not Relevant,Highly Relevant: The market for content is int...,Somewhat Relevant: None,Highly Relevant: The market for content is int...,"Highly Relevant: piracy-based video offerings,..."
4,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,"Risks Relating to the Group’s Business, Techno...",1.1.4,Acquisitions and other strategic transactions ...,"Historically, the Group’s business has grown, ...",Highly Relevant: The section mentions that the...,Highly Relevant: Demographic and structural tr...,Somewhat Relevant,,Highly Relevant: The text mentions that market...,Highly Relevant: The text mentions acquisition...,"Highly Relevant: The mention of \; , \; , \; ,...",Highly Relevant: Acquisitions and other strate...,Highly Relevant: The risks associated with reg...,Not Relevant,Highly Relevant: The Group may experience diff...
5,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,"Risks Relating to the Group’s Business, Techno...",1.1.5,The Group might not be able to effectively imp...,The Group has based its strategy on its vision...,Not Relevant,Highly Relevant: the development of the Group'...,Not Relevant,,Somewhat Relevant,Not Relevant,Not Relevant,Not Relevant,Somewhat Relevant,Not Relevant,"Highly Relevant: changing consumer behavior, i..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.5,Risks Relating to the Notes,1.5.21,Recoveries may be limited if certain provision...,Pursuant to the terms of the Intercreditor Agr...,Not Relevant,Highly Relevant: the enforceability of intercr...,Not Relevant,,Not Relevant,Not Relevant,Not Relevant,Somewhat Relevant: If certain provisions of th...,Highly Relevant: If any of the security intere...,Not Relevant,Not Relevant
85,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.5,Risks Relating to the Notes,1.5.22,"Transfers of the Notes are restricted, which m...",The Notes are being offered and sold pursuant ...,Not Relevant,Not Relevant,Not Relevant,,Not Relevant,Not Relevant,Not Relevant,Not Relevant,Not Relevant,Not Relevant,Not Relevant
86,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.5,Risks Relating to the Notes,1.5.23,You may be unable to recover in civil proceedi...,The Issuer is incorporated under the laws of F...,Not Relevant,Not Relevant,Not Relevant,,Not Relevant,Not Relevant,Not Relevant,Not Relevant,Highly Relevant: service of process; enforceab...,Not Relevant,Not Relevant
87,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.5,Risks Relating to the Notes,1.5.24,The Notes will initially be held in book-entry...,Owners of the book-entry interests will not be...,Not Relevant,Not Relevant,Not Relevant,,Not Relevant,Not Relevant,Not Relevant,Not Relevant,Highly Relevant: you must rely on the procedur...,Not Relevant,Not Relevant


In [60]:
def extract_fields(response):
    """Extract the relevance label and evidence snippets from an LLM reply.

    The reply is expected to contain a JSON object with a "Relevance" string
    and an "Evidence" string or list of strings. Parsing is attempted in two
    stages:

    1. The span from the first '{' to the last '}' is parsed as JSON. This
       handles escaped quotes inside the evidence and replies wrapped in a
       Markdown code fence or surrounded by extra text.
    2. If that fails (for example the model echoed the malformed template from
       the prompt), the fields are pulled out with regular expressions.

    Parameters
    ----------
    response : str
        Raw text returned by the model.

    Returns
    -------
    tuple[str, list[str]]
        (relevance, evidence). relevance is "Parsing Error" when no
        "Relevance" field is found; evidence is a possibly empty list.
    """
    # Remove any newlines and extra spaces (raw newlines inside a JSON string
    # would otherwise make it invalid JSON)
    response = ' '.join(response.strip().split())

    # Stage 1: strict JSON on the outermost {...} span
    start, end = response.find('{'), response.rfind('}')
    if start != -1 and end > start:
        try:
            payload = json.loads(response[start:end + 1])
        except json.JSONDecodeError:
            payload = None
        if isinstance(payload, dict):
            parsed_relevance = payload.get("Relevance")
            if isinstance(parsed_relevance, str) and parsed_relevance.strip():
                raw_evidence = payload.get("Evidence")
                if isinstance(raw_evidence, str):
                    raw_evidence = [raw_evidence]
                elif not isinstance(raw_evidence, list):
                    raw_evidence = []
                evidence = [
                    str(item).strip()
                    for item in raw_evidence
                    if item is not None and str(item).strip()
                ]
                return parsed_relevance.strip(), evidence

    # Stage 2: regex fallback for replies that are not valid JSON
    relevance_match = re.search(r'"Relevance"\s*:\s*"([^"]+)"', response)
    relevance = relevance_match.group(1).strip() if relevance_match else "Parsing Error"

    # The evidence value runs up to the next "key": or to the last closing
    # brace; text after that brace (e.g. a closing code fence) is tolerated.
    evidence_match = re.search(r'"Evidence"\s*:\s*(.+?)(?:,?\s*"[^"]+"\s*:|\s*}[^}]*$)', response)
    evidence = []
    if evidence_match:
        # Remove any trailing commas or braces
        evidence_str = evidence_match.group(1).strip().rstrip(', }')
        # Quoted items; backslash-escaped characters stay inside the item
        quoted_items = re.findall(r'"((?:[^"\\]|\\.)+)"', evidence_str)
        evidence = [item.replace('\\"', '"') for item in quoted_items]

    return relevance, evidence


def analyze_prospectus_row_single_question(row, question):
    """Ask the LLM whether one prospectus subsubsection is relevant to one question.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'Subsubsection Title' and 'Subsubsection Text'.
    question : str
        The question inserted into the prompt.

    Returns
    -------
    str
        One of:
        - "Highly Relevant: <evidence>" / "Somewhat Relevant: <evidence>"
          (evidence items joined by "; "),
        - "Highly Relevant" / "Somewhat Relevant" when no evidence was returned,
        - "Not Relevant",
        - "Parsing Error" when no recognised relevance label could be extracted
          (the raw response is printed in that case).

    Notes
    -----
    Uses the notebook-level `llm` object and `extract_fields` helper.
    """
    # System and user prompts
    system_prompt = "You are an expert in analyzing bond prospectuses and identifying specific risk factors."

    # Format the user prompt using the row's data
    prompt = f"""
{system_prompt}

For the following question and text, judge whether the text is "Highly Relevant", "Somewhat Relevant", or "Not Relevant".

Question:
{question}

Text:
Subsubsection Title: {row['Subsubsection Title']}
Subsubsection Text: {row['Subsubsection Text']}


Please provide your answer in the following JSON format:

{{
  "Relevance": "Highly Relevant", "Somewhat Relevant", or "Not Relevant",
  "Evidence": "The exact phrases or sentences from the document that support your assessment; otherwise, leave blank."
}}

Note: Only provide the JSON response without any additional text.
"""
    # Run the prompt through the model
    response = llm.invoke(input=prompt)

    # Parse the response; any failure inside the parser is reported as a parsing error
    try:
        relevance, evidence_list = extract_fields(response)
        # Join multiple evidence items into a single string
        evidence = '; '.join(evidence_list)
    except Exception:
        relevance = "Parsing Error"
        evidence = ""

    # Map the model's label onto its canonical spelling. Variants such as
    # "highly relevant" or "Not Relevant." would otherwise be stored as
    # "Parsing Error"; later steps match on the exact canonical prefix.
    canonical_labels = {
        label.lower(): label
        for label in ("Highly Relevant", "Somewhat Relevant", "Not Relevant")
    }
    relevance = canonical_labels.get(str(relevance).strip().rstrip('.').lower(), "Parsing Error")

    # Combine relevance and evidence
    if relevance in ("Not Relevant", "Parsing Error"):
        combined_answer = relevance
    elif evidence:
        combined_answer = f"{relevance}: {evidence}"
    else:
        combined_answer = relevance

    # For debugging
    if combined_answer == "Parsing Error":
        print("Parsing Error encountered. Response was:")
        print(response)

    return combined_answer

**Run the LLM Processing**

In [61]:
import time

# Checkpoint / throttling settings for the LLM pass
SAVE_EVERY_N_ROWS = 50       # checkpoint cadence, counted in visited rows
PAUSE_AFTER_NEW_ROWS = 10    # checkpoint (and optionally pause) after this many newly answered rows
PAUSE_SECONDS = 0            # set to e.g. 30 to pause between batches; 0 disables the pause

# Number of rows that received at least one new answer since the last batch checkpoint
new_rows_processed = 0
# True while df_LLM holds answers that have not been written to disk yet
unsaved_changes = False

# Iterate over each row in the DataFrame with a progress bar.
# `position` counts visited rows; the index label is not used for the cadence
# because it is not contiguous once rows have been filtered out.
row_iterator = tqdm(df_LLM.iterrows(), total=df_LLM.shape[0], desc="Processing Rows")
for position, (index, row) in enumerate(row_iterator, start=1):
    row_processed = False  # did this row get any new answer?

    for question_dict in all_question_dicts:
        for column_name, question in question_dict.items():
            existing_answer = df_LLM.at[index, column_name]
            # Skip questions that already have an answer (resume support)
            if pd.notnull(existing_answer) and existing_answer != "":
                continue
            df_LLM.at[index, column_name] = analyze_prospectus_row_single_question(row, question)
            row_processed = True

    if row_processed:
        new_rows_processed += 1
        unsaved_changes = True

    # Periodic checkpoint, skipped when nothing changed since the last write
    if position % SAVE_EVERY_N_ROWS == 0 and unsaved_changes:
        df_LLM.to_csv(processed_file_path, index=False)
        unsaved_changes = False

    # Batch checkpoint after a number of newly processed rows
    if new_rows_processed >= PAUSE_AFTER_NEW_ROWS:
        if unsaved_changes:
            df_LLM.to_csv(processed_file_path, index=False)
            unsaved_changes = False
        if PAUSE_SECONDS > 0:
            print(f"Processed {PAUSE_AFTER_NEW_ROWS} new rows. Pausing for {PAUSE_SECONDS} seconds.")
            time.sleep(PAUSE_SECONDS)
        else:
            print(f"Processed {PAUSE_AFTER_NEW_ROWS} new rows. Checkpoint saved.")
        new_rows_processed = 0  # Reset counter

# Save the final DataFrame after processing all rows (also creates the
# processed file on a first run where every answer already existed)
df_LLM.to_csv(processed_file_path, index=False)
print("All rows have been processed and saved.")

Processing Rows: 100%|██████████| 7952/7952 [00:54<00:00, 146.37it/s]


All rows have been processed and saved.


**Create labels in unique score combinations**

In [62]:
import string
import ast

# Read the CSV into df_labels.
# This is the same relative path that the extraction cell writes with to_csv,
# so that cell has to have been run at least once before this one.
df_labels = pd.read_csv('./unique_score_combinations.csv')

# Clean the TaggedCharacteristics in df_labels
def clean_text(text):
    """Normalise a characteristic text so it can be used as a merge key.

    Carriage returns and line feeds are each replaced by a space, the same
    substitution the label catalogue applies when it is built, so a text
    containing a line break yields the same key on both sides of the merge.
    Leading and trailing whitespace is then stripped.

    Parameters
    ----------
    text : str or any
        The text to clean. Non-string values (e.g. NaN read back from a CSV)
        are returned unchanged instead of raising.

    Returns
    -------
    str or any
        The cleaned string, or the input itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return text.replace('\r', ' ').replace('\n', ' ').strip()

# Apply clean_text to the label texts; the same function is later applied to
# CharacteristicText, which is merged against this column.
df_labels['TaggedCharacteristics'] = df_labels['TaggedCharacteristics'].apply(clean_text)

# Assign letters to labels
positive_letters = list(string.ascii_uppercase)
negative_letters = list(string.ascii_lowercase)

def _letter_sequence(n, alphabet):
    """Return n spreadsheet-style codes built from alphabet: a..z, aa, ab, ..."""
    codes = []
    base = len(alphabet)
    for position in range(n):
        code = ''
        remaining = position
        while True:
            remaining, digit = divmod(remaining, base)
            code = alphabet[digit] + code
            if remaining == 0:
                break
            remaining -= 1  # bijective numbering: there is no "zero" letter
        codes.append(code)
    return codes

def assign_letters(group):
    """Give every row of one (Category, CharacteristicInfluence) group a letter and a label.

    Intended for use with groupby([...]).apply, which sets group.name to the
    (Category, CharacteristicInfluence) key tuple.

    Rows are lettered in their existing order. Groups whose influence is
    'Positive' get upper-case letters, all other groups lower-case letters.
    Groups with more than 26 rows continue with two-letter codes (aa, ab, ...)
    instead of failing on a length mismatch.

    Returns
    -------
    pandas.DataFrame
        A copy of group with added columns 'letter' and 'Label'
        ('<Category>.<letter>').
    """
    category_key, influence_key = group.name[0], group.name[1]
    alphabet = positive_letters if influence_key == 'Positive' else negative_letters
    row_count = len(group)

    group = group.copy()
    # Some pandas versions hand apply() a group without the grouping columns;
    # restore them from the group key so the label and later merges still work.
    if 'Category' not in group.columns:
        group.insert(0, 'Category', category_key)
    if 'CharacteristicInfluence' not in group.columns:
        group['CharacteristicInfluence'] = influence_key

    group['letter'] = _letter_sequence(row_count, alphabet)
    group['Label'] = group['Category'] + '.' + group['letter']
    return group

# Letter the characteristics within each (Category, CharacteristicInfluence) group,
# then flatten the result back to a plain row index.
# NOTE(review): the recorded run shows a warning pointing at this line — presumably pandas'
# deprecation of apply() operating on the grouping columns; confirm against the installed version.
# NOTE(review): groupby drops rows with a missing key by default, so characteristics without a
# CharacteristicInfluence would get no label — verify that this is intended.
df_labels = df_labels.groupby(['Category', 'CharacteristicInfluence']).apply(assign_letters).reset_index(drop=True)
df_labels.head()

  df_labels = df_labels.groupby(['Category', 'CharacteristicInfluence']).apply(assign_letters).reset_index(drop=True)


Unnamed: 0,CategoryGroup,Category,TaggedCharacteristics,CharacteristicInfluence,letter,Label
0,Company,Business Model,High customer and/or (critical) supplier conc...,Negative,a,Business Model.a
1,Company,Business Model,Sales are not recurring in nature,Negative,b,Business Model.b
2,Company,Business Model,Weak bargaining power,Negative,c,Business Model.c
3,Company,Business Model,Project based revenues / lumpy revenues,Negative,d,Business Model.d
4,Company,Business Model,Low flexibility of cost base,Negative,e,Business Model.e


In [63]:
# Work on a copy of the loaded scoring table so rms_with_fundamental_score itself is not modified
df = rms_with_fundamental_score.copy()

# Parse the TaggedCharacteristics column
def parse_tagged_characteristics(s):
    """Parse one TaggedCharacteristics cell into Python objects.

    The cell normally holds a JSON array of characteristic objects. JSON is
    tried first so that the literals null/true/false are understood; a Python
    literal (e.g. a repr with single quotes) is accepted as a fallback.

    Parameters
    ----------
    s : str, list or any
        Raw cell value. A list is returned as-is; other non-string values
        (None, NaN, ...) yield an empty list.

    Returns
    -------
    list or other parsed value
        The parsed content, or [] when the text cannot be parsed.
    """
    if isinstance(s, list):
        return s
    if not isinstance(s, str):
        return []

    # Raw line breaks inside a string value make the text unparseable;
    # collapse them the same way the label catalogue does.
    normalized = re.sub(r'[\r\n]+', ' ', s)

    try:
        return json.loads(normalized)
    except json.JSONDecodeError:
        pass

    try:
        return ast.literal_eval(normalized)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Unparseable text: no characteristics for this row
        return []

def _characteristic_field(entry, field):
    """Return entry[field] for dict entries ('' when the key is absent); '' for anything else."""
    if isinstance(entry, dict):
        return entry.get(field, '')
    return ''

def _distinct_labels(labels):
    """Distinct non-null labels of one group, in order of first appearance."""
    return labels.dropna().unique().tolist()

# Turn each cell into a list of characteristic dicts, then give every dict its own row
df['TaggedCharacteristics'] = df['TaggedCharacteristics'].apply(parse_tagged_characteristics)
df = df.explode('TaggedCharacteristics')

# Pull the two fields of interest out of each characteristic dict
df['CharacteristicText'] = df['TaggedCharacteristics'].apply(_characteristic_field, field='CharacteristicText')
df['CharacteristicInfluence'] = df['TaggedCharacteristics'].apply(_characteristic_field, field='CharacteristicInfluence')

# Normalise the text so it matches the cleaned texts in df_labels
df['CharacteristicText'] = df['CharacteristicText'].apply(clean_text)

# Attach the label of every characteristic; rows without a match keep a missing Label
merge_keys_left = ['Category', 'CharacteristicText', 'CharacteristicInfluence']
merge_keys_right = ['Category', 'TaggedCharacteristics', 'CharacteristicInfluence']
merged_df = df.merge(df_labels, left_on=merge_keys_left, right_on=merge_keys_right, how='left')

# One row per (RmsId, ScoringDate) holding the list of distinct labels
labels_per_scoring = merged_df.groupby(['RmsId', 'ScoringDate'])['Label']
grouped_df = labels_per_scoring.apply(_distinct_labels).reset_index()

grouped_df.head(1)

Unnamed: 0,RmsId,ScoringDate,Label
0,1,2021-01-04,"[Market Dynamics.B, Market Dynamics.A, Market ..."


In [83]:
df_LLM[df_LLM["Prospectus ID"]=="16"].head(1)

Unnamed: 0,Prospectus ID,Original Filename,Section ID,Section Title,Subsection ID,Subsection Title,Subsubsection ID,Subsubsection Title,Subsubsection Text,Market Dynamics - a,Market Dynamics - b,Market Dynamics - c,Parsing Error,Intra-Industry Competition - a,Intra-Industry Competition - b,Intra-Industry Competition - c,Regulatory Framework - a,Regulatory Framework - b,Technology Risk - a,Technology Risk - b
1,16,Final Offerings 2020.pdf,1,RISK FACTORS,1.1,"Risks Relating to the Group’s Business, Techno...",1.1.1,The Group faces significant competition in eac...,The French telecommunications market is a matu...,Highly Relevant: Various evidence throughout t...,Highly Relevant,Highly Relevant,,Highly Relevant,Highly Relevant: ...the Group also competes wi...,Highly Relevant: The exact phrases or sentence...,Highly Relevant: Several evidence are presente...,Highly Relevant,Highly Relevant: This is a highly relevant ans...,Highly Relevant: The Group also faces competit...


In [74]:
df[df["RmsId"]==16][['RmsId', 'ScoringDate', 'CategoryGroup', 'Category',
       'TaggedCharacteristics',  'CharacteristicText',
       'CharacteristicInfluence']].head()

Unnamed: 0,RmsId,ScoringDate,CategoryGroup,Category,TaggedCharacteristics,CharacteristicText,CharacteristicInfluence
133,16,2020-11-27,Industry,Market Dynamics,{'CharacteristicText': 'Low cyclicality of pro...,Low cyclicality of product demand,Positive
133,16,2020-11-27,Industry,Market Dynamics,{'CharacteristicText': 'Limited seasonaility a...,Limited seasonaility and stable working capital,Positive
134,16,2020-11-27,Industry,Intra-Industry Competition,{'CharacteristicText': 'Market share is consol...,Market share is consolidated with top 3-5 play...,Positive
134,16,2020-11-27,Industry,Intra-Industry Competition,{'CharacteristicText': 'Market pricing has not...,Market pricing has not shown to be rational,Negative
135,16,2020-11-27,Industry/Company,Technology Risk,{'CharacteristicText': 'Industry is susceptibl...,Industry is susceptible to technological advances,Negative


In [79]:
grouped_df[grouped_df["RmsId"]==16].head(1).values

array([[16, datetime.date(2020, 11, 27),
        list(['Market Dynamics.B', 'Market Dynamics.C', 'Intra-Industry Competition.A', 'Intra-Industry Competition.a', 'Technology Risk.a', 'Management & Ownership.c', 'Competitive Positioning.a'])]],
      dtype=object)

In [86]:
# The df_LLM answer columns that take part in the evaluation.
# The names are the same strings used as keys in the question dictionaries
# (the columns the LLM pass fills in).
specified_columns = [
    'Market Dynamics - a', 'Market Dynamics - b', 'Market Dynamics - c',
    'Intra-Industry Competition - a', 'Intra-Industry Competition - b', 'Intra-Industry Competition - c',
    'Regulatory Framework - a', 'Regulatory Framework - b',
    'Technology Risk - a', 'Technology Risk - b'
]

# Map df_LLM columns to labels in the format 'Category.letter'
def column_to_label(col_name):
    """Convert a column name such as 'Market Dynamics - a' into the label 'Market Dynamics.a'."""
    dotted = '.'.join(col_name.split(' - '))
    return dotted.strip()

# Column name -> label, plus the flat list of labels under evaluation
label_mapping = dict(zip(specified_columns, map(column_to_label, specified_columns)))
all_labels = list(label_mapping.values())

# Step 2: Build a Mapping from RmsId to Analyst-Assigned Labels

def _keep_labels_of_interest(labels):
    """Keep only the labels that are under evaluation, preserving their order."""
    return [label for label in labels if label in all_labels]

# Restrict every analyst label list to the labels of interest
grouped_df['Analyst_Labels'] = grouped_df['Label'].apply(_keep_labels_of_interest)

# When an RmsId occurs in several rows, the last row's list is the one kept
analyst_labels_dict = {
    rms_id: labels
    for rms_id, labels in zip(grouped_df['RmsId'], grouped_df['Analyst_Labels'])
}

# Step 3: Process df_LLM to Extract LLM-Assigned Labels

# Function to extract LLM-assigned labels per Prospectus ID
def get_LLM_labels_for_prospectus(df, label_columns, label_mapping, relevance_prefixes=('Highly Relevant',)):
    """Collect the labels the LLM assigned to one prospectus.

    A label counts as assigned when at least one row of df has, in the
    label's column, an answer starting with one of relevance_prefixes.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows (subsubsections) of a single prospectus.
    label_columns : iterable of str
        Answer columns to inspect.
    label_mapping : dict
        Maps each column name to its label.
    relevance_prefixes : str or iterable of str, optional
        Answer prefixes that count as a positive. The default counts only
        'Highly Relevant' answers; pass ('Highly Relevant', 'Somewhat Relevant')
        to count both.

    Returns
    -------
    set
        Labels with at least one matching answer.
    """
    if isinstance(relevance_prefixes, str):
        relevance_prefixes = (relevance_prefixes,)

    assigned_labels = set()
    for col in label_columns:
        label = label_mapping[col]
        # astype(str) turns missing answers into text that matches no prefix
        answers = df[col].astype(str)
        # True when any row's answer starts with an accepted prefix
        relevant = any(answers.str.startswith(prefix).any() for prefix in relevance_prefixes)

        if relevant:
            assigned_labels.add(label)
    return assigned_labels

# Prospectus ID -> set of labels the LLM assigned anywhere in that prospectus
LLM_labels_dict = {
    prospectus_id: get_LLM_labels_for_prospectus(group, specified_columns, label_mapping)
    for prospectus_id, group in df_LLM.groupby('Prospectus ID')
}

# Step 4: Map Prospectus ID to RmsId

def get_RmsId_from_ProspectusID(prospectus_id):
    """Return the integer before the first underscore of a prospectus ID ('25_1' -> 25)."""
    base_id, _, _ = str(prospectus_id).partition('_')
    return int(base_id)

# Step 5: Construct DataFrame for Confusion Matrix Calculation

def _iter_confusion_rows():
    """Yield one record per (prospectus, label) with the LLM and analyst verdicts."""
    for prospectus_id, llm_labels in LLM_labels_dict.items():
        rms_id = get_RmsId_from_ProspectusID(prospectus_id)
        # Unknown RmsIds have no analyst labels at all
        analyst_labels = set(analyst_labels_dict.get(rms_id, []))
        for label in all_labels:
            yield {
                'RmsId': rms_id,
                'Prospectus ID': prospectus_id,
                'Label': label,
                'LLM_Assigned': label in llm_labels,
                'Analyst_Assigned': label in analyst_labels
            }

data = list(_iter_confusion_rows())

df_confusion = pd.DataFrame(data)

# Step 6: Calculate the Confusion Matrix and Metrics

# Function to compute metrics
def compute_metrics(tp, fp, fn, tn):
    """Compute precision, recall, F1 score and accuracy from confusion counts.

    Parameters
    ----------
    tp, fp, fn, tn : int
        True-positive, false-positive, false-negative and true-negative counts.

    Returns
    -------
    tuple
        (precision, recall, f1_score, accuracy). A metric whose denominator is
        zero is returned as NaN; this includes accuracy when all four counts
        are zero. F1 is NaN whenever precision or recall is NaN, or both are 0.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn
    total = tp + fp + fn + tn

    precision = tp / predicted_positive if predicted_positive > 0 else np.nan
    recall = tp / actual_positive if actual_positive > 0 else np.nan
    # A comparison involving NaN is False, so an undefined precision or recall
    # falls through to NaN here.
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else np.nan
    accuracy = (tp + tn) / total if total > 0 else np.nan
    return precision, recall, f1_score, accuracy

# Compute per-label confusion matrix and metrics.
# The four outcomes are computed once as boolean columns and summed per label;
# this avoids groupby.apply over a frame that still contains the grouping
# column, which pandas flags as deprecated.
llm_flag = df_confusion['LLM_Assigned'].astype(bool)
analyst_flag = df_confusion['Analyst_Assigned'].astype(bool)
outcome_names = ['TP', 'FP', 'FN', 'TN']
outcome_flags = pd.DataFrame({
    'Label': df_confusion['Label'],
    'TP': llm_flag & analyst_flag,
    'FP': llm_flag & ~analyst_flag,
    'FN': ~llm_flag & analyst_flag,
    'TN': ~llm_flag & ~analyst_flag,
})

# Summing booleans counts the True values; labels come out in sorted order
confusion_matrix = outcome_flags.groupby('Label')[outcome_names].sum().reset_index()

confusion_matrix[['Precision', 'Recall', 'F1 Score', 'Accuracy']] = confusion_matrix.apply(
    lambda row: compute_metrics(row['TP'], row['FP'], row['FN'], row['TN']), axis=1, result_type='expand'
)

# Compute overall confusion matrix and metrics (all labels pooled)
TP, FP, FN, TN = (outcome_flags[name].sum() for name in outcome_names)

overall_precision, overall_recall, overall_f1, overall_accuracy = compute_metrics(TP, FP, FN, TN)

overall_confusion = pd.DataFrame({
    'Metric': ['TP', 'FP', 'FN', 'TN', 'Precision', 'Recall', 'F1 Score', 'Accuracy'],
    'Value': [TP, FP, FN, TN, overall_precision, overall_recall, overall_f1, overall_accuracy]
})

# Step 7: Display the Results

print("Per-Label Confusion Matrix with Metrics:")
print(confusion_matrix)

print("\nOverall Confusion Matrix and Metrics:")
print(overall_confusion)

Per-Label Confusion Matrix with Metrics:
                          Label  TP   FP  FN  TN  Precision    Recall  \
0  Intra-Industry Competition.a  25  105   0   6   0.192308  1.000000   
1  Intra-Industry Competition.b  34   89   4   9   0.276423  0.894737   
2  Intra-Industry Competition.c  20  100   1  15   0.166667  0.952381   
3             Market Dynamics.a  30  100   1   5   0.230769  0.967742   
4             Market Dynamics.b  26  110   0   0   0.191176  1.000000   
5             Market Dynamics.c   0  124   0  12   0.000000       NaN   
6        Regulatory Framework.a  31  103   2   0   0.231343  0.939394   
7        Regulatory Framework.b  14  121   1   0   0.103704  0.933333   
8             Technology Risk.a  27   83   7  19   0.245455  0.794118   
9             Technology Risk.b   0  115   0  21   0.000000       NaN   

   F1 Score  Accuracy  
0  0.322581  0.227941  
1  0.422360  0.316176  
2  0.283688  0.257353  
3  0.372671  0.257353  
4  0.320988  0.191176  
5       NaN

  confusion_matrix = df_confusion.groupby('Label').apply(


In [87]:
confusion_matrix

Unnamed: 0,Label,TP,FP,FN,TN,Precision,Recall,F1 Score,Accuracy
0,Intra-Industry Competition.a,25,105,0,6,0.192308,1.0,0.322581,0.227941
1,Intra-Industry Competition.b,34,89,4,9,0.276423,0.894737,0.42236,0.316176
2,Intra-Industry Competition.c,20,100,1,15,0.166667,0.952381,0.283688,0.257353
3,Market Dynamics.a,30,100,1,5,0.230769,0.967742,0.372671,0.257353
4,Market Dynamics.b,26,110,0,0,0.191176,1.0,0.320988,0.191176
5,Market Dynamics.c,0,124,0,12,0.0,,,0.088235
6,Regulatory Framework.a,31,103,2,0,0.231343,0.939394,0.371257,0.227941
7,Regulatory Framework.b,14,121,1,0,0.103704,0.933333,0.186667,0.102941
8,Technology Risk.a,27,83,7,19,0.245455,0.794118,0.375,0.338235
9,Technology Risk.b,0,115,0,21,0.0,,,0.154412


In [88]:
overall_confusion

Unnamed: 0,Metric,Value
0,TP,207.0
1,FP,1050.0
2,FN,16.0
3,TN,87.0
4,Precision,0.164678
5,Recall,0.928251
6,F1 Score,0.27973
7,Accuracy,0.216176
