## Import libraries

In [2]:
import pandas as pd
import transformers
import torch
import time
import numpy as np
from tqdm import tqdm
try:
  import PyPDF2
except:
  !pip install PyPDF2
import PyPDF2
import os
try:
  from groq import Groq
except:
  !pip install groq
from groq import Groq
import math

## Document transformation

In [None]:
# We need to convert the .pdf files into .txt files
pdfs = [file for file in os.listdir("Reports (pdf)") if not file.startswith(".") ]
# We initialize a dictionary to store the text of each pdf
documents = {} 

for pdf in pdfs:
    pdf_path = "Reports (pdf)/" + pdf
    txt_file = pdf[:-3] + "txt" # We define the name of the .txt file
    txt_path = "Reports (txt)/" + txt_file # We define the path for the .txt file
    
    # We check if the txt file has already been converted
    if txt_file in os.listdir("Reports (txt)"):
        print("File already transformed to txt")
        with open(txt_path, 'r', encoding='utf-8') as f:
            text = f.read()
    # If not, we convert the .pdf file into a .txt file
    else: 
        text = ""
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in tqdm(reader.pages):
                text += page.extract_text() + "PAGE END"          
        text = text.replace('\n', '')
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(text)
    # We store the texts in the documents dictionary
    documents[pdf] = text

File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt
File already transformed to txt


In [None]:
# Check of the documents
documents.keys()

dict_keys(['Stellantis.pdf', 'Shell.pdf', 'AIG.pdf', 'Siemens.pdf', 'SAP.pdf', 'Maersk.pdf', 'Volkswagen.pdf', 'Nike.pdf', 'Aramco.pdf', 'NovoNordisk.pdf', 'CocaCola.pdf', 'Toyota.pdf', 'Tesla.pdf', 'Boeing.pdf', 'Samsung.pdf', 'Nestle.pdf', 'Apple.pdf', 'Eni.pdf', 'BASF.pdf', 'P&G.pdf', 'Nvidia (sustainability).pdf', 'Alibaba.pdf', 'TotalEnergies.pdf', 'ExxonMobil_P1.pdf', 'LVMH.pdf'])

In [None]:
# Check of the number of words in each document
total_words = 0
for i, doc in enumerate(documents.values()):
    words = doc.split()
    print(f"Document {i}: {len(words)} tokens")
    total_words += len(words)

Document 0: 222011 tokens
Document 1: 312960 tokens
Document 2: 130543 tokens
Document 3: 93710 tokens
Document 4: 164942 tokens
Document 5: 110234 tokens
Document 6: 319521 tokens
Document 7: 56932 tokens
Document 8: 105256 tokens
Document 9: 91239 tokens
Document 10: 94012 tokens
Document 11: 97541 tokens
Document 12: 59441 tokens
Document 13: 42628 tokens
Document 14: 138173 tokens
Document 15: 16249 tokens
Document 16: 50729 tokens
Document 17: 256985 tokens
Document 18: 89544 tokens
Document 19: 52813 tokens
Document 20: 14549 tokens
Document 21: 177211 tokens
Document 22: 407312 tokens
Document 23: 2516 tokens
Document 24: 28986 tokens


In [None]:
# Check of the total number of words (all documents)
total_words

3136037

## Model

In [None]:
# Possible LLM models to use
models = pd.DataFrame(columns=['Model', 'Developer', 'Type'])

models.loc[0] = ['llama-3.3-70b-versatile', 'Meta', 'Production']
models.loc[1] = ['llama-3.1-8b-instant', 'Meta', 'Production']
models.loc[2] = ['llama-guard-3-8b', 'Meta', 'Production']
models.loc[3] = ['llama3-70b-8192', 'Meta', 'Production']
models.loc[4] = ['llama3-8b-8192', 'Meta', 'Production']
models.loc[5] = ['gemma2-9b-it', 'Google', 'Production']
models.loc[6] = ['whisper-large-v3', 'OpenAI', 'Production']
models.loc[7] = ['whisper-large-v3-turbo', 'OpenAI', 'Production']
models.loc[8] = ['distil-whisper-large-v3-en', 'HuggingFace', 'Production']
models.loc[9] = ['allam-2-7b', 'Saudi Data and AI Authority (SDAIA)', 'Preview']
models.loc[10] = ['deepseek-r1-distill-llama-70b', 'DeepSeek', 'Preview']
models.loc[11] = ['meta-llama/llama-4-maverick-17b-128e-instruct', 'Meta', 'Preview']
models.loc[12] = ['meta-llama/llama-4-scout-17b-16e-instruct', 'Meta', 'Preview']
models.loc[13] = ['mistral-saba-24b', 'Mistral', 'Preview']
models.loc[14] = ['playai-tts', 'Playht, Inc', 'Preview']
models.loc[15] = ['playai-tts-arabic', 'Playht, Inc', 'Preview']
models.loc[16] = ['qwen-qwq-32b', 'Alibaba Cloud', 'Preview']

In [4]:
models

Unnamed: 0,Model,Developer,Type
0,llama-3.3-70b-versatile,Meta,Production
1,llama-3.1-8b-instant,Meta,Production
2,llama-guard-3-8b,Meta,Production
3,llama3-70b-8192,Meta,Production
4,llama3-8b-8192,Meta,Production
5,gemma2-9b-it,Google,Production
6,whisper-large-v3,OpenAI,Production
7,whisper-large-v3-turbo,OpenAI,Production
8,distil-whisper-large-v3-en,HuggingFace,Production
9,allam-2-7b,Saudi Data and AI Authority (SDAIA),Preview


In [None]:
# We define the prompts to be passed to the LLM
system_message_full = """
You are an expert in geopolitical risk analysis. Given a text extracted from a company document, give insights and return a dictionary containing:
1.⁠ Key 1: the string “Topics”. Value 1: a list containing ⁠key geopolitical topics identified in the document
2.⁠ Key 2: the string “Scores”. Value 2: a list containing geopolitical risk scores from 0 (very low risk) to 10 (extremely high risk), referring to the topics identified in point 1; you can use decimals while giving the scores.
3.⁠ ⁠Key 3: the string “Why?”. Value 3: a list containing brief justifications for the given scores.
4. Key 4: the string "Final Score". Value 4: a final score from 0 to 10 for the company based on the geopolitical risks identified

Your work will be evaluated: if poorly done, you will be fired!
"""

system_message_parts = """
You are an expert in geopolitical risk analysis. You will be given separated parts of a company report, to analyze one part at a time. Give insights and return a dictionary containing:
1.⁠ Key 1: the string “Topics”. Value 1: a list containing ⁠key geopolitical topics identified in the document
2.⁠ Key 2: the string “Scores”. Value 2: a list containing geopolitical risk scores from 0 (very low risk) to 10 (extremely high risk), referring to the topics identified in point 1; you can use decimals while giving the scores.
3.⁠ ⁠Key 3: the string “Why?”. Value 3: a list containing brief justifications for the given scores.

When the document is finished, you will be given your insights from all parts, in order for you to deliver an output with all the topics, relative scores and justifications, and add a fourth element to the dictionary with a final score from 0 to 10 for the company based on the geopolitical risks identified.

Your work will be evaluated: if poorly done, you will be fired!
"""

In [None]:
# API keys (personal, to be replaced with your own)
with open('api_keys.txt', 'r') as f:
    lines = f.readlines()
    api_keys = [line.strip() for line in lines]

api_keys

In [None]:
# We initialize two lists to keep track of the documents already processed
done = []
done_final = []

In [None]:
# We initialize the Groq client, which is the API to call the LLM
client = Groq(
    api_key=api_keys[0]
)

# Documents to be processed
doc_paths = ["Reports (txt)/" + file for file in os.listdir("Reports (txt)") if not file.startswith(".")]

for document in doc_paths:
    document_name = document.split("/")[-1][:-4] # remove .txt
    print(f'Company name: {document_name}')
    if document_name in done: # Check if the document has already been processed. In case it has, we skip it
        continue
    doc_len = len(open(document, 'r', encoding='utf-8').read().split())
    if doc_len > 25000: # If the document contains more than 25000 words, we split it into parts
        n_steps = math.ceil(doc_len / 25000)
        indexes = list(np.linspace(0, doc_len, n_steps))
        indexes = [round(item) for item in indexes]
        print("Indexes:", indexes) # Parts of the document
        for i in range(len(indexes)-1):
            start = indexes[i]
            end = indexes[i+1]
            with open(document, 'r', encoding='utf-8') as f: 
                document_text = f.read()[start:end]
            user_message = f"This is the {int(i + 1)} part of the company report. {document_text}" # We define the user message for the LLM (for big documents)
            # Call the LLM
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_message_parts},
                    {"role": "user", "content": user_message},
                    ],
                model="llama-3.3-70b-versatile", # Model to be used
                stream=False,
            )
            with open(f"LLM Output/{document_name}.txt", 'a') as f:
                f.write(chat_completion.choices[0].message.content) # We ave the output of the LLM adding it to the relative file
            print(chat_completion.choices[0].message.content) # Check of the output generated by the LLM
    else: # If the document contains less than 25000 words, we process it as a whole
        with open(document, 'r', encoding='utf-8') as f:
            document_text = f.read()
        user_message = f"This is the company report. {document_text}" # We define the user message for the LLM (for small documents)
        # Call the LLM
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message_full},
                {"role": "user", "content": user_message},
                ],
            model="llama-3.3-70b-versatile", # Model to be used
            stream=False,
        )
        with open(f"LLM Output/{document_name}.txt", 'w') as f:
            f.write(chat_completion.choices[0].message.content) # We save the output of the LLM in a file
        print(chat_completion.choices[0].message.content) # Check of the output generated by the LLM
    done.append(document_name) # We add the document to the list of processed documents
    time.sleep(60) # We top the algorithm for 60 seconds to avoid overloading the model (due to the API processing limitations)

Company name: Nike
Company name: Aramco
Company name: Maersk
Company name: Volkswagen
Company name: Tesla
Indexes: [0, 29720, 59441]
Based on the provided company report, I have identified the following key geopolitical topics, risk scores, and justifications:

```python
result = {
    "Topics": [
        "Global supply chain disruptions",
        "Government regulations and incentives",
        "International trade and tariffs",
        "Competition in the electric vehicle market",
        "Access to raw materials"
    ],
    "Scores": [
        6.5,  # Global supply chain disruptions
        7.2,  # Government regulations and incentives
        5.8,  # International trade and tariffs
        8.1,  # Competition in the electric vehicle market
        4.9  # Access to raw materials
    ],
    "Why": [
        "The company's global supply chain is vulnerable to disruptions, which could impact production and sales.",
        "Government regulations and incentives can significantly impact

In [None]:
# We check the size of the documents: if they contain more than 25000 words, they have been processed in parts by the LLM.
doc_size = pd.DataFrame()
i = 0
for document in doc_paths:
    document_name = document.split("/")[-1][:-4]
    doc_len = len(open(document, 'r', encoding='utf-8').read().split())
    if doc_len > 25000: # If the document contains more than 25000 words, we label it as a "Big Document"
        doc_size[i] = [document_name, 'Big Document']
        i += 1
    else: # If the document contains less than 25000 words, we label it as a "Small Document"
        doc_size[i] = [document_name, 'Small Document']
        i += 1
doc_size = doc_size.T
doc_size.columns = ['Document Name', 'Size']
doc_size

Unnamed: 0,Document Name,Size
0,Nike,Big Document
1,Aramco,Big Document
2,Maersk,Big Document
3,Volkswagen,Big Document
4,Tesla,Big Document
5,Boeing,Big Document
6,Samsung,Big Document
7,NovoNordisk,Big Document
8,CocaCola,Big Document
9,Toyota,Big Document


In [None]:
# If the document was too long, the LLM was called multiple times. Now we need to process the final output of the LLM.
for doc in doc_size['Document Name']:
    # We check if the document has already been processed. In case it has, we skip it
    if doc in done_final:
        continue
    # We get the size of the document from the doc_size dataframe
    doc_s = doc_size[doc_size['Document Name'] == doc]['Size'].values[0]
    # If the document was too long, we want to feed the partial outputs to the LLM to get a final one
    if doc_s == 'Big Document':
        with open(f'LLM Output/{doc}.txt', 'r', encoding='utf-8') as f:
            document_text = f.read()
        user_message_final = f"These are your insights on all the parts of the company document. {document_text}" # We define the final user message for the LLM (for big documents)
        chat_completion = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": system_message_parts},
                        {"role": "user", "content": user_message_final},
                        ],
                    model="llama-3.3-70b-versatile",
                    stream=False,
                )
        with open(f"LLM Output (final)/{doc}_final.txt", 'w') as f:
            f.write(chat_completion.choices[0].message.content)
        print(chat_completion.choices[0].message.content)
        time.sleep(5) # Stop the algorithm for 5 seconds to avoid overloading the model (due to the API processing limitations)
    # If the document was small, we can just copy the original output of the LLM in a new file
    else:
        with open(f'LLM Output/{doc}.txt', 'r', encoding='utf-8') as f, open(f'LLM Output (final)/{doc}_final.txt', 'w') as f_out:
            f_out.write(f.read())
        print(f"Document {doc} is small, no need to process it again.")
    done_final.append(doc) # Add the document to the list of processed documents

Based on the provided text, I will now analyze all the parts of the company report and provide a dictionary with the required information.

After analyzing all the parts of the report, I have identified the following geopolitical topics, scores, and justifications:

* **Topics:** 
  1. Saudi Aramco's growth strategy
  2. Energy transition and sustainability
  3. Global energy demand and supply
  4. Geopolitical risks in the energy sector
  5. Competition in the global energy market
  6. Global oil demand and supply
  7. Geopolitical events and their impact on the energy market
  8. Energy transition and lower-carbon initiatives
  9. Upstream and Downstream operations
  10. Localization and national champion development
  11. Energy transition and climate change
  12. Global energy demand and market volatility
  13. Investment in new energies and low-carbon technologies
  14. Geopolitical risks in the Middle East and Saudi Arabia
  15. International trade and economic sanctions
  16. Re