In [24]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes
import os
import os, getpass
import json


In [3]:
from ibm_watsonx_ai import Credentials

credentials = Credentials(
    url="https://us-south.ml.cloud.ibm.com",
    api_key=getpass.getpass("Please enter your watsonx.ai api key (hit enter): "),
)

In [4]:
try:
    project_id = os.environ["PROJECT_ID"]
except KeyError:
    project_id = input("Please enter your project_id (hit enter): ")

In [5]:
from ibm_watsonx_ai import APIClient

api_client = APIClient(credentials=credentials, project_id=project_id)

In [6]:
loader = DirectoryLoader(
    r"C:\Users\USER\Downloads\Mestrado_Dissertação\Artigos\Eligibility",
    glob="*.pdf",
    loader_cls=PyPDFLoader
)
documents = loader.load()

In [7]:
unique_articles = set(doc.metadata["source"] for doc in documents)

print(f"📄 Total articles: {len(unique_articles)}")


📄 Total articles: 170


In [None]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma


#Divide the text in chunks 
text_splitter = CharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=256 
)
texts = text_splitter.split_documents(documents)

print(f"✅ Total chunks: {len(texts)}")
print(f"📄 Example:\n{texts[0].page_content[:1000]}...")


✅ Total chunks: 3233
📄 Example:
Vol.:(0123456789)1 3
Journal of Business Ethics (2020) 167:111–125 
https://doi.org/10.1007/s10551-019-04161-4
ORIGINAL PAPER
Stakeholder Relationship Capability and Firm Innovation: 
A Contingent Analysis
Wei Jiang1 · Aric Xu Wang2  · Kevin Zheng Zhou2  · Chuang Zhang3
Received: 7 September 2018 / Accepted: 11 April 2019 / Published online: 23 April 2019 
© Springer Nature B.V. 2019
Abstract
Despite the growing importance of stakeholder management, few studies have empirically examined the influence of stake-
holder relationship capability (SRC) on firm innovation, especially in emerging economies. This study investigates how SRC 
relates to firm innovation in the presence of governmental intervention and in combination with firm-level characteristics. 
Using a survey and multiple secondary datasets on the listed Chinese firms, our findings indicate that SRC is positively 
associated with firm innovation. Moreover, advanced legal development and high-te

In [9]:
api_client.foundation_models.EmbeddingModels.show()

{'GRANITE_EMBEDDING_107M_MULTILINGUAL': 'ibm/granite-embedding-107m-multilingual', 'GRANITE_EMBEDDING_278M_MULTILINGUAL': 'ibm/granite-embedding-278m-multilingual', 'SLATE_125M_ENGLISH_RTRVR': 'ibm/slate-125m-english-rtrvr', 'SLATE_125M_ENGLISH_RTRVR_V2': 'ibm/slate-125m-english-rtrvr-v2', 'SLATE_30M_ENGLISH_RTRVR': 'ibm/slate-30m-english-rtrvr', 'SLATE_30M_ENGLISH_RTRVR_V2': 'ibm/slate-30m-english-rtrvr-v2', 'MULTILINGUAL_E5_LARGE': 'intfloat/multilingual-e5-large', 'ALL_MINILM_L12_V2': 'sentence-transformers/all-minilm-l12-v2', 'ALL_MINILM_L6_V2': 'sentence-transformers/all-minilm-l6-v2'}


In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader


splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=256)
texts = splitter.split_documents(documents)

#Embeddings with e5-large
embedding_function = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    encode_kwargs={"normalize_embeddings": True}
)

#Index with ChromaDB
db = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function,
    persist_directory="./test_db"
)


In [12]:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods

generate_params = {
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
    GenParams.TEMPERATURE:       0,
    GenParams.MIN_NEW_TOKENS:    1,
    GenParams.MAX_NEW_TOKENS:    1000,
    GenParams.STOP_SEQUENCES: ["<|endoftext|>"]

}

In [None]:

model_id = "deepseek-ai/deepSeek-r1-distill-llama-70b"

In [None]:
from langchain_ibm import WatsonxLLM

deepseek = WatsonxLLM(
    model_id=model_id,
    url=credentials.get("url"),
    apikey=credentials.get("apikey"),
    project_id=project_id,
    params=generate_params
)


In [None]:
#Generate Q&A pairs for each document
qa_pairs = []
for i, doc in enumerate(documents):
    content = doc.page_content[:3000]  # Limit to prevent overflow
    prompt = f'''You are a domain expert in stakeholder management.

Your task is to read the content provided below and generate 25 to 30 high-quality question–answer pairs. Focus on the main findings and insights related to stakeholder management in each article.

Each question should:
- Be clear and relevant to the article’s content.
- Target key concepts, practices, challenges, or implications of stakeholder management.
- Encourage understanding of core themes, not trivial details.

Each answer should:
- Be concise, accurate, and informative.
- Reflect the article’s specific viewpoint or evidence where applicable.

Ensure the Q&A pairs are diverse in focus and suitable for use in training or assessment contexts.

Content:
"""
{content}
"""

Output format (JSON):
[{{"question": "...", "answer": "..."}}]''' 

    try:
        result = deepseek.generate_text(prompt)
        raw_pairs = json.loads(result)
        formatted_pairs = [
            {"input": p["question"], "output": p["answer"]}
            for p in raw_pairs if "question" in p and "answer" in p
    ]
        qa_pairs.extend(formatted_pairs)
    except Exception as e:
        pass

# Step 7: Save Q&A to JSON
with open("qa_pairs_stakeholder.json", "w", encoding="utf-8") as f:
    json.dump(qa_pairs, f, ensure_ascii=False, indent=2)

print(f"🎯 Total Q&A pairs generated: {len(qa_pairs)}")

🎯 Total Q&A pairs generated: 5000
