In [None]:
#!pip install -q langchain openai chromadb

In [1]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.document_loaders import CSVLoader
from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA, MapReduceChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import AzureChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
import os
import tiktoken
import pandas as pd

In [2]:
# Run configuration for the notebook. All flags are plain module-level booleans
# read by the cells below.

# Model selection: the model cell checks USE_GPT_4 first, then USE_GPT_3_5_TURBO.
USE_GPT_3_5_TURBO = False
USE_GPT_4 = True
# Prompting mode: build_prompt/get_cause_effect check USE_ZERO_SHOT first, then USE_FEW_SHOT.
USE_ZERO_SHOT = False
USE_FEW_SHOT = True
# Passed as `k` to the vector-store similarity search (number of retrieved examples).
NUM_FEW_SHOT_EXAMPLES = 5
# When True, the training set is indexed and similar examples are retrieved per input.
USE_RAG_FEW_SHOT = True

# NOTE(review): the path/directory constants and IS_PRACTICE/IS_TRAINING below are not
# referenced by the cells visible here (paths are hardcoded at the call sites) - confirm
# whether they are still needed.
DATASET_PATH = 'Research/FNP_2023/'
OUTPUT_DIR = 'Research/FNP_2023/'
PRACTICE_SPANISH_DIR = 'practice_spanish'
PRACTICE_ENGLISH_DIR = 'practice-english'
TRAIN_SPANISH_DIR = 'training_spanish'
TRAIN_ENGLISH_DIR = 'training-english'
IS_PRACTICE = True
IS_TRAINING= False
# Language selection: decides which training/practice CSVs are read and where results go.
IS_ENGLISH = False
IS_SPANISH = True

# Azure OpenAI deployment names passed to AzureChatOpenAI(deployment_name=...).
# Left empty here; they must be filled in before the model cell can be used.
GPT4_DEPLOYMENT_NAME = ''
GPT_3_5_DEPLOYMENT_NAME = ''

In [4]:
# Resolve which Azure deployment to use (GPT-4 is preferred when both flags are set),
# then create the chat model once with temperature 0.
if USE_GPT_4:
    _deployment_name = GPT4_DEPLOYMENT_NAME
elif USE_GPT_3_5_TURBO:
    _deployment_name = GPT_3_5_DEPLOYMENT_NAME
else:
    raise Exception('Model not supported.')

model = AzureChatOpenAI(temperature=0, deployment_name=_deployment_name)

In [6]:
# One string field per extracted item; the parser built from these schemas both
# produces the format instructions for the prompt and parses the model reply.
response_schemas = [
    ResponseSchema(name=field_name, description=f"extract {field_name} from the user given text")
    for field_name in ("cause", "effect")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
output_parser

StructuredOutputParser(response_schemas=[ResponseSchema(name='cause', description='extract cause from the user given text', type='string'), ResponseSchema(name='effect', description='extract effect from the user given text', type='string')])

In [7]:
# Zero-shot prompt template.
# Placeholders: {text} is the input passage; {format_instructions} is filled in
# by build_prompt() from the structured output parser.

zero_shot_template = """
        The task is to extract the cause and effect from the given financial {text}. Don't provide anything except cause and effect.
        
        Instructions:
        - Answer the question based only on the text provided. Extracted cause and effect should be part of given input financial text.
        - Only one set of Cause and effect has to be given as output for each single input.
        - Output language has to be the same as the input language.

        {format_instructions}
        """


In [8]:
# Few-shot prompt template (English).
# Placeholders: {text} is the input passage, {few_shot_examples} is the block of
# worked examples, and {format_instructions} is filled in by build_prompt().

few_shot_template = """
        The task is to extract the cause and effect from the given financial {text}. Don't provide anything except cause and effect. 
        
        Here are few examples:

        {few_shot_examples}
        
        
        Instructions:
        - Answer the question based only on the text provided. Extracted cause and effect should be part of given input financial text.
        - Only one set of Cause and effect has to be given as output for each single input.
        - Output language has to be the same as the input language.

        {format_instructions}
        """

In [21]:
# Few-shot prompt template (Spanish).
# Same placeholders as the English template: {text}, {few_shot_examples},
# {format_instructions}.
# The Spanish wording replaces the previously defined few_shot_template only when
# the run targets Spanish; an unconditional assignment here would silently make
# English runs use the Spanish prompt as well.

if IS_SPANISH:
    few_shot_template = """
        La tarea es extraer la causa y el efecto del {text} financiero dado. No proporciones nada excepto causa y efecto. 
        
        Aquí hay algunos ejemplos:

        {few_shot_examples}
        
        
        Instrucciones:
        - Responda la pregunta basándose únicamente en el texto proporcionado. La causa y el efecto extraídos deben ser parte del texto financiero de entrada dado.
        - Solo se debe dar un conjunto de Causa y efecto como salida para cada entrada individual.
        - El idioma de salida tiene que ser el mismo que el de entrada.

        {format_instructions}
        """

In [22]:
def build_prompt():
    """Build the chat prompt for the configured learning mode.

    Reads the module-level flags: USE_ZERO_SHOT is checked first and selects
    `zero_shot_template` (input variable: text); otherwise USE_FEW_SHOT selects
    `few_shot_template` (input variables: text, few_shot_examples). The parser's
    format instructions are bound as a partial variable.

    Returns:
        A ChatPromptTemplate holding a single human message.

    Raises:
        Exception: if neither learning-mode flag is enabled.
    """
    instructions = output_parser.get_format_instructions()

    if USE_ZERO_SHOT:
        print('Zero shot setting')
        template_text, variable_names = zero_shot_template, ["text"]
    elif USE_FEW_SHOT:
        print('Few shot setting')
        template_text, variable_names = few_shot_template, ["text", "few_shot_examples"]
    else:
        raise Exception('Learning method not supported!')

    human_message = HumanMessagePromptTemplate.from_template(template_text)
    return ChatPromptTemplate(
        messages=[human_message],
        input_variables=variable_names,
        partial_variables={"format_instructions": instructions},
    )

In [23]:
def get_cause_effect(text, prompt, few_shot_examples=''):
    """Ask the model for the cause and effect in `text` and parse its reply.

    Args:
        text: The input passage to analyse.
        prompt: Prompt object exposing `format_prompt(...)`, as returned by build_prompt().
        few_shot_examples: Pre-formatted examples block; only used in few-shot mode.

    Returns:
        Whatever `output_parser.parse` returns for the model's reply content
        (callers index it with 'cause' and 'effect').

    Raises:
        Exception: if neither USE_ZERO_SHOT nor USE_FEW_SHOT is enabled.
    """
    if USE_ZERO_SHOT:
        _input = prompt.format_prompt(text=text)
    elif USE_FEW_SHOT:
        _input = prompt.format_prompt(text=text, few_shot_examples=few_shot_examples)
    else:
        # Without this branch `_input` would be unbound and the call below would
        # fail with an unrelated UnboundLocalError; mirror build_prompt's error.
        raise Exception('Learning method not supported!')

    output = model(_input.to_messages())
    response = output_parser.parse(output.content)

    return response

In [11]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Build the retrieval index used to pick few-shot examples; skipped entirely
# unless the RAG few-shot flag is enabled.
if USE_RAG_FEW_SHOT:
    # Pick exactly one training file. Spanish takes precedence when both flags
    # are set (previously the Spanish read overwrote the English one), and an
    # explicit error replaces the NameError that occurred when neither was set.
    if IS_SPANISH:
        training_csv_path = 'Research/FNP_2023/FinCausal/dataset/training_subtask_es.csv'
    elif IS_ENGLISH:
        training_csv_path = 'Research/FNP_2023/FinCausal/dataset/training_subtask_en.csv'
    else:
        raise Exception('Language not supported.')

    training_df = pd.read_csv(training_csv_path, sep=';')[['Text', 'Cause', 'Effect']]

    # 'Text' becomes each document's page content; the remaining columns
    # ('Cause', 'Effect') are read back from the document metadata later.
    loader = DataFrameLoader(training_df, page_content_column='Text')
    loaded_data = loader.load()

    # Local sentence-transformers embeddings.
    # NOTE(review): the same model is used for English and Spanish - confirm it
    # retrieves well for Spanish text.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Vector store queried by get_simialr_examples().
    doc_search = Chroma.from_documents(loaded_data, embeddings)


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def get_simialr_examples(content):
    """Query the vector store for documents similar to `content`.

    Args:
        content: Query text handed to `doc_search.similarity_search`.

    Returns:
        The search result, limited to NUM_FEW_SHOT_EXAMPLES hits via `k`.
    """
    return doc_search.similarity_search(content, k=NUM_FEW_SHOT_EXAMPLES)

In [24]:
# Build the prompt once for the configured learning mode; it is reused for every input below.
prompt = build_prompt()

Few shot setting


In [26]:
# Load the practice set for the configured language. Spanish takes precedence
# when both flags are set (previously the Spanish read overwrote the English
# one); an explicit error replaces the later NameError when neither is set.
if IS_SPANISH:
    practice_csv_path = 'Research/FNP_2023/FinCausal/dataset/practice_spanish/practice_subtask_es.csv'
elif IS_ENGLISH:
    practice_csv_path = 'Research/FNP_2023/FinCausal/dataset/practice-english/practice_subtask_en.csv'
else:
    raise Exception('Language not supported.')

practice_df = pd.read_csv(practice_csv_path, sep=';')

96

In [27]:
def _metadata_text(value):
    """Render a metadata value as prompt text; missing values (None/NaN) become ''."""
    # NaN is the only value that is not equal to itself; pandas uses it for empty CSV cells.
    if value is None or value != value:
        return ''
    return str(value)


def get_few_shot_examples(content):
    """Format the retrieved similar documents as a few-shot examples block.

    Args:
        content: Input text used to retrieve similar examples.

    Returns:
        A string with one numbered entry per retrieved document, each made of
        'Example N:', 'Text: ...', 'cause: ...' and 'effect: ...' lines.
        Empty string when nothing is retrieved.
    """
    example_blocks = []
    for number, doc in enumerate(get_simialr_examples(content), start=1):
        # Metadata may hold a float NaN for an empty CSV cell; concatenating
        # that to a str would raise TypeError, so normalise to text first.
        cause = _metadata_text(doc.metadata['Cause'])
        effect = _metadata_text(doc.metadata['Effect'])
        example_blocks.append(
            f'Example {number}:\n'
            f'Text: {doc.page_content}\n'
            f'cause: {cause}\n'
            f'effect: {effect}\n'
        )
    return ''.join(example_blocks)

In [None]:
# Run the extraction over the practice rows, collecting one prediction per row.
cause_list = []
effect_list = []
index_list = []
# Positional row to start from (e.g. to resume a run). Iterating from `offset`
# to the end keeps every access in range; the previous `range(len(df))` plus
# `i + offset` ran past the last row for any non-zero offset.
offset = 0
for row_pos in range(offset, len(practice_df)):
    print(f"Processing Input {row_pos}")
    # Positional access: does not depend on the frame's index labels.
    input_text = practice_df['Text'].iloc[row_pos]

    if USE_RAG_FEW_SHOT:
        few_shot_examples = get_few_shot_examples(input_text)
        response = get_cause_effect(input_text, prompt, few_shot_examples)
    else:
        response = get_cause_effect(input_text, prompt)
    cause_list.append(response['cause'])
    effect_list.append(response['effect'])
    index_list.append(practice_df['Index'].iloc[row_pos])

In [None]:
# Assemble the predictions into the results frame.
# Take the text of exactly the rows that were processed (starting at `offset`),
# so the columns stay aligned even when the loop started later or stopped early;
# using the whole Text column only lined up when every row had been processed.
processed_texts = practice_df['Text'].iloc[offset:offset + len(index_list)].values
fincaual_df = pd.DataFrame({
    'Index': index_list,
    'Text': processed_texts,
    'Cause': cause_list,
    'Effect': effect_list,
})
fincaual_df.head()

In [32]:
# Write the results CSV for each enabled language.
# NOTE(review): the file name always starts with 'few_shot_', even when the run
# used the zero-shot prompt - confirm whether that is intended.
_output_targets = [
    (IS_ENGLISH, 'Research/FNP_2023/FinCausal/output/llm/OpenAI/practice-english', 'en'),
    (IS_SPANISH, 'Research/FNP_2023/FinCausal/output/llm/OpenAI/practice_spanish', 'es'),
]
for _enabled, _output_dir, _lang_code in _output_targets:
    if not _enabled:
        continue
    # Create the folder up front so a missing directory cannot discard the results.
    os.makedirs(_output_dir, exist_ok=True)
    _file_name = 'few_shot_' + str(NUM_FEW_SHOT_EXAMPLES) + '_' + 'practice_subtask_results_' + _lang_code + '.csv'
    fincaual_df.to_csv(os.path.join(_output_dir, _file_name), sep=';', escapechar='"', index=False)