In [120]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import AzureChatOpenAI

import os
import pandas as pd

In [121]:
# --- Model choice: USE_GPT_4 is tested before USE_GPT_3_5_TURBO. ---
USE_GPT_4 = True
USE_GPT_3_5_TURBO = False

# --- Prompting strategy: zero-shot is tested before few-shot when the prompt is built. ---
USE_FEW_SHOT = True
USE_ZERO_SHOT = False
NUM_FEW_SHOT_EXAMPLES = 15
# True -> random sample of training rows; False -> the first rows of the file.
SAMPLE_EXAMPLES = False

# --- Dataset split and language selectors. ---
IS_TRAINING = False
IS_PRACTICE = True
IS_SPANISH = False
IS_ENGLISH = True

# --- Filesystem layout (relative to the notebook's working directory). ---
DATASET_PATH = '../../../FinCausal/dataset/'
OUTPUT_DIR = '../../../FinCausal/output/llm/OpenAI'
TRAIN_SPANISH_DIR = 'training_spanish'
TRAIN_ENGLISH_DIR = 'training-english'
PRACTICE_SPANISH_DIR = 'practice_spanish'
PRACTICE_ENGLISH_DIR = 'practice-english'

In [123]:
# Instantiate the chat model. Fail fast when no supported model flag is enabled.
if not (USE_GPT_4 or USE_GPT_3_5_TURBO):
    raise Exception('Model not supported.')

# temperature=0 is used for both models; deployment_name is blank here and
# presumably has to be filled in with the Azure deployment to use -- TODO confirm.
if USE_GPT_4:
    model = AzureChatOpenAI(deployment_name="", temperature=0)
else:
    model = AzureChatOpenAI(deployment_name="", temperature=0)

In [124]:
# One schema per field the model is asked to return; the parser is built from
# these and later supplies the prompt's format instructions.
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in (
        ("cause", "extract cause from the user given text"),
        ("effect", "extract effect from the user given text"),
    )
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
# Bare expression so the notebook displays the configured parser.
output_parser

StructuredOutputParser(response_schemas=[ResponseSchema(name='cause', description='extract cause from the user given text', type='string'), ResponseSchema(name='effect', description='extract effect from the user given text', type='string')])

In [125]:
# Zero-shot prompt template.
# Placeholders: {text} is the passage to analyse; {format_instructions} is
# filled from the output parser when the prompt is built.

zero_shot_template = """
        The task is to extract the cause and effect from the given financial {text}. Don't provide anything except cause and effect.
        
        Instructions:
        - Answer the question based only on the text provided. Extracted cause and effect should be part of given input financial text.
        - Only one set of Cause and effect has to be given as output for each single input.
        - Output language has to be the same as the input language.

        {format_instructions}
        """


In [126]:
# Few-shot prompt template.
# Placeholders: {text} is the passage to analyse; {few_shot_examples} is the
# pre-rendered block of worked examples; {format_instructions} is filled from
# the output parser when the prompt is built.

few_shot_template = """
        The task is to extract the cause and effect from the given financial {text}. Don't provide anything except cause and effect. 
        
        Here are few examples:

        {few_shot_examples}
        
        
        Instructions:
        - Answer the question based only on the text provided. Extracted cause and effect should be part of given input financial text.
        - Only one set of Cause and effect has to be given as output for each single input.
        - Output language has to be the same as the input language.

        {format_instructions}
        """

In [127]:
def get_few_shot_examples(training_csv_path, num_few_shot):
    """Render few-shot examples from the training CSV as one prompt-ready string.

    The CSV is read with ';' as delimiter (malformed lines produce a warning).
    Up to ``num_few_shot`` rows are chosen: a random sample when the
    module-level ``SAMPLE_EXAMPLES`` flag is true, otherwise the first rows.
    Each chosen row becomes a numbered stanza of the form::

        Example <k>:
        Text: <Text>
        cause: <Cause>
        effect: <Effect>

    Args:
        training_csv_path: Path to a CSV with ``Text``, ``Cause`` and
            ``Effect`` columns.
        num_few_shot: Maximum number of examples to include.

    Returns:
        The stanzas concatenated into a single string, each followed by a
        blank line; an empty string when no usable row exists.
    """
    train_df = pd.read_csv(training_csv_path, delimiter=";", on_bad_lines='warn')

    # A row with a missing field cannot serve as a worked example (empty CSV
    # cells are read as NaN), so such rows are dropped before selecting.
    train_df = train_df.dropna(subset=['Text', 'Cause', 'Effect'])

    if SAMPLE_EXAMPLES:
        # sample() raises when asked for more rows than exist; cap the request
        # so sampling truncates the same way head() does.
        train_df = train_df.sample(min(num_few_shot, len(train_df)))
    else:
        train_df = train_df.head(num_few_shot)

    selected_rows = zip(train_df['Text'], train_df['Cause'], train_df['Effect'])
    return ''.join(
        f'Example {count}:\nText: {text}\ncause: {cause}\neffect: {effect}\n\n'
        for count, (text, cause, effect) in enumerate(selected_rows, start=1)
    )

In [128]:
def build_prompt():
    """Assemble the chat prompt for the configured learning mode.

    Zero-shot takes precedence over few-shot when both flags are set. The
    chosen mode is printed. The parser's format instructions are bound as a
    partial variable, so callers only supply ``text`` (plus
    ``few_shot_examples`` in few-shot mode).

    Returns:
        A ``ChatPromptTemplate`` holding a single human message.

    Raises:
        Exception: If neither ``USE_ZERO_SHOT`` nor ``USE_FEW_SHOT`` is set.
    """
    parser_instructions = output_parser.get_format_instructions()

    if USE_ZERO_SHOT:
        print('Zero shot setting')
        template_text, variable_names = zero_shot_template, ["text"]
    elif USE_FEW_SHOT:
        print('Few shot setting')
        template_text, variable_names = few_shot_template, ["text", "few_shot_examples"]
    else:
        raise Exception('Learning method not supported!')

    human_message = HumanMessagePromptTemplate.from_template(template_text)
    return ChatPromptTemplate(
        messages=[human_message],
        input_variables=variable_names,
        partial_variables={"format_instructions": parser_instructions},
    )

In [129]:
def get_cause_effect(text, prompt, few_shot_examples=''):
    """Ask the model for the cause and effect in ``text`` and parse its reply.

    Args:
        text: The passage to analyse.
        prompt: The prompt template used to format the request; it must accept
            ``text`` and, in few-shot mode, ``few_shot_examples``.
        few_shot_examples: Pre-rendered examples; only used in few-shot mode.

    Returns:
        Whatever ``output_parser.parse`` produces for the model's reply;
        callers in this file index it with ``'cause'`` and ``'effect'``.

    Raises:
        Exception: If neither ``USE_ZERO_SHOT`` nor ``USE_FEW_SHOT`` is set.
    """
    if USE_ZERO_SHOT:
        _input = prompt.format_prompt(text=text)
    elif USE_FEW_SHOT:
        _input = prompt.format_prompt(text=text, few_shot_examples=few_shot_examples)
    else:
        # Without this branch _input would be unbound and the failure would
        # surface as an UnboundLocalError on the next line.
        raise Exception('Learning method not supported!')

    output = model(_input.to_messages())
    return output_parser.parse(output.content)

In [None]:
# Resolve input and output locations from the IS_* flags.
# Precedence: Spanish before English, training before practice.
# Each unsupported combination raises immediately, so a misconfigured run can
# never continue with path variables left over from an earlier cell execution.
if IS_SPANISH:
    lang_code = 'es'
    train_dir, practice_dir = TRAIN_SPANISH_DIR, PRACTICE_SPANISH_DIR
elif IS_ENGLISH:
    lang_code = 'en'
    train_dir, practice_dir = TRAIN_ENGLISH_DIR, PRACTICE_ENGLISH_DIR
else:
    raise Exception('Language not supported!')

# Few-shot examples always come from the training file of the chosen language.
training_csv_path = os.path.join(DATASET_PATH, f'training_subtask_{lang_code}.csv')

if IS_TRAINING:
    # NOTE(review): here the evaluated file is the same file the few-shot
    # examples are drawn from -- confirm this overlap is intended.
    data_csv_path = training_csv_path
    out_path = os.path.join(OUTPUT_DIR, train_dir)
    out_csv_name = f'training_subtask_results_{lang_code}.csv'
elif IS_PRACTICE:
    data_csv_path = os.path.join(DATASET_PATH, practice_dir, f'practice_subtask_{lang_code}.csv')
    out_path = os.path.join(OUTPUT_DIR, practice_dir)
    out_csv_name = f'practice_subtask_results_{lang_code}.csv'
else:
    raise Exception('Dataset split not supported!')

# Load the rows to run through the model (';'-delimited; bad lines only warn).
df = pd.read_csv(data_csv_path, delimiter=";", on_bad_lines='warn')
print('dataframe shape: ', df.shape)
df.head()

In [131]:
# Build the chat prompt for the configured learning mode (prints the mode).
prompt = build_prompt()

# Few-shot mode only: render the example block once so every request reuses it.
# Note that few_shot_examples is not defined at all when USE_FEW_SHOT is False.
if USE_FEW_SHOT:
    few_shot_examples = get_few_shot_examples(training_csv_path, NUM_FEW_SHOT_EXAMPLES)


Few shot setting


In [None]:
# Run every row of df through the model, collecting the parsed answers.
cause_list = []
effect_list = []
index_list = []
# Row position to start from (default zero). Sometimes OpenAI stops sending
# the response; set this to skip rows already handled. The result lists above
# are reset on every run of this cell, so earlier results must be kept
# separately when resuming.
offset = 0
# Iterate from offset to the end of the frame; positional access keeps the
# loop inside the frame for any offset value.
for row_pos in range(offset, len(df)):
    print(f"Processing Input {row_pos}")
    input_text = df['Text'].iloc[row_pos]
    if USE_ZERO_SHOT:
        response = get_cause_effect(input_text, prompt)
    elif USE_FEW_SHOT:
        response = get_cause_effect(input_text, prompt, few_shot_examples)
    else:
        # Avoid silently reusing a stale response from a previous run.
        raise Exception('Learning method not supported!')

    cause_list.append(response['cause'])
    effect_list.append(response['effect'])
    index_list.append(df['Index'].iloc[row_pos])

In [None]:
# Collect the model answers into the output frame. The 'Text' column is
# declared but deliberately left unfilled (its assignment from df['Text'] is
# disabled), so only Index, Cause and Effect carry values.
fincaual_df = pd.DataFrame(columns=['Index', 'Text', 'Cause', 'Effect'])
for column_name, column_values in (
    ('Index', index_list),
    ('Cause', cause_list),
    ('Effect', effect_list),
):
    fincaual_df[column_name] = column_values
# Bare expression so the notebook shows a preview.
fincaual_df.head()

In [144]:
# Write the results as a ';'-separated CSV named after the learning mode.
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(out_path, exist_ok=True)

# Zero-shot is tested first, matching the precedence used when the prompt is
# built and when the model is queried, so the file name always reflects the
# mode that actually produced the results.
if USE_ZERO_SHOT:
    result_file_name = f'zero_shot_0_{out_csv_name}'
elif USE_FEW_SHOT:
    result_file_name = f'few_shot_{NUM_FEW_SHOT_EXAMPLES}_{out_csv_name}'
else:
    # Fail loudly instead of finishing without writing anything.
    raise Exception('Learning method not supported!')

fincaual_df.to_csv(os.path.join(out_path, result_file_name), sep=';', escapechar='"', index=False)