In [None]:
%%capture
!pip install langchain-openai
!pip install langchain
!pip install -U langchain-community

## Reading the data
This section reuses the functions from the previous script to read the data.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, json
import numpy as np
from argparse import ArgumentParser
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
from pprint import pprint
from datetime import datetime
import copy

In [None]:
#@title Reading train, dev and test data (use own folder path here and check file names)
# Placeholder: replace with the folder that holds train.json, dev.json and test.json.
folder_path = 'your folder-path'

def read_json_file(name):
    """Load and return the JSON document stored at ``name``.

    Args:
        name: Path of the JSON file to read.

    Returns:
        The decoded JSON content (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Be explicit about the encoding so the result does not depend on the
    # platform's default locale encoding.
    with open(name, 'r', encoding='utf-8') as f:
        return json.load(f)

def read_data():
    """Load the three dataset splits stored under ``folder_path``.

    Returns:
        tuple: ``(train, dev, test)`` -- the parsed contents of train.json,
        dev.json and test.json, read in that order.
    """
    split_files = ("train.json", "dev.json", "test.json")
    train, dev, test = (
        read_json_file(os.path.join(folder_path, file_name))
        for file_name in split_files
    )
    return train, dev, test

# Load every split once and print their sizes as a quick sanity check.
train, dev, test = read_data()
print(*(len(split) for split in (train, dev, test)))

246 50 100


In [None]:
from langchain.callbacks import get_openai_callback
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain import HuggingFaceHub
import time
from langchain_community.llms import HuggingFaceEndpoint
from google.colab import files

In [None]:
# Placeholder credential: substitute your own Hugging Face token before running,
# and keep the real value out of any shared or committed copy of this notebook.
pro_token = "use your token"
os.environ.update({"HUGGINGFACEHUB_API_TOKEN": pro_token})

### Zero-shot batch extraction
Run with both the question-guided and description-guided settings.


In [None]:
#create the chain for LLM to invoke
def Gemma_batch_extraction_prompt_chain(model):
    """Assemble the batch-extraction pipeline: prompt template -> ``model`` -> string parser.

    The prompt expects five input variables when the chain is invoked:
    ``instruction``, ``role_descriptions``, ``post``, ``comment`` and ``type``.

    Args:
        model: The language-model object placed in the middle of the pipeline.

    Returns:
        The composed chain (``PromptTemplate | model | StrOutputParser()``).
    """
    # The template text is sent to the model verbatim, so it is kept exactly as written.
    extraction_template = '''
        [INST]
        ##Instruction##
        {instruction}

        ##Post##
        {post}

        ##Comment##
        {comment}

        ##Arguments {type}##
        {role_descriptions}
        [/INST]
        Do not use more than 12 words to describe an argument. Return "null" if any arugment is not present. Return arguments in JSON. Precisely give the output, no extra description is needed.
        Provide the output between [##JSON##] [##JSON##].
        '''
    extraction_prompt = PromptTemplate(
        template=extraction_template,
        input_variables=['instruction', 'role_descriptions', 'post', 'comment', 'type'],
    )
    return extraction_prompt | model | StrOutputParser()

In [None]:
#If one argument has multiple values separate them by commas.
#returning the outputs by invoking the chain

def getting_Gemma_batch_outputs(model, data, instruction, role_details, extraction_type,
                                max_retries=None, retry_delay=3):
    """Run the batch-extraction chain once per argument type for every sample.

    Args:
        model: Model object handed to ``Gemma_batch_extraction_prompt_chain``.
        data: Iterable of sample dicts; each must provide the keys ``'text1'``
            (used as the post), ``'text2'`` (used as the comment), ``'label'``
            and ``'doc_id'``.
        instruction: Instruction text inserted into the prompt.
        role_details: Nested mapping indexed as ``role_details[label][arg_type]``
            that yields the role descriptions inserted into the prompt.
        extraction_type: Value for the prompt's ``type`` variable
            (the visible caller passes ``'Descriptions'`` or ``'Questions'``).
        max_retries: Maximum number of retries after a failed request.
            ``None`` (default) retries indefinitely, as the original code did.
        retry_delay: Seconds to wait between attempts (default 3).

    Returns:
        list: One shallow copy of each input sample, extended with a
        ``'raw-predictions'`` dict that maps each argument type to the raw
        chain output. The input samples themselves are left untouched.

    Raises:
        Exception: The last error raised by the chain, once ``max_retries``
            retries have been used up (never raised when ``max_retries`` is None).
    """
    arg_types = ['main-arguments', 'event-specific-arguments', 'subject-effect-arguments']
    # The chain depends only on the model, so build it once rather than per request.
    prompt_chain = Gemma_batch_extraction_prompt_chain(model)
    predictions = []
    invoke_count = 0
    for i, dt in enumerate(data):
        post, comment, label = dt['text1'], dt['text2'], dt['label']
        print(i, dt['doc_id'])
        raw_predictions = {}
        for arg_typ in arg_types:
            input_dict = {
                'instruction': instruction,
                'post': post,
                'comment': comment,
                # Choose the argument details of the sample's event type.
                'role_descriptions': role_details[label][arg_typ],
                'type': extraction_type,
            }
            failures = 0
            while True:  # retry to ride out transient errors such as model overload
                try:
                    output = prompt_chain.invoke(input_dict)
                    break
                except Exception as e:
                    print(e)
                    failures += 1
                    if max_retries is not None and failures > max_retries:
                        raise
                    time.sleep(retry_delay)
            invoke_count += 1  # number of successful inference calls
            raw_predictions[arg_typ] = output

        # Copy the sample so the caller's data is not modified; otherwise a later
        # run over the same data would overwrite this run's 'raw-predictions'.
        data_sample = dict(dt)
        data_sample['raw-predictions'] = raw_predictions  # raw output, refined later
        predictions.append(data_sample)

    print(model, extraction_type, invoke_count)
    return predictions

In [None]:
%%time
def read_json_file(name, folder_path='your-path'):
    """Load and return the JSON document ``name`` located inside ``folder_path``.

    Note: this definition replaces the earlier ``read_json_file`` in the notebook,
    which took a full path instead of a file name.

    Args:
        name: File name (or path) that is joined onto ``folder_path``.
        folder_path: Directory containing the file. Defaults to the placeholder
            ``'your-path'``; change the default or pass your own directory.

    Returns:
        The decoded JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    full_path = os.path.join(folder_path, name)
    # Be explicit about the encoding so the result does not depend on the
    # platform's default locale encoding.
    with open(full_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(json_data, file_name):
    """Serialize ``json_data`` to JSON and write it to ``file_name``.

    The target path is printed, and any missing parent directory is created
    before writing.

    Args:
        json_data: JSON-serializable object to store.
        file_name: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``json_data`` is not JSON-serializable.
        OSError: If the directory or file cannot be created or written.
    """
    # Serialize before opening the file so a serialization error cannot leave
    # an empty or truncated file behind.
    serialized = json.dumps(json_data)
    print(file_name)
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, "w", encoding="utf-8") as json_file:
        json_file.write(serialized)

def get_predictions(model, data_to_predict, nm_model):
    """Run batch extraction with one model under both guidance settings.

    For each setting ('description_guided', 'question_guided') the matching
    instruction and role-definition file are loaded, the model is queried
    through ``getting_Gemma_batch_outputs``, and the raw predictions are
    saved to a timestamped ``preds_*.json`` file.

    Args:
        model: Hugging Face repo id passed to ``HuggingFaceEndpoint``.
        data_to_predict: Samples forwarded to ``getting_Gemma_batch_outputs``.
        nm_model: Short model name used in the saved file names.

    Returns:
        dict: Maps ``'{prompt_type}_{experiment_type}'`` to a deep copy of the
        predictions produced for that setting.
    """
    exp_types = ['description_guided', 'question_guided']
    prompt_types = ['batch']

    predictions = {}
    llm_hf = HuggingFaceEndpoint(repo_id=model, temperature=0.01, max_new_tokens=128)
    # The instruction file is the same for every setting, so read it once.
    instruction_template = read_json_file('instruction_template.json')

    for p_type in prompt_types:
        for e_type in exp_types:
            # Role definitions and instruction text are specific to the setting.
            event_roles = read_json_file(f'role_definitions_{e_type}.json')
            instruction = instruction_template[f'{p_type}-{e_type}']

            print(model, p_type, e_type)

            # Label shown in the prompt header: descriptions vs. questions.
            extraction_type = 'Descriptions' if e_type == 'description_guided' else 'Questions'
            preds = getting_Gemma_batch_outputs(
                llm_hf, data_to_predict, instruction, event_roles, extraction_type
            )

            # Take one timestamp so the date and time parts always belong together.
            now = datetime.now().replace(second=0, microsecond=0)
            save_json(preds, f'preds_{nm_model}_{p_type}_{e_type}_{now.date()}_{now.time()}.json')

            # Deep-copy so the results stored for this setting cannot be changed
            # by a later run that reuses or modifies the same sample dictionaries.
            predictions[f'{p_type}_{e_type}'] = copy.deepcopy(preds)
    return predictions


CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 11 µs


In [None]:
%%time
lm_models = ['google/gemma-1.1-7b-it'] ## change this with huggingface model string that you want to use
model_name = ['gemma-1.1-7b-it'] ## for saving the models in the folder (same order as lm_models)

folder_path = 'Argument-Extraction-Predictions' #Choose your path
# Make sure the output folder exists before anything is written into it.
os.makedirs(folder_path, exist_ok=True)

# Two independent runs (x = 1, 2) over the test split for every model.
for x in range(1, 3):
    # Pair each model id with its own short name; a fixed index would save
    # every model's results under the first name.
    for model, nm_model in zip(lm_models, model_name):
        predictions = get_predictions(model, test, nm_model)
        nm = f'{x}-{nm_model}_test_100_predictions.json'
        # Keep one copy in the working directory and one in the output folder.
        save_json(predictions, nm)
        save_json(predictions, os.path.join(folder_path, nm))


In [None]:
# Display the predictions from the most recent get_predictions call in the cell above.
predictions