#### Import Modlues

In [1]:
import sys, os
import cohere
from dotenv import load_dotenv
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from read_write_util import ReadWriteUtil
from prompt_pipeline import PromptPipeline

#### Read Cleaned Data

In [2]:
reader = ReadWriteUtil()
API_KEY = os.getenv('API_KEY')
# print(API_KEY)
#Initialize reader and API_KEY

In [3]:
train_data =  reader.dvc_get_data('../data/job_description_train_cleaned.json', 'jdtrain_v1_cleaned')
train_data[0]

2022-09-14 18:37:19,602:logger:../data/job_description_train_cleaned.json with version jdtrain_v1_cleaned Loaded


{'document': "Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience",
 'tokens': [{'text': 'Bachelor', 'entityLabel': 'DIPLOMA'},
  {'text': 'Mechanical Engineering', 'entityLabel': 'DIPLOMA_MAJOR'},
  {'text': 'Physical Science', 'entityLabel': 'DIPLOMA_MAJOR'},
  {'text': '3+ years', 'entityLabel': 'EXPERIENCE'},
  {'text': 'developing', 'entityLabel': 'SKILLS'},
 

In [4]:
test_data = reader.dvc_get_data('../data/job_description_test_cleaned.json', 'jdtest_v1_cleaned')
test_data[0]

2022-09-14 18:37:19,665:logger:../data/job_description_test_cleaned.json with version jdtest_v1_cleaned Loaded


{'document': '\nCurrently holding a faculty, industry, or government researcher position.\nPh.D. and publications in machine learning, AI, computer science, statistics, applied mathematics, data science, or related technical fields.\nExperience leading a team in solving analytical problems using quantitative approaches.\nExperience manipulating and analyzing data from different sources.\nExperience in theoretical and empirical research and for answering questions with research.\nAbility to communicate research for public audiences of peers.\nKnowledge in a programming language.\nAbility to obtain and maintain work authorization in the country of employment in 2018.\n\nPREFERRED \n1+ year(s) of work experience in a university, industry, or government lab(s), in a role with primary emphasis on AI research.\nExperience driving original scholarship in collaboration with a team.\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, CVPR, ICML, ICLR, ICCV, and ACL).\nExperie

### Try Different Prompt Techniques

`We are going to use cohere to extract entities from the job description, we are going to use different techniques to come up with an effective and dynamic prompt`

#### Initialize our Cohere Client

In [5]:
co = cohere.Client(API_KEY)

#### Technique 1: Choosing the first 5 in our training json as a prompt to be used by cohere's LLM Generate

In [21]:
tech1_train_data = train_data[:5]
tech1_train_data
#Picking the first two arrays from our json

[{'document': "Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience",
  'tokens': [{'text': 'Bachelor', 'entityLabel': 'DIPLOMA'},
   {'text': 'Mechanical Engineering', 'entityLabel': 'DIPLOMA_MAJOR'},
   {'text': 'Physical Science', 'entityLabel': 'DIPLOMA_MAJOR'},
   {'text': '3+ years', 'entityLabel': 'EXPERIENCE'},
   {'text': 'developing', 'entityLabel': 'SKILL

#### Extract Values from json

In [22]:
propmt_pipeline = PromptPipeline()

`Lets use each of our pipeline function for this experiment`

In [23]:
val = propmt_pipeline.extract_values(tech1_train_data)
#The first thing is to extract all the data from our json

2022-09-14 18:48:37,264:logger:Values extracted.


In [29]:
tech1_test_data = test_data[7]
tech1_test_data
#Second pick job description from our test data lets pick the 2nd element

{'document': '\nGraduating with a Ph.D. in Computer Science or related field or relevant experience.\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, CVPR, ICML, ICLR, ICCV, and ACL).\nExperience leading a team in solving analytical problems using quantitative approaches.\nExperience manipulating and analyzing data from different sources.\nExperience in leading theoretical and empirical research and for answering questions with research.\nExperience communicating research for public audiences of peers.\nKnowledge in a programming language.\nMust be able to obtain and maintain work authorization in the country of employment in 2018.\n\nPREFERRED \nPrevious internship(s) and/or research assistantship(s) in an AI research organization.\nExperience in developing and debugging in C/C++, Python, C# and/or Java.\nHas previously completed a postdoctoral role in AI research.',
 'tokens': [{'text': 'Ph.D.', 'entityLabel': 'DIPLOMA'},
  {'text': 'Computer Science', 'entityLa

In [30]:
test_token_dict = propmt_pipeline.get_tokens(tech1_test_data)
test_token_dict
#Get all the tokens in dict format from our test data

2022-09-14 18:56:15,305:logger:Tokens converted to Dict


{'DIPLOMA': 'Ph.D.', 'DIPLOMA_MAJOR': 'Computer Science'}

In [34]:
prompt = f"{val} Job Description: {train_data[1]['document'].strip()}"
prompt
#Append our test text to our prompt 

"Job Description: Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience\nDIPLOMA: Bachelor\nDIPLOMA_MAJOR: Mechanical Engineering,Physical Science\nEXPERIENCE: 3+ years\nSKILLS: developing,fiber optic cables,connector related products\n\n--\nJob Description: 10+ years of software engineering work experience. Technical experience in release automation engineering, CI/

In [35]:
response = propmt_pipeline.send_request_to_cohere(co, prompt)
response
#Send our request to cohere API

2022-09-14 19:00:33,917:logger:Rquest successful


'\n EXPERIENCE: 10+ years,5+ years\n SKILLS: software engineering,technical management,release engineering,tools engineering,DevOps\n\n--'

In [36]:
response_dict = propmt_pipeline.process_response(response)
response_dict
#Process our response to convert to response dict

{'EXPERIENCE': ' 10+ years,5+ years',
 ' SKILLS': ' software engineering,technical management,release engineering,tools engineering,DevOps'}

In [37]:
tokens = propmt_pipeline.get_tokens(train_data[1])
tokens

2022-09-14 19:02:38,901:logger:Tokens converted to Dict


{'EXPERIENCE': '10+ years,5+ years',
 'SKILLS': 'software engineering,technical management,release engineering,tools engineering,DevOps',
 'DIPLOMA': 'BS/MS',
 'DIPLOMA_MAJOR': 'Computer Science'}

In [39]:
from difflib import SequenceMatcher
prediction_similarity = {}
for k,v in tokens.items():
    if k in response_dict.keys():
        prediction_similarity[k] = SequenceMatcher(None, v, response_dict[k]).ratio()
    else:
        prediction_similarity[k] = 0.0
prediction_similarity

#Get similarity dict

{'EXPERIENCE': 0.972972972972973,
 'SKILLS': 0.0,
 'DIPLOMA': 0.0,
 'DIPLOMA_MAJOR': 0.0}

`We can see that the response from the model wasn't effective`

In [18]:
my_vals = propmt_pipeline.extract_values(train_data)
with open("../data/job_description_train_fine_tune.txt", 'w')as f:
    f.write(my_vals)

2022-09-14 18:43:25,362:logger:Values extracted.


In [14]:

# response = co.generate( 
#   model='xlarge', 
#   prompt='Job Description: Bachelor\'s degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience\n\nDIPLOMA: Bachelor\nDIPLOMA_MAJOR: Mechanical Engineering, Physical Science\nEXPERIENCE: 3+ years\nSKILLS: developing, fiber optic cables, connector related products\n--\n', 
#   max_tokens=400, 
#   temperature=0.5, 
#   k=0, 
#   p=1, 
#   frequency_penalty=0, 
#   presence_penalty=0, 
#   stop_sequences=["--"], 
#   return_likelihoods='NONE') 
# print('Prediction: {}'.format(response.generations[0].text))