In [None]:
import pandas as pd
from openai import OpenAI
from ollama import Client

# Server address and the path appended to it; instantiate_client() joins the
# two to form the client's base URL.
SERVER = 'http://localhost:11434'
URI = '/v1/'

# NOTE(review): not referenced anywhere in the visible code -- the request in
# llama_get_response_oai() hard-codes max_tokens = 100. Confirm which value is
# intended.
MAX_OUTPUT_TOKENS = 50

# Alternative configuration, currently disabled (presumably for short trial
# runs -- confirm before re-enabling).
#TOTAL_RUNS = [1, 2, 3]
#MODELS = ['mistral', 'llama3.1:8b', 'llama3.1:70b']
#COLLEGES = ['College of Education', 'College of Science', 'College of Engineering and Computer Science']

# Active configuration.
# TOTAL_RUNS: each entry is passed to llama_gen_all() as the maximum number of
#             rows to process in one run.
# MODELS:     model names passed to the chat-completion request.
# COLLEGES:   values of the 'College' column used to filter the data.
TOTAL_RUNS = [10000]
MODELS =  ['mistral', 'llama3.1:8b', 'llama3.1:70b']
COLLEGES = ['College of Education', 'College of Engineering and Computer Science', 'College of Science', ]

# Text file whose content is the start of the system prompt (see gen_prompt).
PROMPT_FILE = 'prompt_simple.txt'

# Path of the input CSV, loaded once into `df`. The visible code reads the
# columns 'College', 'Title', 'Abstract' and 'Subject Headings' from it.
subject_headings_after_cleanup = "../DATA/subject_headings_after_cleanup_without_blanks.csv"
df = pd.read_csv(subject_headings_after_cleanup) 

# Extra instruction appended to every user query (see llama_generate_prompt).
# NOTE(review): "SEPARATATED" is misspelled; left unchanged here because the
# text is sent to the model and editing it changes the prompt.
ADDITIONAL_INSTRUCTION = ", ONLY PRINT THE HEADINGS AND NOTHING ELSE. OUTPUT THE HEADINGS AS A COMMA SEPARATATED VALUE."


In [None]:
# ---------------------------------------------------------------------------------------------
# Function definitions
def gen_prompt(PROMPT_FILE, college_df, COLLEGE):
    """Build the system prompt: the template text followed by the heading list.

    Args:
        PROMPT_FILE: Path of the text file holding the start of the prompt.
        college_df: DataFrame with a 'Subject Headings' column.
        COLLEGE: College name. Accepted so existing call sites keep working;
            it is not used when building the prompt.

    Returns:
        The file content, a space, and a bracketed, comma-separated list of
        the distinct headings, sorted and each wrapped in single quotes.

    Raises:
        OSError: If PROMPT_FILE cannot be opened.
        KeyError: If college_df has no 'Subject Headings' column.
    """
    # Explicit encoding so the text read does not depend on the platform's
    # default locale encoding.
    with open(PROMPT_FILE, 'r', encoding='utf-8') as file:
        prompt = file.read()

    # Drop missing values and coerce to str before sorting: sorted() raises
    # TypeError when NaN (a float) is compared with heading strings.
    headings = college_df['Subject Headings'].dropna().astype(str).unique()
    just_headings = ", ".join(f"'{item}'" for item in sorted(headings))
    return prompt + " " + "[" + just_headings + "]"


# Display dataframe info
def get_data_info(selected_columns):
    """Print a separator line, then the row count and the column count.

    Args:
        selected_columns: Object exposing a ``shape`` whose first two entries
            are the number of rows and the number of columns (the caller
            passes a DataFrame).
    """
    print("--------------------------------------------")
    dims = selected_columns.shape
    print("Number of rows:", dims[0])
    print("Number of columns:", dims[1])


def instantiate_client():
    """Create the OpenAI client used for all requests.

    The base URL is the module-level SERVER followed by URI, and the API key
    is the fixed string 'ollama'.

    Returns:
        The constructed ``OpenAI`` client.
    """
    endpoint = SERVER + URI
    return OpenAI(base_url=endpoint, api_key='ollama')


def llama_get_response_oai(client, MODEL, prompt, query):
    """Send one chat-completion request and return the first choice's message.

    Args:
        client: Client exposing ``chat.completions.create``.
        MODEL: Name of the model to query.
        prompt: Text sent as the system message.
        query: Text sent as the user message.

    Returns:
        The ``message`` of the first entry in the completion's ``choices``.
    """
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": query},
    ]
    # Sampling settings are fixed here; max_tokens is hard-coded to 100 and
    # does not read the module-level MAX_OUTPUT_TOKENS constant.
    completion = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        temperature=1,
        max_tokens=100,
        top_p=1,
    )
    return completion.choices[0].message


def llama_generate_prompt(title, abstract, additional_intructions=""):
    """Compose the user query for one record.

    Args:
        title: Record title; converted with ``str()``.
        abstract: Record abstract; converted with ``str()``.
        additional_intructions: Text placed on the last line of the query.

    Returns:
        The request sentence, the title, the abstract and the additional
        instructions, each separated by a newline.
    """
    pieces = (
        "Please find the subject headings for this example: ",
        str(title),
        str(abstract),
        additional_intructions,
    )
    return "\n".join(pieces)


def llama_gen_single(chosen_index, MODEL=None):
    """Run a single record through the model and print a report for it.

    Reads the module-level ``college_df``, ``client`` and ``prompt``, so these
    must be assigned before the function is called.

    Args:
        chosen_index: Index label of the row of ``college_df`` to process.
        MODEL: Name of the model to query. Defaults to the first entry of the
            module-level ``MODELS`` list.
    """
    if MODEL is None:
        MODEL = MODELS[0]

    title = college_df['Title'][chosen_index]
    abstract = college_df['Abstract'][chosen_index]
    # str() so a missing (NaN) heading cannot break the substring test,
    # mirroring what llama_gen_all does.
    subject_heading = str(college_df['Subject Headings'][chosen_index])
    query = llama_generate_prompt(title, abstract, ADDITIONAL_INSTRUCTION)

    # The model name must be passed: llama_get_response_oai takes
    # (client, MODEL, prompt, query) and fails with TypeError without it.
    response = llama_get_response_oai(client, MODEL, prompt, query)

    # content can be None when the reply carries no text.
    predicted_headings = response.content or ""

    # Case-insensitive match, consistent with the lower-casing in
    # llama_gen_all; the report still shows the text as returned.
    found = subject_heading.lower() in predicted_headings.lower()

    # Exactly one record is processed, so the count is 1 and the found tally
    # is 0 or 1.
    llama_generate_report(title, subject_heading, predicted_headings, 1, int(found))


def llama_generate_report(title, subject_heading, predicted_headings, count, foundHeading):
    """Print a per-record report with the running found rate.

    Args:
        title: Title of the record.
        subject_heading: The actual subject heading of the record.
        predicted_headings: Model output as one string; printed as the list
            obtained by splitting it on newlines.
        count: Number of records counted so far (divisor of the rate).
        foundHeading: Number of records whose heading was found so far.
    """
    print("\nTitle: ", title)
    print("ACTUAL Subject Headings: ", subject_heading)
    predicted_lines = predicted_headings.split('\n')
    print("PREDICTED Subject Headings: ", predicted_lines)
    print("Counted so far: ", count)
    print("Heading Found so far: ", foundHeading)
    # Percentage of counted records whose heading was found.
    found_rate = (foundHeading * 100) / count
    print("% Found Rate = ", found_rate)
    print("\n")
    

def llama_just_stat(MODEL, count, foundHeading):
    """Print the summary statistics for one model run.

    Args:
        MODEL: Name of the model the statistics belong to.
        count: Number of records that were counted.
        foundHeading: Number of records whose heading was found.
    """
    print("Summary for MODEL: ", MODEL)
    print("Counted : ", count)
    print("Heading Found : ", foundHeading)
    # A rate is undefined when nothing was counted; report that instead of
    # raising ZeroDivisionError (or printing a negative rate).
    if count > 0:
        print("% Found Rate = ", (foundHeading * 100) / count)
    else:
        print("% Found Rate = ", "n/a (no rows counted)")
 
# ---------------------------------------------------------------------------------------------
# Run for all data

def llama_gen_all(client, MODEL, prompt, NUMBER_OF_ITERATION):
    """Query the model for each row of ``college_df`` and print a summary.

    Rows are taken from the module-level ``college_df`` in order. Processing
    stops after NUMBER_OF_ITERATION rows or when the frame is exhausted,
    whichever comes first.

    Args:
        client: Client passed through to llama_get_response_oai.
        MODEL: Name of the model to query.
        prompt: System prompt sent with every request.
        NUMBER_OF_ITERATION: Maximum number of rows to process.
    """
    # Number of rows actually sent to the model. Counting only processed rows
    # keeps the total correct both when the limit stops the loop and when the
    # frame runs out first.
    count = 0
    foundHeading = 0

    for _, row in college_df.iterrows():
        if count >= NUMBER_OF_ITERATION:
            break
        count += 1

        query = llama_generate_prompt(row['Title'], row['Abstract'], ADDITIONAL_INSTRUCTION)
        response = llama_get_response_oai(client, MODEL, prompt, query)

        # Lower-case both sides so the match is case-insensitive; content can
        # be None when the reply carries no text.
        predicted_headings = (response.content or "").lower()
        subject_heading = str(row['Subject Headings']).lower()

        # Tally the found headings
        if subject_heading in predicted_headings:
            foundHeading += 1

    if count == 0:
        # Nothing was processed, so there is no rate to report.
        print("Summary for MODEL: ", MODEL)
        print("No rows were processed.")
        return

    llama_just_stat(MODEL, count, foundHeading)

# -------------------------------------------------------------------------------------------------
#  Main Program

# process a single abstract by providing the index
# llama_gen_single(1)

# One client is created up front and reused for every request.
client = instantiate_client()

for COLLEGE in COLLEGES:
    
    # Rows of this college only, re-indexed from zero. `college_df` is a
    # module-level name that llama_gen_all() and llama_gen_single() read
    # directly, so it must be assigned before they are called.
    college_df = df[df['College'] == COLLEGE].reset_index(drop=True)
    
    # Print the college name and the size of its data
    print ("\nCOLLEGE: ", COLLEGE)
    get_data_info(college_df)   

    # Build the system prompt from the prompt file plus this college's
    # distinct subject headings
    prompt = gen_prompt(PROMPT_FILE, college_df, COLLEGE)
  
    # Every model is run once per entry of TOTAL_RUNS, which sets the maximum
    # number of rows processed in that run.
    for MODEL in MODELS:
        for NUMBER_OF_ITERATION in TOTAL_RUNS:         
            print(f"\nModel: {MODEL}, Number of iteration: {NUMBER_OF_ITERATION}")
            llama_gen_all(client, MODEL, prompt, NUMBER_OF_ITERATION)            
        print ("\n") 
            
print ("\n******** DONE ********")