In [14]:
from load_data import read_data, read_env
import os

# Both paths are built by dropping the last three characters of the current
# working directory.
# NOTE(review): presumably the notebook is run from a three-letter sub-folder
# (e.g. "src") of the project root -- confirm; started from any other directory
# these paths will point at the wrong place.
ENV_PATH = os.getcwd()[:-3]
DATA_PATH = ENV_PATH + "/data"

env_vars = read_env(ENV_PATH)
df = read_data(DATA_PATH)

GPT_API_KEY = env_vars["GPT_API_KEY"]

# Drop rows whose "Labelling" value is missing
df = df.dropna(subset=["Labelling"])

# Preview goes last: only a cell's final expression is rendered
df.head()

In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    HumanMessage,
    SystemMessage
)

def get_response(summary_entry):
    """Return the chat model's short company-overview summary of ``summary_entry``.

    A fixed system prompt and ``summary_entry`` (as the human message) are sent
    to a ``ChatOpenAI`` client created with ``temperature=0`` and the
    module-level ``GPT_API_KEY``. The ``content`` of the reply is returned.
    """
    # Alternative system prompts, currently disabled:
    # SystemMessage(content="You are a helpful assistant that provides concise summary of about 4 to 6 sentences. No other fluff is added to your response."),
    # SystemMessage(content="You are a corporate lending specialist that provides concise summary of about 4 to 6 sentences. No other fluff is added to your response."),
    system_prompt = SystemMessage(content="You are a helpful assistant that provides a short summary of the companies overview. No other unnecessary information is added to your response.")
    user_prompt = HumanMessage(content=summary_entry)

    llm = ChatOpenAI(temperature=0, openai_api_key=GPT_API_KEY)
    reply = llm([system_prompt, user_prompt])

    return reply.content

In [16]:
from rouge import Rouge

# The scorer does not depend on the row, so build it once instead of once per
# iteration.
rouge = Rouge()

scores = []  # ROUGE-L F-score per row
ai = []      # generated summary per row

for _, row in df.iterrows():
    # Generated summary vs. the "Labelling" text for the same row
    ai_summary = get_response(row['item_1_short'])
    human_summary = row['Labelling']

    # get_scores(hypothesis, reference) returns a list with one dict per pair
    score = rouge.get_scores(ai_summary, human_summary)
    scores.append(score[0]['rouge-l']['f'])
    ai.append(ai_summary)

# Lists were filled in row order, so they line up with df positionally
df["ai_summary"] = ai
df["scores"] = scores

In [None]:
# Arithmetic mean of the per-row values in `scores`
# (raises ZeroDivisionError if `scores` is empty).
sum(scores) / len(scores)

0.4110147118461211

In [None]:
from openai.embeddings_utils import get_embedding, cosine_similarity

import openai

# get_embedding() authenticates through the openai module's global key. The
# visible cells only pass the key to ChatOpenAI, so set it explicitly here
# instead of relying on whatever the kernel or environment already configured.
openai.api_key = GPT_API_KEY

EMBEDDING_ENGINE = "text-embedding-ada-002"

cosine_score = []

for _, row in df.iterrows():
    # Embed the "Labelling" text and the generated summary with the same engine
    human_embedding = get_embedding(text=row["Labelling"], engine=EMBEDDING_ENGINE)
    ai_embedding = get_embedding(text=row["ai_summary"], engine=EMBEDDING_ENGINE)

    cosine_score.append(cosine_similarity(human_embedding, ai_embedding))

# One similarity value per row, in row order
df["cosine_score"] = cosine_score

In [None]:
# Arithmetic mean of the per-row values in `cosine_score`
# (raises ZeroDivisionError if `cosine_score` is empty).
sum(cosine_score) / len(cosine_score)

0.9532037006836448

In [None]:
# Write the frame to CSV in the current working directory, without the index
# column.
df.to_csv('results_improved_prompt_3.csv', index=False)