In [None]:
import discovery_utils

In [None]:
from src import PROJECT_DIR
import pandas as pd


In [None]:
from discovery_utils.utils.llm.batch_check import (
    LLMProcessor,
    generate_system_message
)

## Crunchbase data

In [None]:
# Canonical review outcomes that are kept for the comparison with the LLM.
labels = ["yes", "no", "maybe"]

# Raw values of the "RELEVANT?" column -> canonical outcome.
# Values that are not listed here map to NaN.
review_labels = {
    "y": "yes",
    "Yes - CR": "yes",
    "Yes - CR ": "yes",
    "y-CR": "yes",
    "Maybe - CR": "maybe",
    "maybe": "maybe",
    "Maybe": "maybe",
    "n": "no",
    "no": "no",
    "No - CR": "no",
    "No": "no",
}

# Load the reviewed Crunchbase sheet and normalise the reviewer's annotation column.
reviewed_data_df = pd.read_csv(
    PROJECT_DIR / "data/2024_12_MS/Cybersecurity - Mission studio 2012-12-16 - crunchbase.csv"
).rename(columns={"RELEVANT?": "relevant"})
reviewed_data_df["relevant"] = reviewed_data_df["relevant"].map(review_labels)

In [None]:
# Keep only rows whose normalised label is one of the canonical outcomes
# (rows with unmapped / missing annotations are dropped).
data_to_check_df = reviewed_data_df[reviewed_data_df["relevant"].isin(labels)].copy()

# {record id: short description} mapping handed to the LLM processor.
text_data = {
    record_id: description
    for record_id, description in zip(
        data_to_check_df["id"].tolist(),
        data_to_check_df["short_description"].tolist(),
    )
}
data_to_check_df.shape[0]

## Check with gpt-4o-mini

In [None]:
# JSONL file passed to LLMProcessor as output_path for this section's Crunchbase check.
output_file = PROJECT_DIR / "data/2024_12_MS/llm_check_v3.jsonl"

In [None]:
# Model under test in this section. It is passed explicitly so the run does not
# silently depend on whatever default model LLMProcessor uses.
gpt_model = "gpt-4o-mini"
# System prompt built from the project's config file.
system_message = generate_system_message("config.yaml")

processor = LLMProcessor(
    output_path=output_file,
    system_message=system_message,
    session_name="testing",
    model_name=gpt_model,
)


In [None]:
# Run the LLM check over the {id: short_description} mapping.
# NOTE(review): results are presumably written to output_file (given as output_path) — confirm in LLMProcessor.
processor.run(text_data)

In [None]:
def get_confusion_matrix(
    data_to_check_df: pd.DataFrame, output_file
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Compare LLM verdicts stored in a JSON-lines file with the reviewer labels.

    Args:
        data_to_check_df: Reviewed records. Must contain an ``id`` column and a
            ``relevant`` column holding the reviewer's label.
        output_file: Path to a JSON-lines file whose records contain at least
            ``id`` and ``is_relevant``.

    Returns:
        A tuple ``(confusion_matrix, confusion_matrix_p, df)``:

        * ``confusion_matrix`` - counts, rows indexed by the LLM's
          ``is_relevant`` and columns by the reviewer's ``relevant``.
        * ``confusion_matrix_p`` - the same table with every column divided by
          its total, i.e. the share of each reviewer label per LLM verdict.
        * ``df`` - the LLM output left-joined with ``data_to_check_df`` on ``id``.
    """
    # Left join keeps every LLM output row, even if its id is not in the reviewed data.
    df = (
        pd.read_json(output_file, lines=True)
        .merge(data_to_check_df, on='id', how='left')
    )
    # Counts: LLM verdict (rows) vs reviewer label (columns).
    confusion_matrix = pd.crosstab(df["is_relevant"], df["relevant"])
    # Normalise each reviewer-label column so that it sums to 1.
    confusion_matrix_p = confusion_matrix.div(confusion_matrix.sum(axis=0), axis=1)
    return confusion_matrix, confusion_matrix_p, df

In [None]:
# Compare the LLM output stored in output_file with the reviewer labels.
# `df` is the row-level table (LLM output left-joined with the reviewed records).
confusion_matrix, confusion_matrix_p, df = get_confusion_matrix(data_to_check_df, output_file)

In [None]:
# Counts: rows are the LLM's is_relevant, columns the reviewer's relevant label.
confusion_matrix

In [None]:
# Same table with each reviewer-label column divided by its column total.
confusion_matrix_p

In [None]:
# Share of rows where the LLM verdict equals the reviewer label,
# ignoring rows the reviewer labelled 'maybe'.
(
    df.loc[df["relevant"] != "maybe"]
    .pipe(lambda decided: decided["is_relevant"].eq(decided["relevant"]))
    .mean()
)

In [None]:
# Descriptions the reviewer marked 'yes' but the LLM marked 'no'.
df.loc[
    (df["is_relevant"] == "no") & (df["relevant"] == "yes"),
    "short_description",
].tolist()

## Check with gpt-4o

In [None]:
# Same check with gpt-4o, written to its own JSONL file.
# `system_message` is reused from the earlier cell.
gpt_model = "gpt-4o"
output_file = PROJECT_DIR / "data/2024_12_MS/llm_check_gpt4o.jsonl"

processor = LLMProcessor(
    model_name=gpt_model,
    system_message=system_message,
    session_name="testing",
    output_path=output_file,
)


In [None]:
# Run the gpt-4o processor over the same {id: short_description} mapping.
# NOTE(review): results are presumably written to output_file (given as output_path) — confirm in LLMProcessor.
processor.run(text_data)

In [None]:
# Reuse the shared helper instead of repeating the read/merge/crosstab logic.
# The row-level table is bound to `df_gpt4o` so that `df` keeps holding the
# results of the previous run, which a later cell merges with `df_gpt4o`.
confusion_matrix, confusion_matrix_p, df_gpt4o = get_confusion_matrix(data_to_check_df, output_file)
# Share of each reviewer label (columns) per LLM verdict (rows).
confusion_matrix_p

In [None]:
# Raw counts for the gpt-4o run (rows: LLM is_relevant, columns: reviewer relevant).
confusion_matrix

In [None]:
# Share of rows where the gpt-4o verdict equals the reviewer label,
# ignoring rows the reviewer labelled 'maybe'.
(
    df_gpt4o.loc[df_gpt4o["relevant"] != "maybe"]
    .pipe(lambda decided: decided["is_relevant"].eq(decided["relevant"]))
    .mean()
)

In [None]:
# Put both runs side by side: `is_relevant` comes from `df`,
# `is_relevant_gpt4o` from the gpt-4o results, matched on id.
df_merged = df.merge(
    df_gpt4o[["id", "is_relevant"]].rename(columns={"is_relevant": "is_relevant_gpt4o"}),
    on="id",
)
# Rows: verdict in `df`; columns: gpt-4o verdict.
pd.crosstab(index=df_merged["is_relevant"], columns=df_merged["is_relevant_gpt4o"])

In [None]:
# Descriptions that the run in `df` rejected ('no') but gpt-4o accepted ('yes').
df_merged.loc[
    (df_merged["is_relevant"] == "no") & (df_merged["is_relevant_gpt4o"] == "yes"),
    "short_description",
].tolist()

## UKRI data

In [None]:
# Canonical review outcomes that are kept for the comparison with the LLM.
labels = ["yes", "no", "maybe"]

# Raw values of the "RELEVANT?" column -> canonical outcome.
# Values that are not listed here map to NaN.
review_labels = {
    "y": "yes",
    "yes": "yes",
    "Maybe?": "maybe",
    "n": "no",
}

# Load the reviewed UKRI sheet and normalise the reviewer's annotation column.
# NOTE: this rebinds reviewed_data_df / data_to_check_df / text_data used by the Crunchbase cells.
reviewed_data_df = pd.read_csv(
    PROJECT_DIR / "data/2024_12_MS/Cybersecurity - Mission studio 2012-12-16 - ukri.csv"
).rename(columns={"RELEVANT?": "relevant"})
reviewed_data_df["relevant"] = reviewed_data_df["relevant"].map(review_labels)

# Keep only rows with a canonical label, then build the {id: abstract} mapping for the LLM.
data_to_check_df = reviewed_data_df[reviewed_data_df["relevant"].isin(labels)].copy()
text_data = {
    record_id: abstract
    for record_id, abstract in zip(
        data_to_check_df["id"].tolist(),
        data_to_check_df["abstractText"].tolist(),
    )
}
data_to_check_df.shape[0]

### Checking with gpt-4o-mini

In [None]:
# JSONL file passed to LLMProcessor as output_path for the UKRI gpt-4o-mini check.
output_file = PROJECT_DIR / "data/2024_12_MS/llm_check_gtr_gpt-4o-mini.jsonl"

In [None]:
# Model and system prompt for the UKRI gpt-4o-mini run.
gpt_model = "gpt-4o-mini"
system_message = generate_system_message("config.yaml")

# Output fields passed to LLMProcessor: a yes/no verdict plus a 0-1 relevance score.
fields = [
    dict(
        name="is_relevant",
        type="str",
        description="A one-word answer: 'yes' or 'no'.",
    ),
    dict(
        name="relevance_score",
        type="float",
        description="Score between 0 to 1 indicating how relevant it is: 0 = not relevant at all; 0.5 = maybe relevant; 1.0 = highly relevant.",
    ),
]

processor = LLMProcessor(
    model_name=gpt_model,
    system_message=system_message,
    output_fields=fields,
    output_path=output_file,
    session_name="testing",
)


In [None]:
# Run the LLM check over the {id: abstractText} mapping.
# NOTE(review): results are presumably written to output_file (given as output_path) — confirm in LLMProcessor.
processor.run(text_data)

In [None]:
# Compare the UKRI gpt-4o-mini output with the reviewer labels.
# NOTE: this rebinds `df`, `confusion_matrix` and `confusion_matrix_p`; the Crunchbase
# cells that read them must not be re-run after this point without recomputing them.
confusion_matrix, confusion_matrix_p, df = get_confusion_matrix(data_to_check_df, output_file)

In [None]:
# Counts: rows are the LLM's is_relevant, columns the reviewer's relevant label.
confusion_matrix

In [None]:
# Same table with each reviewer-label column divided by its column total.
confusion_matrix_p

In [None]:
# Share of rows where the LLM verdict equals the reviewer label,
# ignoring rows the reviewer labelled 'maybe'.
(
    df.loc[df["relevant"] != "maybe"]
    .pipe(lambda decided: decided["is_relevant"].eq(decided["relevant"]))
    .mean()
)

In [None]:
# Titles the reviewer marked 'yes' but the LLM marked 'no'.
df.loc[
    (df["is_relevant"] == "no") & (df["relevant"] == "yes"),
    "title",
].tolist()

In [None]:
# Summary statistics of relevance_score where both the LLM and the reviewer said 'no'.
df.loc[
    (df["is_relevant"] == "no") & (df["relevant"] == "no"),
    "relevance_score",
].describe()

In [None]:
# Summary statistics of relevance_score where the LLM said 'no' but the reviewer said 'yes'.
df.loc[
    (df["is_relevant"] == "no") & (df["relevant"] == "yes"),
    "relevance_score",
].describe()

In [None]:
# Summary statistics of relevance_score where the LLM said 'yes' but the reviewer said 'no'.
df.loc[
    (df["is_relevant"] == "yes") & (df["relevant"] == "no"),
    "relevance_score",
].describe()

In [None]:
# Summary statistics of relevance_score where both the LLM and the reviewer said 'yes'.
df.loc[
    (df["is_relevant"] == "yes") & (df["relevant"] == "yes"),
    "relevance_score",
].describe()

### Checking with gpt-4o

In [None]:
# JSONL file passed to LLMProcessor as output_path for the UKRI gpt-4o check.
output_file = PROJECT_DIR / "data/2024_12_MS/llm_check_gtr_gpt-4o.jsonl"

In [None]:
# Model and system prompt for the UKRI gpt-4o run.
gpt_model = "gpt-4o"
system_message = generate_system_message("config.yaml")

# Output fields passed to LLMProcessor: a yes/no verdict plus a 0-1 relevance score.
fields = [
    dict(
        name="is_relevant",
        type="str",
        description="A one-word answer: 'yes' or 'no'.",
    ),
    dict(
        name="relevance_score",
        type="float",
        description="Score between 0 to 1 indicating how relevant it is: 0 = not relevant at all; 0.5 = maybe relevant; 1.0 = highly relevant.",
    ),
]

processor = LLMProcessor(
    model_name=gpt_model,
    system_message=system_message,
    output_fields=fields,
    output_path=output_file,
    session_name="testing",
)


In [None]:
# Run the gpt-4o processor over the {id: abstractText} mapping.
# NOTE(review): results are presumably written to output_file (given as output_path) — confirm in LLMProcessor.
processor.run(text_data)

In [None]:
# Compare the UKRI gpt-4o output with the reviewer labels.
# NOTE: this rebinds `df`, `confusion_matrix` and `confusion_matrix_p`, replacing the gpt-4o-mini results.
confusion_matrix, confusion_matrix_p, df = get_confusion_matrix(data_to_check_df, output_file)

In [None]:
# Counts table with each reviewer-label column divided by its column total.
confusion_matrix_p

In [None]:
# Share of rows where the LLM verdict equals the reviewer label,
# ignoring rows the reviewer labelled 'maybe'.
(
    df.loc[df["relevant"] != "maybe"]
    .pipe(lambda decided: decided["is_relevant"].eq(decided["relevant"]))
    .mean()
)

In [None]:
# Counts: rows are the LLM's is_relevant, columns the reviewer's relevant label.
confusion_matrix

In [None]:
# Titles the LLM marked 'yes' but the reviewer marked 'no'.
df.loc[
    (df["is_relevant"] == "yes") & (df["relevant"] == "no"),
    "title",
].tolist()

In [None]:
# Summary statistics of relevance_score where the LLM said 'yes' but the reviewer said 'no'.
df.loc[
    (df["is_relevant"] == "yes") & (df["relevant"] == "no"),
    "relevance_score",
].describe()