In [1]:
from datasets import load_dataset
from label_generation import generate_labels
from label_clustering import cluster_labels_gpt, make_clustering_prompt


## Data Pre-processing

In [2]:
# Fetch the query dataset from the Hugging Face Hub.
# Authenticate first (e.g. `huggingface-cli login`) to be able to access it.
hf_dataset_id = "jhu-clsp/astro-llms-full-query-data"
ds = load_dataset(hf_dataset_id)

In [12]:
# Build a dictionary mapping "<thread_ts>_<segment_id>" to the full user query.
data_processed = {}
full_data = False  # set to True to process every row instead of a small sample
train_split = ds['train']
# Cap the sample size at the split length so a split with fewer than 10 rows
# does not raise IndexError.
n_samples = len(train_split) if full_data else min(10, len(train_split))
# Some datasets have multiple segments per document; here there is only one
# segment, so the segment id is always 0.
segment_id = 0
for i in range(n_samples):
    row = train_split[i]  # fetch the row once rather than once per field
    thread_id = f"{row['thread_ts']}_{segment_id}"
    data_processed[thread_id] = row['full_user_query']
# Show the first 3 items as the cell output
list(data_processed.items())[:3]

[('1716992594_0',
  '<@U07524ZQSD8> How many papers published in 2022 used data from MAST missions?'),
 ('1716992673_0',
  '<@U07524ZQSD8> Can JWST measure unambiguous biosignatures in exoplanet atmospheres?'),
 ('1716992727_0', '<@U07524ZQSD8> What is the value of the Hubble Constant?')]

## Label Generation

In [13]:
# Pipeline settings; this dict is passed to both label generation and clustering.
config = {
    # Generation model
    "model_name": "gpt-4o-mini",

    # Data
    "dataset_name": "values",
    "dataset_path": "/data/afield6/oida_data/processed/values/",  # Change Me

    # Results
    "output_dir_root": "/data/afield6/oida_results/generation/",  # Change Me

    # Clustering
    "cluster_model_name": "gpt-4o-mini",
    "cluster_output_dir": ".",
}

In [9]:
# System prompt passed to label generation; it fixes the "LABEL: [...]" output format.
system_prompt = """
We are using the queries to this bot to conduct INDUCTIVE CODING. 
The coding aims to understand what type of questions or user intent these scientists use to raise queries.

Instruction:
- Label the input only when it is HIGHLY RELEVANT and USEFUL for understanding the user intent for querying the literature search bot from the astronomy scientists.
- Then, define the phrase of the label. The label description should be observational, concise and clear.
- ONLY output the label and DO NOT output any explanation.

Format:
- Define the label using the format "LABEL: [The phrase of the label]". 
- If there are multiple labels, each label is a new line. 
- If the input is irrelevant, use "LABEL: [Irrelevant]". 
"""


In [14]:
# Run label generation over the processed queries with the system prompt and pipeline config.
gen_result = generate_labels(data_processed, system_prompt, config)
# Display the result; it is passed to the clustering step later in the notebook.
gen_result

100%|██████████| 10/10 [00:14<00:00,  1.43s/it]


{'1716992594': {'LLM_Annotation': [{'sentence': '<@U07524ZQSD8> How many papers published in 2022 used data from MAST missions?',
    'label': ['Inquiry about publication statistics using specific data sources']}]},
 '1716992673': {'LLM_Annotation': [{'sentence': '<@U07524ZQSD8> Can JWST measure unambiguous biosignatures in exoplanet atmospheres?',
    'label': ['Biosignature detection in exoplanet atmospheres']}]},
 '1716992727': {'LLM_Annotation': [{'sentence': '<@U07524ZQSD8> What is the value of the Hubble Constant?',
    'label': ['Inquiry about a specific astronomical constant']}]},
 '1716992771': {'LLM_Annotation': [{'sentence': '<@U07524ZQSD8> What sort of questions can you answer? Provide your response in the form of a bulleted list.',
    'label': ['User inquiry about capabilities']}]},
 '1716992843': {'LLM_Annotation': [{'sentence': '<@U07524ZQSD8> How is the emission spectrum of a sunspot different from the spectrum of the solar photosphere?',
    'label': ['Comparative ana

## Hierarchical Clustering

In [16]:
# Build the clustering prompt for this dataset, then cluster the generated labels.
# NOTE(review): `dataset` is "astrobot" while config["dataset_name"] is "values" — confirm which is intended.
dataset = "astrobot"
cluster_prompt = make_clustering_prompt(dataset=dataset)
# The call is the cell's last expression, so its return value is displayed rather than stored.
cluster_labels_gpt(gen_result, cluster_prompt, config)


Number of batches: 1


100%|██████████| 1/1 [00:03<00:00,  3.23s/it]


{'User Capabilities Inquiry': ['User inquiry about capabilities',
  'Query for publication statistics'],
 'Research Focus Areas': ['Inquiry about promising research areas',
  'Inquiry about specific publication count related to cosmic noon'],
 'Publication Statistics': ['Inquiry about publication statistics using specific data sources'],
 'Astrophysical Research Topics': ['Biosignature detection in exoplanet atmospheres',
  'Inquiry about a specific astronomical constant',
  'Comparative analysis of emission spectra']}

## Evaluation

### Theme Precision and Recall

### Segment Precision and Recall