In [3]:
import keyphrase_dataset
import llm_class
import model_inference
import output_parser
import phrase_extraction_evaluation

from datetime import datetime
from constants import PHI_MODEL_NAME, EMBEDDING_MODEL_NAME
from model_eval import ModelEval


# Build the shared objects that the cells below rely on as notebook-level globals.
# NOTE(review): presumably LanguageModel() loads the model weights (the captured
# output of this cell shows checkpoint shards loading) — confirm in llm_class.
model = llm_class.LanguageModel()
keyphrase_set = keyphrase_dataset.KeyphraseDataset()
# The inference helper is constructed around the model instance created above.
inference = model_inference.ModelInferencing(model)


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def gradio_func(text_prompt: str, samples: int, experiment_name: str) -> str:
    """
    Gradio callback: evaluate a candidate prompt on a batch of dataset samples.

    Overwrites ``prompts.multiple_keyphrases_prompt`` with ``text_prompt``, draws
    ``samples`` items from the notebook-level ``keyphrase_set`` and hands them to
    ``ModelEval.multiple_samples`` together with the shared ``inference`` object.

    Args:
        text_prompt (str): The text prompt for generating key phrases.
        samples (int): The number of samples to test. Gradio sliders may deliver
            the value as a float (e.g. ``3.0``), so it is coerced to ``int``
            before being used as a count.
        experiment_name (str): The name of the experiment; forwarded as
            ``save_file`` to ``ModelEval.multiple_samples``.
    Returns:
        str: A one-line summary built from the ``'cosine'``, ``'matchings'``,
        ``'redundancy'`` and ``'groundness'`` entries of the evaluation result.
    """
    # NOTE(review): `prompts` is not imported in any visible cell of this notebook;
    # it must already be in the kernel namespace — confirm and add it to the import cell.
    prompts.multiple_keyphrases_prompt = text_prompt

    # The slider value can arrive as a float; downstream code expects a sample count.
    sample_count = int(samples)
    tests = keyphrase_set.get_samples(sample_count)
    results = ModelEval.multiple_samples(inference, tests, save_file=experiment_name, print_counts=True)

    return (
        f"Avg cosine: {results['cosine']} | "
        f"Avg labels matched: {results['matchings']} | "
        f"Avg redundancy: {results['redundancy']} | "
        f"Avg groundness: {results['groundness']}"
    )


# Prompt-testing UI: one input widget per `gradio_func` argument, in call order.
# NOTE(review): `gr` is not imported in any visible cell (presumably
# `import gradio as gr`) — confirm and add it to the import cell.
prompt_box = gr.Textbox(lines=2, placeholder="Enter your text prompt here...", label="Text Prompt")
sample_slider = gr.Slider(minimum=1, maximum=500, step=1, label="Number of tests")
experiment_box = gr.Textbox(lines=2, placeholder="Enter your experiment name here...", label="Experiment name")

interface = gr.Interface(
    fn=gradio_func,
    inputs=[prompt_box, sample_slider, experiment_box],
    outputs="text",
    title="Prompt Testing Interface",
)

# share=True also requests a public URL in addition to the local one
# (see the "Running on public URL" line in this cell's output).
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://5885a571e59563f0b4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Running test 1
Running test 2
Running test 3


In [8]:
# Debug: inspect one training example together with its metrics.
# NOTE(review): `get_metrics` is not defined or imported in any visible cell —
# confirm where it comes from before relying on this cell in a fresh kernel.
sample = keyphrase_set.train_set[1]

# Print the text and then the label, each followed by the same blank-line spacer.
for field in ('text', 'label'):
    print(sample[field])
    print("\n")
print(get_metrics(sample['text'], sample['label']))

E.M.T. Convicted of Sexual Attacks on 5 in Brooklyn An emergency medical technician with the Fire Department was convicted on Wednesday of a series of sexual assaults in Brooklyn, including an attack on an 11-year-old girl inside an elevator. The technician, Angus Pascall, 36, was convicted of first-degree rape, among other charges, for five separate attacks on young women and girls ages 11 to 22 stretching to 2001. Most of the assaults occurred in 2009 and 2010, the year he was arrested, the Kings County district attorney, Charles J. Hynes, said in a statement. Mr. Pascall’s lawyer, Edward Friedman, said his client would appeal the verdict. In each of the attacks, Mr. Pascall was armed, sometimes with a gun or a knife. In one attack on a 19-year-old woman in 2009, he used a machete, the district attorney said. In the assault on the 11-year-old, he used his emergency responder’s key to trap the victim inside an elevator. “Pascall then put a gun to her face and repeatedly sexually assau

In [11]:
# Debug: replace the prompt held by `model` with a JSON key-phrase extraction prompt.
# NOTE(review): the `&` characters look like placeholders that are filled in elsewhere
# (example page, example response, actual page text) — confirm against
# LanguageModel.set_prompt before editing the template.

model.set_prompt("""You are provided with the text extracted from a webpage, delimited by < for start and > for end. Your task is to extract the key phrases from the text that best characterize the webpage. You should extract at most 10 such phrases, but may extract less. Ensure the key phrases are relevant and provide a good summary of the content. Present the key phrases in JSON format, with each key phrase being an item in a list. Do not output anything but json of extracted keyphrases.

Example webpage to extract from: <&>

Your response should look like this:&

The text from the webpage: <&>""")

In [5]:
def visualize(text: str) -> str:
    """
    Gradio callback: extract key phrases from ``text`` and format them for display.

    Calls ``inference.get_multiple_phrases(text)`` on the notebook-level
    ``inference`` object and renders the result as a dash-bulleted list.

    Args:
        text (str): The input text to extract key phrases from.
    Returns:
        str: A header line followed by one ``- phrase`` line per extracted phrase,
        or a short "nothing found" message when no phrases were returned.
    """
    formatted_phrases = inference.get_multiple_phrases(text)

    # An empty or missing result would otherwise render a dangling "- " bullet
    # (or crash in str.join when the result is None).
    if not formatted_phrases:
        return "No key phrases were found in the text."

    # str() guards against non-string items, which str.join would reject.
    bullet_lines = "\n- ".join(str(phrase) for phrase in formatted_phrases)
    return "Key phrases in the text are:\n- " + bullet_lines


# Key-phrase extraction UI: a single text box whose content is passed to `visualize`.
# NOTE(review): `gr` is not imported in any visible cell (presumably
# `import gradio as gr`) — confirm and add it to the import cell.
source_text_box = gr.Textbox(
    lines=2,
    placeholder="Enter your text prompt here...",
    label="Text to extract the keywords from",
)

interface = gr.Interface(
    fn=visualize,
    inputs=[source_text_box],
    outputs="text",
    title="Key Phrase Extractor",
)

# share=True also requests a public URL in addition to the local one
# (see the "Running on public URL" line in this cell's output).
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://c5e37cd6bfc9efc8d4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


