# Testing prompts

This workbook is for experimenting with the prompting of the models that we use, in order to address the issues that Wilhelm identified.

In [2]:
# Importing required libraries

# numpy is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

# pandas is used for data manipulation and analysis
import pandas as pd

# TSNE from sklearn.manifold is used for dimensionality reduction
from sklearn.manifold import TSNE

# matplotlib.pyplot is used for creating static, animated, and interactive visualizations in Python
import matplotlib.pyplot as plt

# SentenceTransformer is used for training and using transformer models for generating sentence embeddings
from sentence_transformers import SentenceTransformer

# tqdm is used to make loops show a smart progress meter
from tqdm import tqdm

# torch is the main package in PyTorch, it provides a multi-dimensional array with support for autograd operations like backward()
import torch

# AutoModelForCausalLM, AutoTokenizer, pipeline are from the transformers library by Hugging Face which provides state-of-the-art machine learning models like BERT, GPT-2, etc.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# euclidean distance and cosine distance
from scipy.spatial import distance

In [13]:
# Sentence-embedding model.
# Alternative checkpoints: "all-MiniLM-L6-v2", "sentence-transformers/gtr-t5-xxl".
model = SentenceTransformer(model_name_or_path="sentence-t5-large")

# Hugging Face id of the generative model; the tokenizer is loaded from the same id
# so that it matches the model used by the text-generation pipeline.
modelP = "microsoft/Phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(modelP)

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Text fragments of the section under test, one list element per fragment.
# Empty strings separate the individual "--\t..." requirement lines; whitespace inside
# the fragments (double spaces, trailing blanks) is kept exactly as in the source text.
content = [
    "",
    "--\tPBSP and PBCP (including Mobile PBCP) should be able to negotiate QoS and cost for content distribution and latency.",
    "",
    "--\tPBSP and PBCP (including Mobile PBCP) should be able to request notification prior to content transmission.",
    "",
    "--\tNotification may be given to BSUs prior to streaming session initiation.",
    "",
    "--\tBSUs may be able to choose categories of content to be notified about, e.g. from an electronic service guide.",
    "",
    "--\tBSUs should be able to access PBS service using third party user interface software.",
    "",
    "--\tStatistical information (e.g number of BSUs, resource usage) should be collected and make available  to PBSP and PBCP.",
    "",
    "--\tIt should be possible that failure of BSU access (e.g. resource shortage or UE incapability) be reported to PBCP.",
    "",
    "--\tHandover within the same radio access technology should be transparent to BSU.",
    "",
    "--\tLatency for content switching should be within an acceptable bound to user perception.",
    "",
    "--\tQoS should be supported.",
    "",
    "--\tWhen macro cell and Home (e)NodeB cell are combined, service continuity between the cells should be supported. ",
    "",
]

In [15]:
# Build the prompt for the generative model from `content` and run the generation.
#
# Reads (from earlier cells): content, modelP, tokenizer, pipeline.
# Writes: content_str, section, document, strContent, strOut, output
#         (plus messages, pipe, generation_args when a prompt is generated).
# `output` always ends up as a list whose first element has a "generated_text" key,
# so the cell that prints output[0]["generated_text"] works for every branch.

# Flatten the text fragments into one string that is embedded in the prompt.
content_str = " ".join(content)

# The Figure/Table prompts interpolate `section` and `document`, which this cell does not
# define. Reuse the notebook-level values when they exist; otherwise fall back to neutral
# placeholders so that a fresh "Restart & Run All" does not stop with a NameError.
section = globals().get("section", "this section")
document = globals().get("document", "the source document")

strContent = None  # prompt text; stays None when there is nothing to generate from
strOut = None      # message shown instead of a generated requirement

# check if the text contains the word Figure
# if it does, then we add a footnote before to warn the user
if "Figure" in content_str:
    # earlier prompt variant, kept for comparison:
    #strContent = f"Based on this : {content_str}. Write the requirement in the following format 'The system shall '. Start with '* This part of the standard contains a figure, the generated requirement can be inaccurate, please consult the original text for details.' "
    strContent = f"Based on this : {content_str}, write the requirement about {section} from {document}. Add this text at the beginning: '* This part of the standard contains a figure, the generated requirement can be inaccurate, please consult the original text for details.' "
# the same for tables, at least the ones that we can identify
elif "Table " in content_str[:10]:
    # earlier prompt variant, kept for comparison:
    #strContent = f"Summarize this table. {content_str}. Based on this summary, write the requirement in the following format 'The system shall '. Start with '* This part of the standard contains a table, the generated requirement can be inaccurate, please consult the original text for details.' "
    strContent = f"Summarize this table. {content_str}. Based on this summary, write a requirement about {section} from {document}. Add this text at the beginning '* This part of the standard contains a table, the generated requirement can be inaccurate, please consult the original text for details.' "
# and for the empty text, e.g., when the word latency is only in the title
# we do not generate anything and warn the user
# (strip first: joining several empty fragments yields a whitespace-only string whose
# length is >= 2, which the plain length check would not recognise as empty)
elif len(content_str.strip()) < 2:
    strOut = "This section is empty. The word latency is probably only in a section title"
# otherwise, we generate the requirement
else:
    # earlier prompt variant, kept for comparison:
    #strContent = f"Based on this : {content_str}. Write the requirement in the following format 'The system shall ' "
    strContent = f"Write a requirement based on this: {content_str}. "

if strContent is None:
    # Nothing to send to the model: mirror the structure of the pipeline result so that
    # downstream cells can read output[0]["generated_text"] and get the warning text.
    output = [{"generated_text": strOut}]
else:
    messages = [
        {"role": "user", "content": strContent},
    ]

    # NOTE(review): trust_remote_code=True executes code downloaded from the model
    # repository; consider pinning a revision, as the download log itself suggests.
    pipe = pipeline(
        "text-generation",
        model=modelP,
        tokenizer=tokenizer,
        trust_remote_code=True
    )

    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.0,
        # Greedy decoding made explicit: temperature only applies when sampling, and
        # passing it without do_sample=False makes transformers emit a warning.
        "do_sample": False,
    }

    output = pipe(messages, **generation_args)

config.json:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [None]:
# Show the text produced for the first (and only) prompt; print() keeps the line breaks
# of the generated text readable.
generated_text = output[0]["generated_text"]
print(generated_text)