In [1]:
# Import necessary libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

  from .autonotebook import tqdm as notebook_tqdm


### T5-small

In [2]:
# Load pre-trained T5-Small model and tokenizer
def load_model_tokenizer():
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    return model, tokenizer, device

In [3]:
# Function to summarize text using T5-Small
def summarize_text(model, tokenizer, device, input_text, min_length=30, max_length=100):
    input_ids = tokenizer("summarize: " + input_text, return_tensors='pt').input_ids.to(device)
    attention_mask = tokenizer("summarize: " + input_text, return_tensors='pt').attention_mask.to(device)
    
    # Generate summary
    output = model.generate(input_ids, 
                             attention_mask=attention_mask, 
                             num_beams=4, 
                             no_repeat_ngram_size=2, 
                             min_length=min_length, 
                             max_length=max_length)
    
    # Convert generated IDs to text
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary


In [13]:

# Main function
def main():
    model, tokenizer, device = load_model_tokenizer()
    
    # Example input text for summarization
    input_text = f"""
    Your task to summarize the transcribtion of the online meeting. Separate the dialog on logical parts and describe opinion of each speaker related to this part.
    Here the transcribtion:
    
    {test_transribtion}
    """
    
    # Summarize the input text
    summary = summarize_text(model, tokenizer, device, input_text)
    print("Summary:")
    print(summary)

if __name__ == "__main__":
    main()

Summary:
our Q2 sales are up by 15% compared to last year, with a total revenue of $1.2 million . our product team is working on some exciting updates that will further appeal to our target audience.


### T5-Base

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import re

# Load pre-trained T5-Base model and tokenizer
def load_model_tokenizer():
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    return model, tokenizer, device

# Function to summarize text using T5-Base
def summarize_text(model, tokenizer, device, input_text, min_length=30, max_length=500):
    input_ids = tokenizer("summarize: " + input_text, return_tensors='pt').input_ids.to(device)
    attention_mask = tokenizer("summarize: " + input_text, return_tensors='pt').attention_mask.to(device)
    
    # Generate summary
    output = model.generate(input_ids, 
                             attention_mask=attention_mask, 
                             num_beams=4, 
                             no_repeat_ngram_size=2, 
                             min_length=min_length, 
                             max_length=max_length)
    
    # Convert generated IDs to text
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary

# Main function
def main():
    model, tokenizer, device = load_model_tokenizer()
    cleaned_text = ''
    with open("../data/Sample.txt", "r") as file:
        content = file.read()
        text_without_timestamps = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', '', content)
        cleaned_text = re.sub(r'^\d$\n', '', text_without_timestamps, flags=re.MULTILINE)
    print(cleaned_text)
    # Example input text for summarization
    input_text = f"""
    Your task to summarize the transcribtion of the online meeting. Separate the dialog on logical parts and describe what each specker said in term of this theme.
    Here the transcribtion:
    {cleaned_text}
    """
    
    # Summarize the input text
    summary = summarize_text(model, tokenizer, device, input_text)
    print("Summary:")
    print(summary)

if __name__ == "__main__":
    main()


[SPEAKER_00]: And today, as Jim proved to us, without talking about artificial intelligence and large language models, I typically say artificial intelligence is autocorrect on steroids because all a large language model does is it predicts what's the most likely next word that you're going to use and then it extrapolates from there.


[SPEAKER_00]: So not really very intelligent.


[SPEAKER_00]: Obviously, the impact that it has on our lives and on the reality we live in is significant.


[SPEAKER_00]: Do you think we will see LLM written code that is submitted to you as a progress?


[SPEAKER_01]: I'm convinced it's going to happen, yes.


[SPEAKER_01]: And it may well be happening already, maybe on a smaller scale where people really use it more as a help in writing code.


[SPEAKER_01]: It's clearly something where automation has always helped people write code.


[SPEAKER_01]: I mean, this is not anything new at all.


[SPEAKER_01]: We don't write machine code anymore.

10

[SPEA

### Gemma 2b

In [9]:
from llama_cpp import Llama
import re

llm = Llama.from_pretrained(
	repo_id="bartowski/gemma-2-2b-it-GGUF",
	filename="gemma-2-2b-it-Q6_K_L.gguf",
)

llama_model_loader: loaded meta data with 39 key-value pairs and 288 tensors from /home/nick/.cache/huggingface/hub/models--bartowski--gemma-2-2b-it-GGUF/snapshots/855f67caed130e1befc571b52bd181be2e858883/./gemma-2-2b-it-Q6_K_L.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Gemma 2 2b It
llama_model_loader: - kv   3:                           general.finetune str              = it
llama_model_loader: - kv   4:                           general.basename str              = gemma-2
llama_model_loader: - kv   5:                         general.size_label str              = 2B
llama_model_loader: - kv   6:                    

In [13]:
def clean_text(input_text):
    """
    Removes lines containing only numbers from the input text.

    Args:
        input_text (str): The text to be cleaned.

    Returns:
        str: The cleaned text with number-only lines removed.
    """
    # Split the input text into lines
    lines = input_text.split('\n')
    
    # Filter out lines that contain only numbers (possibly with leading/trailing whitespace)
    cleaned_lines = [line for line in lines if not re.match(r'^\s*\d+\s*$', line)]
    
    # Join the cleaned lines back into a single string
    cleaned_text = '\n'.join(cleaned_lines)
    return cleaned_text

In [14]:
USER_CHAT_TEMPLATE = "<start_of_turn>user\n{prompt}<end_of_turn><eos>\n"

SUMMARIZE_PROMPT = """Online Meeting Summary & Insights Assistant
Task:
Analyze Discussion Structure: Identify and label key topics/subtopics discussed during the meeting.
Speaker-wise Subtopic Summaries: For each subtopic, provide a brief summary (1-2 sentences) of the key points mentioned by each relevant speaker.
Meeting Conclusion & Key Takeaways:
Short Conclusion (2-3 sentences): Outline the overall discussion and meeting outcome.
Emphasized Key Points (bullet points): Highlight the most crucial decisions, actions, or agreements from the meeting.
Input:

Meeting Transcript: 

{script}

Desired Output Format:

**Topic 1: [ Brief Topic Description ]**
* **Speaker [ID]**: [ Brief Summary of Speaker's Key Points (1-2 sentences) ]
* **Speaker [ID]**: [ Brief Summary of Speaker's Key Points (1-2 sentences) ]
*...

**Topic 2: [ Brief Topic Description ]**
* **Speaker [ID]**: [ Brief Summary of Speaker's Key Points (1-2 sentences) ]
* **Speaker [ID]**: [ Brief Summary of Speaker's Key Points (1-2 sentences) ]
*...

**Conclusion:**
* Brief summary of the overall discussion and meeting outcome (2-3 sentences)

**Key Takeaways:**
* • Crucial Decision/Action 1
* • Crucial Decision/Action 2
* •...

Answer:"""

cleaned_text = ''
with open("../data/Sample.txt", "r") as file:
    content = file.read()
    text_without_timestamps = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', '', content)
    cleaned_text = re.sub(r'^\d$\n', '', text_without_timestamps, flags=re.MULTILINE)
    cleaned_text = clean_text(cleaned_text)

final_prompt = USER_CHAT_TEMPLATE.format(
        prompt=SUMMARIZE_PROMPT.format(script=cleaned_text)
    )
final_prompt

"<start_of_turn>user\nOnline Meeting Summary & Insights Assistant\nTask:\nAnalyze Discussion Structure: Identify and label key topics/subtopics discussed during the meeting.\nSpeaker-wise Subtopic Summaries: For each subtopic, provide a brief summary (1-2 sentences) of the key points mentioned by each relevant speaker.\nMeeting Conclusion & Key Takeaways:\nShort Conclusion (2-3 sentences): Outline the overall discussion and meeting outcome.\nEmphasized Key Points (bullet points): Highlight the most crucial decisions, actions, or agreements from the meeting.\nInput:\n\nMeeting Transcript: \n\n\n[SPEAKER_00]: And today, as Jim proved to us, without talking about artificial intelligence and large language models, I typically say artificial intelligence is autocorrect on steroids because all a large language model does is it predicts what's the most likely next word that you're going to use and then it extrapolates from there.\n\n\n[SPEAKER_00]: So not really very intelligent.\n\n\n[SPEAKE

In [15]:
complition = llm.create_chat_completion(
	messages = [
		{
			"role": "user",
			"content": final_prompt
		}
	]
)
answer = complition['choices'][0]['message']['content']

ValueError: Requested tokens (1618) exceed context window of 512