# Fact Generator

In [54]:
!pip install --upgrade transformers sentencepiece symspellpy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [55]:
prompt_context="cars"

In [56]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import random

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def generate_random_fact(prompt=prompt_context, max_length=50, temperature=0.7, top_p=0.85):
    # Set a random seed for more varied output
    seed = random.randint(0, 10000)  # Generate a random seed
    torch.manual_seed(seed)
    random.seed(seed)

    # Tokenize input prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt")

    # Generate text based on the input prompt using nucleus sampling (top_p)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            num_return_sequences=1,  # Generate 3 different facts
            no_repeat_ngram_size=2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True  # Enable sampling for varied output
        )

    # Decode the generated outputs
    facts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    # Choose a random fact from the generated ones
    random_fact = random.choice(facts)
    
    # Extract fact after the prompt and remove trailing whitespace
    fact = random_fact[len(prompt):].strip()
    
    return fact

# Generate and print a random fact
random_fact = generate_random_fact()
print("Random Fact:", random_fact)
# random_fact=prompt_context + " "+ random_fact

Random Fact: .

"We are working to make sure that the new cars have a strong performance base and that they have good reliability. This is why we have taken a firm stance against any changes to the cars," said Luyendyk.


In [57]:
import re

def clean_text(text):
    # Remove special characters (, . " ) ( { } etc.)
    cleaned_text = re.sub(r"[^\w\s]", "", text)
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    # Strip leading and trailing whitespace
    return cleaned_text.strip()

cleaned_text = clean_text(random_fact)
print(random_fact)


.

"We are working to make sure that the new cars have a strong performance base and that they have good reliability. This is why we have taken a firm stance against any changes to the cars," said Luyendyk.


# Text Correction

In [58]:
# Write the incorrect text here. 
# random_fact="New Text"

## Method 1

In [59]:
from transformers import pipeline

fix_spelling = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base", device=-1)

print(fix_spelling(random_fact, max_length=2048))


Device set to use cpu


[{'generated_text': '. Applause We are working to make sure that the new cars have a strong performance base and that they have good reliability. This is why we have taken a firm stance against any changes to the cars, said Luyendyk.'}]


## Method 2

In [60]:
from symspellpy import SymSpell, Verbosity

def spell_check(text):
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = "en-80k.txt"
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    # Split the text into words
    words = text.split()
    corrected_words = []

    for word in words:
        # Get suggestions for each word
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        
        # If suggestions exist, take the first one; otherwise, keep the original word
        corrected_words.append(suggestions[0].term if suggestions else word)

    return " ".join(corrected_words)


corrected_text = spell_check(random_fact)

print("Original Text:", random_fact)
print("Corrected Text:", corrected_text)



2024-12-09 15:05:52,145: E symspellpy.symspellpy] Dictionary file not found at en-80k.txt.


Original Text: .

"We are working to make sure that the new cars have a strong performance base and that they have good reliability. This is why we have taken a firm stance against any changes to the cars," said Luyendyk.
Corrected Text: . "We are working to make sure that the new cars have a strong performance base and that they have good reliability. This is why we have taken a firm stance against any changes to the cars," said Luyendyk.


# Voice Generation

In [61]:
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
from IPython.display import Audio

# Load the TTS pipeline
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")

# Load speaker embedding dataset
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Generate speech
speech = synthesiser(corrected_text, forward_params={"speaker_embeddings": speaker_embedding})

# Save audio to a file
sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])

# Play audio in the notebook
Audio(data=speech["audio"], rate=speech["sampling_rate"])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Device set to use mps:0
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
