In [15]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [14]:
import nltk

# Fetch the "punkt" package into the local nltk_data directory.
# The call is left as the last expression so its result is shown as the cell output.
nltk.download("punkt")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
pip install accelerate


Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.7.0-py3-none-any.whl (362 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.7.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint to load (gpt2-medium is used for stability)
model_name = "gpt2-medium"

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer: reuse the EOS token as the padding token to avoid warnings
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: load the pre-trained weights, mirror the padding setup in its config,
# switch to evaluation mode and move it to the selected device
model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = model.config.eos_token_id
model.eval()
model.to(device)

# Case-insensitive substrings that mark a sentence as promotional or off-topic.
# NOTE(review): these are plain substring matches, so e.g. "i was" also matches
# "AI was" and "course" matches "of course" — confirm this is intended.
_OFF_TOPIC_KEYWORDS = (
    "apply now", "http", "free view", "signup", "course", "program",
    "we are excited", "job offer", "game", "players", "experience points",
    "quests", "cv", "blog", "i was",
)


def _sample_continuation(inputs, max_new_tokens, **sampling_kwargs):
    """Sample one sequence from the model and decode only the newly generated tokens."""
    input_ids = inputs["input_ids"]
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        **sampling_kwargs,
    )
    # The returned sequence starts with the prompt tokens. Dropping them here,
    # before decoding, removes the prompt reliably even when the prompt itself
    # contains several sentences.
    prompt_length = input_ids.shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


def _split_sentences(text):
    """Split text on '. ' and return the stripped, non-empty pieces."""
    return [piece.strip() for piece in text.split('. ') if piece.strip()]


def _join_sentences(sentences):
    """Join sentences with '. ' and terminate the result without doubling punctuation."""
    text = '. '.join(sentences)
    if not text.endswith(('.', '!', '?')):
        text += '.'
    return text


# Function to generate text
def generate_text(prompt, max_new_tokens=200, num_candidates=5):
    """
    Generates text based on a given prompt.

    Several candidates are sampled; each is split into sentences, sentences
    containing an off-topic keyword are dropped, and candidates with fewer than
    4 remaining sentences are rejected. The longest surviving candidate
    (truncated to 5 sentences) is returned. If every candidate is rejected, one
    shorter, unfiltered sample is generated as a fallback.

    Args:
        prompt (str): The starting text prompt
        max_new_tokens (int): Maximum number of new tokens to generate
        num_candidates (int): Number of candidates to sample before falling back
    Returns:
        str: Generated text without the prompt, or
            "Unable to generate a response." if nothing usable was produced
    """
    # The prompt is identical for every candidate, so tokenize it only once
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        return_attention_mask=True,
    ).to(device)

    # Generate multiple candidates and select the best one
    best_text = ""
    best_score = -float('inf')
    for _ in range(num_candidates):
        continuation = _sample_continuation(
            inputs,
            max_new_tokens,
            no_repeat_ngram_size=3,
            repetition_penalty=1.8,
            top_k=50,
            top_p=0.9,
            temperature=0.8,
        )

        # Skip sentences with promotional content, URLs, or off-topic elements
        relevant_sentences = [
            sentence
            for sentence in _split_sentences(continuation)
            if not any(keyword in sentence.lower() for keyword in _OFF_TOPIC_KEYWORDS)
        ]

        # Require at least 4 usable sentences
        if len(relevant_sentences) < 4:
            continue

        # Truncate to 5 sentences for consistency
        processed_text = _join_sentences(relevant_sentences[:5])

        # Score the text based on length (simplified scoring); ties keep the earlier candidate
        score = len(processed_text)
        if score > best_score:
            best_score = score
            best_text = processed_text

    # Fallback: If no good candidate is found, generate a simpler response
    if not best_text:
        continuation = _sample_continuation(
            inputs,
            max_new_tokens // 2,  # Shorter for simplicity
            no_repeat_ngram_size=2,
            repetition_penalty=1.5,
            top_k=30,
            top_p=0.95,
            temperature=0.6,
        )
        fallback_sentences = _split_sentences(continuation)[:5]
        if fallback_sentences:
            best_text = _join_sentences(fallback_sentences)
        else:
            best_text = "Unable to generate a response."

    return best_text

# Define prompts with additional context for better guidance
prompts = [
    "Describe the future of artificial intelligence and its impact on society. Focus on how AI might improve healthcare, education, and transportation, while addressing potential challenges like job displacement and ethical concerns.",
    "Tell a story about a distant galaxy where a new species discovers space travel. Include details about the species, their planet, the technology they develop, and their first journey into space, ensuring the story is both epic and fun.",
    "Provide practical tips for a student to succeed in an AI internship. Offer specific advice on skills to learn, how to collaborate with a team, and how to stand out during the internship."
]

# Define absolute path for output file
# NOTE(review): this path is machine-specific and ends in a doubled ".txt.txt"
# extension; it is kept unchanged so the output location stays the same — confirm.
output_path = r"C:\Users\ASUS\Desktop\Codtech-AI-Internship\Task4\generated_text.txt.txt"

# Generate and save text
# The encoding is set explicitly: without it open() uses the platform's locale
# encoding, which can fail to encode characters that appear in generated text.
with open(output_path, "w", encoding="utf-8") as f:
    for prompt in prompts:
        generated_text = generate_text(prompt)
        f.write(f"Prompt: {prompt}\n")
        f.write(f"Generated Text: {generated_text}\n\n")