In [5]:
# Install necessary libraries
!pip install scikit-learn
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

# Load dataset
newsgroups = fetch_20newsgroups(subset='all', categories=['sci.space', 'rec.sport.baseball'])
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Initialize and train classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
# Function to predict category of new text
def predict_category(text):
    text_tfidf = vectorizer.transform([text])
    prediction = clf.predict(text_tfidf)
    return newsgroups.target_names[prediction[0]]

# Example usage
sample_text = "The shuttle successfully launched into orbit."
print(f"Predicted Category: {predict_category(sample_text)}")


Model Accuracy: 1.00
Predicted Category: sci.space


In [8]:
!pip install transformers datasets
from datasets import load_dataset
from transformers import AutoTokenizer

# Load a small subset of the dataset
dataset = load_dataset("daily_dialog")
train_data = dataset["train"].select(range(100))  # Select only 100 samples

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the data
def preprocess_function(examples):
    inputs = [" ".join(dialogue) for dialogue in examples["dialog"]]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
    return model_inputs

tokenized_data = train_data.map(preprocess_function, batched=True)

from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the model
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./distilgpt2-dialogue",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=10,
    save_total_limit=2,
    report_to="none"
)

# Define data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    data_collator=data_collator,
)

# Train the model
trainer.train()
from transformers import pipeline

# Create a text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt
prompt = "Hello, how are you?"

# Generate a response
response = generator(prompt, max_new_tokens=50, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)

print(response[0]['generated_text'])




Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss
10,3.5064
20,3.5247
30,3.3645
40,3.2968
50,3.3447


Device set to use cpu


Hello, how are you? I'm a great teacher and I'm very excited to hear your student come to a class. I'm also a student in a class. I'm a very good teacher, and I'm very well-trained. I think that's a big deal


In [11]:
!pip install transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
model_id = "EleutherAI/gpt-neo-125M"  # Small, CPU-friendly, ~2.7K context
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model_id = "EleutherAI/gpt-neo-125M"  # Small, CPU-friendly, ~2.7K context
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

base_prompt = "This is a story about a village where every person has secrets. "
long_prompt = base_prompt * 50

print(f"Prompt token length: {len(tokenizer.tokenize(long_prompt))}")

response = generator(
    long_prompt,
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7
)

print(response[0]['generated_text'])





Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt token length: 651
This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story about a village where every person has secrets. This is a story

In [12]:
!pip install transformers datasets
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
# Load small slices
imdb = load_dataset("imdb", split="train[:100]")
dialog = load_dataset("daily_dialog", split="train[:100]")

# Convert to instruction-style format
def format_imdb(example):
    return {
        "text": f"### Task: Sentiment Classification\nReview: {example['text']}\nSentiment: {'positive' if example['label'] == 1 else 'negative'}"
    }

def format_dialog(example):
    joined = " ".join(example["dialog"])
    return {
        "text": f"### Task: Dialogue Generation\n{joined}\nResponse:"
    }

# Apply formatting
formatted_imdb = imdb.map(format_imdb)
formatted_dialog = dialog.map(format_dialog)

# Combine datasets
combined_texts = [x["text"] for x in formatted_imdb] + [x["text"] for x in formatted_dialog]
combined_dataset = Dataset.from_dict({"text": combined_texts})
model_id = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = tokens["input_ids"]
    return tokens

tokenized_dataset = combined_dataset.map(tokenize)
model = AutoModelForCausalLM.from_pretrained(model_id)

training_args = TrainingArguments(
    output_dir="./multi-task-output",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    max_steps=5,
    logging_steps=1,
    save_strategy="no",
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Try sentiment classification prompt
prompt1 = "### Task: Sentiment Classification\nReview: I loved the visuals and characters.\nSentiment:"
print(generator(prompt1, max_new_tokens=30)[0]["generated_text"])

# Try dialogue prompt
prompt2 = "### Task: Dialogue Generation\nHello, how are you? I'm doing well.\nResponse:"
print(generator(prompt2, max_new_tokens=30)[0]["generated_text"])




Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

Step,Training Loss
1,10.7264
2,10.7007
3,10.7413
4,10.7387
5,10.7395


Device set to use cpu


### Task: Sentiment Classification
Review: I loved the visuals and characters.
Sentiment: ProbSherSher Jratisf credibility Daniel stairs HancockSher confir conservation reviewinghibit ESV Hancockoho trilogy Motorola vendors Money stairsScenehibitoho TAatisfiken Boonepublic
### Task: Dialogue Generation
Hello, how are you? I'm doing well.
Response: credibility confir stairs Money004dit directly Habit Rhiken Motorola Daniel vendors scalp antibiotic autonomyoother heiroother Participation subst Rh Motorola004 vendors heirdit antibiotic autonomy circumcised
