<a href="https://colab.research.google.com/github/nancyAfycodes/Cognizant-Skills-Accelerator---GenSpark/blob/Prompt-Engineering/RLHF_Capstone_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Capstone Project


## User Story 1 - Personalized Chatbot for Customer Service

In [4]:
import transformers
import torch

def fine_tune_chatbot(user_story, solution, faq_data, model_name="distilgpt2", epochs=3, learning_rate=5e-5):
    """
    Fine-tunes a small language model (like DistilGPT-2) for a personalized chatbot.
    """
    tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_name)
    model = transformers.GPT2LMHeadModel.from_pretrained(model_name)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id

    train_data = []
    for faq in faq_data:
        prompt = f"Question: {faq['question']}\nAnswer: {faq['answer']}\n"
        train_data.append(prompt)

    tokenized_data = tokenizer(train_data, padding=True, truncation=True, return_tensors="pt")

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(**tokenized_data, labels=tokenized_data["input_ids"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

    return model, tokenizer

def generate_response(model, tokenizer, question):
    """
    Generates a response from the fine-tuned chatbot.
    """
    prompt = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer by finding the "Answer:" part and taking everything after it.
    answer_start = response.find("Answer:")
    if answer_start != -1:
        answer = response[answer_start + len("Answer:"):].strip()
        return answer
    else:
        return "Sorry, I couldn't generate a response."

# Example usage
user_story = "As a business owner, I want to create a personalized chatbot that can assist customers with frequently asked questions (FAQs) so that I can provide quick responses and improve customer satisfaction."
solution = "Use a pre-trained LLM like GPT and fine-tune it with customer interaction data to generate accurate and contextually relevant responses."
faq_data = [
    {"question": "What are your business hours?", "answer": "We are open from 9 AM to 5 PM, Monday to Friday."},
    {"question": "How do I return an item?", "answer": "You can return items within 30 days of purchase. Please visit our website for more details."},
    {"question": "Do you offer international shipping?", "answer": "Yes, we offer international shipping to select countries."},
    {"question": "What payment methods do you accept?", "answer": "We accept all major Credit Cards, PayPal, and Apple Pay."},
    {"question": "How do I track my order?", "answer": "You can track your order using the tracking number provided in your shipping confirmation email."},
]

fine_tuned_model, tokenizer = fine_tune_chatbot(user_story, solution, faq_data)

#question = "What are your business hours?"
#response = generate_response(fine_tuned_model, tokenizer, question)
#print(f"Question: {question}")
#print(f"Answer: {response}")

#question = "How do I return an item?"
#response = generate_response(fine_tuned_model, tokenizer, question)
#print(f"Question: {question}")
#print(f"Answer: {response}")

question = "Do you ship to Canada?"
response = generate_response(fine_tuned_model, tokenizer, question)
print(f"Question: {question}")
print(f"Answer: {response}")

Epoch 1/3, Loss: 4.286881923675537
Epoch 2/3, Loss: 3.6967740058898926
Epoch 3/3, Loss: 3.0351016521453857
Question: What are your business hours?
Answer: 
Question: How do I return an item?
Answer: 
Question: Do you ship to Canada?
Answer: No.


## User Story 2 - Summarizations for Research Papers

### The article from the BBC is an article on how to spot fake art pieces from famous artists.

In [None]:
# import libraries
import requests
from bs4 import BeautifulSoup

# Function to extract article text
def extract_article(url):
    response = requests.get(url)

    if response.status_code != 200:
        print("Failed to retrieve the article.")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all paragraphs in the article
    paragraphs = soup.find_all('p')  # More generic approach

    # Combine text from all paragraphs
    article_text = ' '.join([p.get_text() for p in paragraphs])

    return article_text

# Extract the article text
url = 'https://www.bbc.com/culture/article/20250311-rembrandt-to-picasso-five-ways-to-spot-a-fake-masterpiece'
article_text = extract_article(url)

# Print first 100 characters to check
print(article_text[:100])

The recent discovery of an art forger's workshop reminds us of the long history of fraudulent artwor


In [None]:
# Summarize contents of article

from transformers import pipeline

# Initialize the Hugging Face summarizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarize the article text (split into chunks if too long)
max_length = 1024  # Max token length for BART
chunk_size = max_length - 20  # chunk size

# Split the article into chunks
chunks = [article_text[i:i + chunk_size] for i in range(0, len(article_text), chunk_size)]

# Summarize each chunk and combine the results
summary = ""
for chunk in chunks:
    summary += summarizer(chunk, max_length=220, min_length=20, do_sample=False)[0]['summary_text'] + "\n\n"

print(summary)

Device set to use cpu
Your max_length is set to 220, but your input_length is only 209. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=104)
Your max_length is set to 220, but your input_length is only 188. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)
Your max_length is set to 220, but your input_length is only 217. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=108)
Your max_length is set to 220, but your input_length is only 205. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarize

Recent discovery of an art forger's workshop in Rome reminds us of the long history of fraudulent artworks. When it comes to falsification and phoniness, there is indeed no new thing under the Sun.

Authorities confiscated more than 70 fraudulent artworks falsely attributed to notable artists. The suspect, who has yet to be apprehended, is thought to have used online platforms such as Catawiki and eBay to hawk their phoney wares.

Samson and Delilah was painted by Flemish master Peter Paul Rubens. It was purchased by the London museum in 1980 for £2.5m. The National Gallery stands by its attribution.

The National Gallery's Technical Bulletin in 1983 said the painting was of the highest aesthetic quality. The divergence of opinion between the museum's experts and those who doubt the work's authenticity opens a curious space in which to reflect on intriguing questions of artistic value and merit.

German art forger Wolfgang Beltracchi and his wife Helene were caught out in 2006. The pai

## User Story 3 - Text-based Sentiment Analysis for Social Media Monitoring
### Yelp Review Polarity Dataset

### Fine-tuning of an LLM for Sentiment Analysis

In [None]:
# Load Pre-trained model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Prepare dataset
from datasets import load_dataset
dataset = load_dataset("yelp_polarity")

# Select first 100 rows for training and 300 rows for validation
train_data = dataset['train'].select(range(1000)) # mini dataset
eval_data = dataset['test'].select(range(300))  # validation set


# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train = train_data.map(tokenize_function, batched=True) # training data
tokenized_eval = eval_data.map(tokenize_function, batched=True) # test dataset

README.md:   0%|          | 0.00/8.93k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/256M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Set up trainer
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
output_dir="./results", evaluation_strategy="epoch",
learning_rate=2e-5, per_device_train_batch_size=16,
num_train_epochs=10, weight_decay=0.01,
)
trainer = Trainer(
model=model, args=training_args,
train_dataset=tokenized_train,
eval_dataset=tokenized_eval,
)

# Train model
trainer.train()

# Save fine-tune model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33moma001[0m ([33moma001-santa-monica-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.203324
2,No log,0.145337
3,No log,0.293036
4,No log,0.255396
5,No log,0.267993
6,No log,0.28293
7,No log,0.282513
8,0.109400,0.329523
9,0.109400,0.323483
10,0.109400,0.324089


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [None]:
# Evaluate fine-tune model using metrics
results = trainer.evaluate()
print(results)

{'eval_loss': 0.3240886926651001, 'eval_runtime': 4.0359, 'eval_samples_per_second': 74.332, 'eval_steps_per_second': 9.415, 'epoch': 10.0}


In [None]:
# Detail metrics using sklearn
from sklearn.metrics import classification_report
predictions = trainer.predict(tokenized_eval)
y_pred = predictions.predictions.argmax(axis=1)
y_true = tokenized_eval['label']
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       145
           1       0.96      0.92      0.94       155

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.94      0.94      0.94       300

