![Alt Text](https://raw.githubusercontent.com/msfasha/307304-Data-Mining/main/20242/images/header.png)

<div style="display: flex; justify-content: flex-start; align-items: center;">
   <a href="https://colab.research.google.com/github/msfasha/307307-BI-Methods/blob/main/20242-NLP-LLM/Part%203%20-%20Introduction%20to%20DL%20and%20LLMs/1-Introduction%20to%20DL%20and%20LLMs.ipynb" target="_parent"><img 
   src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
</div>

### Context-Aware Word Embeddings - BERT

In [None]:
%pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\me\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


### Display BERT Embeddings

In [3]:
from transformers import BertTokenizer, BertModel
import torch

# Fetch the pretrained uncased BERT checkpoint: one tokenizer, one encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example sentence; the word of interest is "bank"
sentence = "He went to the bank to deposit money."

# Encode as PyTorch tensors and recover the token strings for position lookup
inputs = tokenizer(sentence, return_tensors='pt')
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

# Forward pass only; gradients are not needed to inspect the embeddings
with torch.no_grad():
    outputs = model(**inputs)

# Drop the batch dimension, leaving one vector per token: (seq_len, hidden_size)
embeddings = outputs.last_hidden_state.squeeze(0)

# Show the contextual vector for "bank", or list the tokens if it is absent
if "bank" in tokens:
    idx = tokens.index("bank")
    bank_embedding = embeddings[idx]
    print(f"Embedding for 'bank':\n{bank_embedding}\n\nShape: {bank_embedding.shape}")
else:
    print("'bank' not found in tokenized input:", tokens)


Embedding for 'bank':
tensor([ 4.7019e-01, -1.9835e-01, -1.0122e-01, -1.3519e-01,  1.2612e+00,
        -9.6139e-03, -4.9014e-02,  1.0147e+00, -4.5361e-02,  1.7432e-01,
         1.2800e-01, -3.2356e-01, -1.3227e-01,  3.6582e-02, -7.8302e-01,
        -6.2770e-01,  5.2776e-01,  3.5693e-01,  1.3597e+00,  2.3784e-01,
        -3.0995e-01,  4.3136e-02,  3.2358e-01,  3.2144e-01,  3.3207e-01,
         4.5470e-01,  6.8660e-01,  5.2037e-01, -2.8076e-01, -5.2107e-01,
         5.3412e-01,  9.5313e-01,  3.6960e-01,  4.9074e-01,  1.0348e-01,
        -1.2543e-01,  1.8115e-01,  3.9604e-02, -1.1310e+00,  2.2161e-02,
        -4.4877e-01, -8.1382e-01, -6.2421e-01,  3.5284e-01, -2.4929e-01,
        -6.1539e-01,  1.9276e-01,  2.8171e-01, -7.0082e-01, -8.2422e-01,
        -3.0416e-01,  1.0278e+00,  4.3732e-01, -5.0054e-01,  1.1097e-01,
         4.7545e-01, -1.0476e+00, -4.6538e-01, -5.3300e-01, -2.1977e-01,
         7.0954e-01,  3.1443e-01,  5.0420e-01, -7.7659e-01,  2.3119e-01,
        -1.6568e-01,  4.8205e

#### Use BERT to Create Context-Aware Word Embeddings

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# A single checkpoint name keeps the tokenizer and the encoder in sync
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to extract contextual embedding for a word (handles subwords)
def get_token_embedding(sentence, target_word):
    """Return the contextual embedding of `target_word` as it appears in `sentence`.

    The target is tokenized with the same tokenizer as the sentence, so a word
    that is split into several subword tokens is located as one contiguous run
    and its vectors are averaged. Only the first occurrence is used.

    Args:
        sentence: Text to encode with the module-level `tokenizer` and `model`.
        target_word: Word to look up in the tokenized sentence.

    Returns:
        A 1-D tensor: the mean of the last-hidden-state vectors of the matched
        tokens. It carries no autograd graph.

    Raises:
        ValueError: If the target's tokens do not occur in the tokenized sentence.
    """
    inputs = tokenizer(sentence, return_tensors='pt')

    # Inference only: without no_grad every call would record an autograd graph
    # and hand back a tensor that still requires grad.
    with torch.no_grad():
        outputs = model(**inputs)

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    embeddings = outputs.last_hidden_state.squeeze(0)

    # Tokenize the target word the same way the sentence was tokenized
    target_tokens = tokenizer.tokenize(target_word)
    span = len(target_tokens)

    # Slide a window of the target's length over the sentence tokens and keep
    # the positions of the first exact match
    matches = []
    for start in range(len(tokens) - span + 1):
        if tokens[start:start + span] == target_tokens:
            matches = list(range(start, start + span))
            break

    if not matches:
        raise ValueError(f"'{target_word}' not found in tokens: {tokens}")

    # Average the embeddings over all subword tokens
    return embeddings[matches].mean(dim=0)

# Two senses of "apple", each paired with a reference word from the same domain
sentence_fruit = "He ate a fresh apple and enjoyed the fruit."
sentence_orange = "An orange is a juicy fruit."
sentence_company = "Apple released a new product in the computer market."
sentence_microsoft = "Microsoft computer was running the latest software."

# Fruit sense: "apple" compared with "orange"
apple_fruit = get_token_embedding(sentence_fruit, "apple")
orange = get_token_embedding(sentence_orange, "orange")
sim_fruit = F.cosine_similarity(apple_fruit, orange, dim=0)

# Company sense: "apple" compared with "Microsoft"
apple_company = get_token_embedding(sentence_company, "apple")
microsoft = get_token_embedding(sentence_microsoft, "Microsoft")
sim_company = F.cosine_similarity(apple_company, microsoft, dim=0)

# Report both cosine similarities
print(f"Similarity between 'apple' (fruit) and 'orange': {sim_fruit.item():.4f}")
print(f"Similarity between 'apple' (company) and 'Microsoft': {sim_company.item():.4f}")

Similarity between 'apple' (fruit) and 'orange': 0.5839
Similarity between 'apple' (company) and 'Microsoft': 0.8549


### Use BERT to Create Question-Answering Applications - Pipeline Approach

In [1]:
# Import required libraries 
from transformers import AutoTokenizer, AutoModelForQuestionAnswering 
from transformers import pipeline 
import torch 

# High-level API: build a question-answering pipeline whose model and tokenizer
# both come from the same checkpoint
qa_checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
qa_pipeline = pipeline(
    "question-answering",
    model=qa_checkpoint,
    tokenizer=qa_checkpoint,
)

# Passage the answer is extracted from (same text, written with explicit newlines)
context = (
    " BERT is a method of pre-training language representations, \n"
    "meaning that it trains a general-purpose language understanding \n"
    "model on a large text corpus (like Wikipedia), \n"
    "and then uses that model for downstream NLP tasks like question answering. "
)

question = "What is BERT?"
result = qa_pipeline(question=question, context=context)

# Print the extracted answer and its score
print(f"Answer: {result['answer']}")
print(f"Confidence: {result['score']:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This 

Answer: a method of pre-training language representations
Confidence: 0.6874


### BERT Fine-Tuning

In [None]:
# Required installations (uncomment if not already installed)
# !pip install transformers datasets scikit-learn

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Load your CSV file (replace with your actual path)
# NOTE(review): the classifier loaded later uses num_labels=2, so 'label' is
# presumably expected to hold the class ids 0 and 1 — confirm against your data.
df = pd.read_csv("amazon_reviews.csv")  # Columns: 'title', 'content', 'label'

# Combine title and content for input. Missing values are replaced by empty
# strings first: otherwise one missing field makes the whole text NaN, which the
# tokenizer cannot encode.
title_text = df["title"].fillna("").astype(str)
content_text = df["content"].fillna("").astype(str)
df["text"] = (title_text + " " + content_text).str.strip()
df = df[["text", "label"]]

# Split into train and validation (fixed seed for a reproducible split)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset format; preserve_index=False keeps the shuffled
# pandas index from being stored as an extra column
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Batch tokenizer applied to the datasets via Dataset.map
def tokenize_function(example):
    """Tokenize the "text" field of `example` with the module-level tokenizer.

    Sequences are padded to, and truncated at, 256 tokens. The tokenizer's
    return value is passed back unchanged.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 256}
    return tokenizer(example["text"], **encode_options)

# Tokenize both splits in batches
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors
torch_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=torch_columns)

# BERT encoder with a two-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the predicted class of each row
    is the argmax of the logits along axis 1.

    Returns:
        A dict with a single "accuracy" entry.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=1)
    accuracy = accuracy_score(references, predicted)
    return {"accuracy": accuracy}

# Define training arguments.
# Transformers 4.41 renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old keyword. Use whichever name the installed version
# declares so the cell runs on both old and new installs.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate once per epoch, matching the per-epoch checkpointing above
    **{eval_strategy_key: "epoch"},
)

# Trainer for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Start fine-tuning
trainer.train()
# Save the model
trainer.save_model("fine_tuned_bert_amazon_reviews")