<!-- ![Alt Text](https://raw.githubusercontent.com/msfasha/307304-Data-Mining/main/images/header.png) -->

<div style="display: flex; justify-content: flex-start; align-items: center;">
   <a href="https://colab.research.google.com/github/msfasha/307307-BI-Methods-Generative-AI/blob/main/20251/Module%205%20-%20Intro%20to%20Tansformers%20and%20Context%20Aware%20Embeddings/Context%20Aware%20Embeddings.ipynb" target="_parent">   
   <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
</div>

### Context-Aware Word Embeddings - BERT

In [None]:
%pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\me\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


### Display BERT Embeddings

#### Use BERT to Create Context-Aware Word Embeddings
Compare the embedding of "apple" (the fruit) with "orange", and the embedding of "Apple" (the company) with "Microsoft".

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# Tokenizer and encoder are both loaded from the same pretrained checkpoint name
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Function to extract contextual embedding for a word (handles subwords)
def get_token_embedding(sentence, target_word):
    """Return the contextual embedding of ``target_word`` as used in ``sentence``.

    The sentence is encoded with the module-level ``tokenizer`` and passed
    through the module-level ``model``. ``target_word`` is tokenized with the
    same tokenizer, and the FIRST position at which its token sequence occurs
    in the sentence's tokens is selected. When the word was split into several
    subword tokens, their vectors are averaged.

    Args:
        sentence: Text that contains the word of interest.
        target_word: Word whose embedding is wanted. It is matched after
            tokenization, not as a raw substring.

    Returns:
        The mean over the matched token positions of the model's
        ``last_hidden_state`` rows (one vector per call). The tensor is
        produced under ``torch.no_grad()`` and carries no autograd history.

    Raises:
        ValueError: If the tokenized ``target_word`` does not occur in the
            tokenized sentence (this includes an empty ``target_word``).
    """
    inputs = tokenizer(sentence, return_tensors='pt')

    # Inference only: without no_grad every call would record an autograd
    # graph and keep the intermediate activations alive for no benefit.
    with torch.no_grad():
        outputs = model(**inputs)

    # One token string and one embedding row per input id of the single sentence
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    embeddings = outputs.last_hidden_state.squeeze(0)

    # Tokenize the target the same way so subword pieces line up with `tokens`
    target_tokens = tokenizer.tokenize(target_word)
    span = len(target_tokens)

    # Slide a window of the target's length over the sentence tokens and
    # keep the indices of the first exact match
    matches = []
    for start in range(len(tokens) - span + 1):
        if tokens[start:start + span] == target_tokens:
            matches = list(range(start, start + span))
            break

    if not matches:
        raise ValueError(f"'{target_word}' not found in tokens: {tokens}")

    # Average the embeddings over all subword tokens of the match
    return embeddings[matches].mean(dim=0)

# Example sentences; the surrounding words set the context for each looked-up word
sentence_fruit = "He ate a fresh apple and enjoyed the fruit."
sentence_orange = "An orange is a juicy fruit."
sentence_company = "Apple released a new product in the computer market."
sentence_microsoft = "Microsoft computer was running the latest software."

# Fruit context: "apple" compared with "orange"
apple_fruit = get_token_embedding(sentence_fruit, "apple")
orange = get_token_embedding(sentence_orange, "orange")
sim_fruit = F.cosine_similarity(apple_fruit, orange, dim=0)

# Company context: "apple" compared with "Microsoft"
apple_company = get_token_embedding(sentence_company, "apple")
microsoft = get_token_embedding(sentence_microsoft, "Microsoft")
sim_company = F.cosine_similarity(apple_company, microsoft, dim=0)

# Report both cosine similarities, four decimal places each
print(
    f"Similarity between 'apple' (fruit) and 'orange': {sim_fruit.item():.4f}",
    f"Similarity between 'apple' (company) and 'Microsoft': {sim_company.item():.4f}",
    sep="\n",
)

Similarity between 'apple' (fruit) and 'orange': 0.5839
Similarity between 'apple' (company) and 'Microsoft': 0.8549


# NLP Pipelines

Basic Pipeline Usage
1. Text Classification (Sentiment Analysis)

In [None]:
from transformers import pipeline

# Sentiment-analysis pipeline; no model is named, so the library picks its default
classifier = pipeline("sentiment-analysis")

# Single input: the pipeline result is printed as returned
result = classifier("I love using Hugging Face!")
print(result)
# Output: [{'label': 'POSITIVE', 'score': 0.9998}]

# Batch input: pass a list and pair each text with its prediction
texts = [
    "I hate this product",
    "This is amazing!",
    "It's okay, nothing special",
]
results = classifier(texts)
for text, result in zip(texts, results):
    # One record per text, followed by a blank line
    print(
        f"Text: {text}\n"
        f"Sentiment: {result['label']}, Score: {result['score']:.4f}\n"
    )

2. Named Entity Recognition (NER)

In [None]:
# Named-entity recognition pipeline, requested with the "simple" aggregation strategy
ner = pipeline("ner", aggregation_strategy="simple")

text = "My name is John and I live in New York. I work at Google."
entities = ner(text)

# Print each detected entity with its label, score and character offsets,
# followed by a blank line
for entity in entities:
    print(
        f"Entity: {entity['word']}\n"
        f"Label: {entity['entity_group']}\n"
        f"Score: {entity['score']:.4f}\n"
        f"Start: {entity['start']}, End: {entity['end']}\n"
    )

3. Question Answering

In [None]:
# Extractive question answering; no model is named, so the library default is used
qa = pipeline("question-answering")

# Passage from which the answers are taken
context = """
Hugging Face is a company that develops tools for building applications using machine learning. 
They are especially known for their work in natural language processing. The company was founded in 2016 
and is headquartered in New York.
"""

questions = [
    "When was Hugging Face founded?",
    "Where is Hugging Face headquartered?",
    "What is Hugging Face known for?",
]

# Ask every question against the same passage; one record per question,
# followed by a blank line
for question in questions:
    result = qa(question=question, context=context)
    print(
        f"Question: {question}\n"
        f"Answer: {result['answer']}\n"
        f"Score: {result['score']:.4f}\n"
    )

4. Text Generation

In [None]:
# Text generation pipeline backed by GPT-2
generator = pipeline("text-generation", model="gpt2")

prompts = [
    "The future of artificial intelligence is",
    "In a world where robots exist,",
]

for prompt in prompts:
    # max_new_tokens bounds only the continuation, so every prompt gets the
    # same generation budget. The previous max_length=50 also counted the
    # prompt's tokens and is overridden whenever max_new_tokens is set.
    generated = generator(
        prompt,
        max_new_tokens=40,
        num_return_sequences=2,
        temperature=0.7,
        do_sample=True,  # sampling is required for temperature to have an effect
        # Reuse the EOS id for padding (presumably because GPT-2 defines no
        # pad token of its own - confirm against the model config)
        pad_token_id=generator.tokenizer.eos_token_id,
    )

    print(f"Prompt: {prompt}")
    # Number the returned sequences from 1; a blank line follows each one
    for number, gen in enumerate(generated, start=1):
        print(f"Generation {number}: {gen['generated_text']}\n")

5. Text Summarization

In [None]:
# Summarization pipeline; no model is named, so the library default is used
summarizer = pipeline("summarization")

# Text to be summarized
article = """
Machine learning is a subset of artificial intelligence that enables computers to learn and improve 
from experience without being explicitly programmed. It focuses on the development of computer programs 
that can access data and use it to learn for themselves. The process of learning begins with observations 
or data, such as examples, direct experience, or instruction, in order to look for patterns in data and 
make better decisions in the future based on the examples that we provide. The primary aim is to allow 
the computers to learn automatically without human intervention or assistance and adjust actions accordingly.
"""

# Sampling is disabled; length limits are passed through to the pipeline
summary = summarizer(article, max_length=50, min_length=25, do_sample=False)
summary_text = summary[0]['summary_text']

# Lengths are whitespace-separated word counts, not model tokens
print(f"Original length: {len(article.split())}")
print(f"Summary: {summary_text}")
print(f"Summary length: {len(summary_text.split())}")

6. Translation

In [None]:
# Translation pipeline using the Helsinki-NLP opus-mt-en-fr checkpoint
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")

texts = [
    "Hello, how are you today?",
    "Machine learning is fascinating.",
    "I would like to order a coffee.",
]

# Translate one sentence at a time and print source and translation,
# followed by a blank line
for text in texts:
    translated = translator(text)
    print(
        f"English: {text}\n"
        f"French: {translated[0]['translation_text']}\n"
    )

#### Use a Specific Model (e.g., BERT) to Create a Question-Answering Pipeline

In [1]:
# Import required libraries 
from transformers import AutoTokenizer, AutoModelForQuestionAnswering 
from transformers import pipeline 
import torch 

# High-level API: build a question-answering pipeline from one explicit
# checkpoint, used for both the model and its tokenizer
squad_checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
qa_pipeline = pipeline(
    "question-answering",
    model=squad_checkpoint,
    tokenizer=squad_checkpoint,
)

# Passage the answer is extracted from
context = """ BERT is a method of pre-training language representations, 
meaning that it trains a general-purpose language understanding 
model on a large text corpus (like Wikipedia), 
and then uses that model for downstream NLP tasks like question answering. """

# Ask a single question and report the answer with its score
question = "What is BERT?"
result = qa_pipeline(question=question, context=context)
print(
    f"Answer: {result['answer']}",
    f"Confidence: {result['score']:.4f}",
    sep="\n",
)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This 

Answer: a method of pre-training language representations
Confidence: 0.6874


---