### Imports

In [37]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import spacy
import swifter
import faiss
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration, DPRQuestionEncoderTokenizer
from datasets import load_dataset
from statistics import median
from datasets import Dataset, load_from_disk


nltk.download('popular')
spacy.cli.download("en_core_web_sm")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Michal\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\Michal\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\Michal\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\Michal\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\Michal\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_dat

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Load data

In [12]:
# Load the dataset
df = pd.read_csv('medium.csv')

## Preprocess data

In [13]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Process the text through spaCy NLP pipeline
    doc = nlp(text)
    processed_tokens = []
    
    for token in doc:
        # Preserve named entities as they are
        if token.ent_type_:
            processed_tokens.append(token.text)
        # Preserve nouns and certain POS tags, exclude stopwords and punctuation
        elif token.pos_ in ['NOUN', 'PROPN'] and not token.is_stop and not token.is_punct:
            processed_tokens.append(token.text)
        # Apply lemmatization and lowercasing to other tokens
        else:
            if not token.is_stop and not token.is_punct:
                processed_tokens.append(token.lemma_.lower())
    
    return ' '.join(processed_tokens)


## Chunking articles

In [14]:
def chunk_article(text, max_chunk_length=512):
    """
    Splits the article into manageable chunks, each not exceeding the specified maximum length.
    
    Args:
    text (str): The article text to be chunked.
    max_chunk_length (int): The maximum allowed length of each chunk.
    
    Returns:
    list: A list of text chunks.
    """
    # Split the article into paragraphs
    paragraphs = text.split('\n\n')
    
    # Further split into sentences if needed, based on the heuristic like length
    chunks = []
    for paragraph in paragraphs:
        sentences = nltk.sent_tokenize(paragraph)
        current_chunk = []
        current_length = 0
        
        for sentence in sentences:
            sentence_length = len(sentence)
            
            # Handle the case where a single sentence is longer than the max chunk length
            if sentence_length > max_chunk_length:
                if current_chunk:  # If the current chunk is not empty, add it to the chunks list
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_length = 0
                # Here you could further split the sentence or truncate it to fit the max length
                # For simplicity, we'll add the long sentence as its own chunk
                chunks.append(sentence)
                continue
            
            if current_length + sentence_length > max_chunk_length:
                # If this sentence would exceed the max length, add the current chunk first
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sentence_length
            else:
                current_chunk.append(sentence)
                current_length += sentence_length
        
        # After processing all sentences in a paragraph, add the remaining current chunk
        if current_chunk:
            chunks.append(' '.join(current_chunk))
    
    return chunks

### Analyze text distribution

In [15]:
# get text distribution to determine the max_chunk_length
def analyze_text_distribution(texts):
    sentence_lengths = []
    word_lengths = []
    
    for doc in nlp.pipe(texts, disable=["ner", "tagger"]):
        sentences = list(doc.sents)
        sentence_lengths.append(len(sentences))
        words_per_sentence = [len(sentence) for sentence in sentences]
        word_lengths.extend(words_per_sentence)
    
    return sentence_lengths, word_lengths

sentence_lengths, word_lengths = analyze_text_distribution(df['Text'])

print(f"Median Sentence Count: {median(sentence_lengths)}")
print(f"Median Word Count per Sentence: {median(word_lengths)}")



Median Sentence Count: 28
Median Word Count per Sentence: 20


In [44]:
MAX_CHUNK_LENGTH = 20 * 28  # ~28 sentences per chunk

### Apply preprocessing and chunking

In [17]:
# apply the chunk_article function to the DataFrame
df['processed_text'] = df['Text'].swifter.apply(preprocess_text)
df['original_chunks'] = df['Text'].swifter.apply(lambda text: chunk_article(text, MAX_CHUNK_LENGTH))
df['preprocessed_chunks'] = df['processed_text'].swifter.apply(lambda text: chunk_article(text, MAX_CHUNK_LENGTH))

Pandas Apply: 100%|██████████| 1391/1391 [06:45<00:00,  3.43it/s]
Pandas Apply: 100%|██████████| 1391/1391 [00:03<00:00, 348.04it/s]
Pandas Apply: 100%|██████████| 1391/1391 [00:01<00:00, 940.27it/s]


In [18]:
df.head()

Unnamed: 0,Title,Text,processed_text,original_chunks,preprocessed_chunks
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec\n\nWord2vec is one...,1 Introduction Word2vec \n\n Word2vec popular ...,"[1. Introduction of Word2vec, Word2vec is one ...","[1 Introduction Word2vec, Word2vec popular te..."
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o...",article introduce concept Graph Neural Network...,"[In my last article, I introduced the concept ...",[article introduce concept Graph Neural Networ...
2,How to Use ggplot2 in Python,Introduction\n\nThanks to its strict implement...,Introduction \n\n Thanks strict implementation...,"[Introduction, Thanks to its strict implementa...","[Introduction, Thanks strict implementation g..."
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash\...,Photo credit Mika Baumeister Unsplash \n\n wor...,[Photo credit to Mika Baumeister from Unsplash...,"[Photo credit Mika Baumeister Unsplash, work ..."
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...,Step Step Implementation Gradient Descent Back...,[A Step-by-Step Implementation of Gradient Des...,[Step Step Implementation Gradient Descent Bac...


## Embedding generation

In [19]:
# Load a pre-trained model
em_model = SentenceTransformer('all-MiniLM-L6-v2')

# Example: Generating embeddings for preprocessed chunks
df['embeddings'] = df['preprocessed_chunks'].swifter.apply(lambda chunks: np.array(em_model.encode(chunks)))

Pandas Apply: 100%|██████████| 1391/1391 [24:02<00:00,  1.04s/it]   


In [20]:
# Assuming `df['embeddings']` is where your embeddings are stored
sample_embedding = df['embeddings'].iloc[0]

print("Type of the embedding:", type(sample_embedding))
print("Shape of the embedding:", sample_embedding.shape)


Type of the embedding: <class 'numpy.ndarray'>
Shape of the embedding: (123, 384)


In [21]:
# Flatten embeddings if they are nested
all_embeddings = np.vstack(df['embeddings'].tolist())

# Check the new shape of all embeddings
print("New shape of all embeddings:", all_embeddings.shape)


New shape of all embeddings: (49227, 384)


In [22]:
print("Shape of all_embeddings:", all_embeddings.shape)
if np.isnan(all_embeddings).any():
    print("Warning: NaN values found in embeddings.")
else:
    print("Embeddings look good!")


Shape of all_embeddings: (49227, 384)
Embeddings look good!


In [23]:
df.head()

Unnamed: 0,Title,Text,processed_text,original_chunks,preprocessed_chunks,embeddings
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec\n\nWord2vec is one...,1 Introduction Word2vec \n\n Word2vec popular ...,"[1. Introduction of Word2vec, Word2vec is one ...","[1 Introduction Word2vec, Word2vec popular te...","[[-0.05949667, -0.064553834, 0.014287822, 0.03..."
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o...",article introduce concept Graph Neural Network...,"[In my last article, I introduced the concept ...",[article introduce concept Graph Neural Networ...,"[[-0.08080695, -0.051156927, -0.0053151087, -0..."
2,How to Use ggplot2 in Python,Introduction\n\nThanks to its strict implement...,Introduction \n\n Thanks strict implementation...,"[Introduction, Thanks to its strict implementa...","[Introduction, Thanks strict implementation g...","[[-0.045066487, 0.058423676, -0.023994647, 0.0..."
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash\...,Photo credit Mika Baumeister Unsplash \n\n wor...,[Photo credit to Mika Baumeister from Unsplash...,"[Photo credit Mika Baumeister Unsplash, work ...","[[-0.12282324, 0.079836145, 0.009691512, 0.071..."
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...,Step Step Implementation Gradient Descent Back...,[A Step-by-Step Implementation of Gradient Des...,[Step Step Implementation Gradient Descent Bac...,"[[-0.10204862, 0.010278484, -0.00064166903, -0..."


## FAISS indexing

In [24]:
d = 384  # Dimensionality of the embeddings
index = faiss.IndexFlatL2(d)  # Using L2 distance for the similarity measure

# Assuming `all_embeddings` is your numpy array of shape (49227, 384)
index.add(all_embeddings)  # Add your embeddings to the index

# Optionally, save the index to disk for later use
faiss.write_index(index, "index.faiss")


In [25]:
df.head()

Unnamed: 0,Title,Text,processed_text,original_chunks,preprocessed_chunks,embeddings
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec\n\nWord2vec is one...,1 Introduction Word2vec \n\n Word2vec popular ...,"[1. Introduction of Word2vec, Word2vec is one ...","[1 Introduction Word2vec, Word2vec popular te...","[[-0.05949667, -0.064553834, 0.014287822, 0.03..."
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o...",article introduce concept Graph Neural Network...,"[In my last article, I introduced the concept ...",[article introduce concept Graph Neural Networ...,"[[-0.08080695, -0.051156927, -0.0053151087, -0..."
2,How to Use ggplot2 in Python,Introduction\n\nThanks to its strict implement...,Introduction \n\n Thanks strict implementation...,"[Introduction, Thanks to its strict implementa...","[Introduction, Thanks strict implementation g...","[[-0.045066487, 0.058423676, -0.023994647, 0.0..."
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash\...,Photo credit Mika Baumeister Unsplash \n\n wor...,[Photo credit to Mika Baumeister from Unsplash...,"[Photo credit Mika Baumeister Unsplash, work ...","[[-0.12282324, 0.079836145, 0.009691512, 0.071..."
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...,Step Step Implementation Gradient Descent Back...,[A Step-by-Step Implementation of Gradient Des...,[Step Step Implementation Gradient Descent Bac...,"[[-0.10204862, 0.010278484, -0.00064166903, -0..."


## Create dataset

In [27]:
# Prepare a list to hold your new rows including flattened chunks and embeddings
new_rows = []

# Iterate over each row in your original DataFrame
for _, row in df.iterrows():
    for chunk, embedding in zip(row['original_chunks'], row['embeddings']):
        # Create a new row for each chunk, including the title, chunk text (as 'text'), and embedding
        new_rows.append({'title': row['Title'], 'text': chunk, 'embeddings': embedding})

# Convert the list of new rows into a DataFrame
flattened_df = pd.DataFrame(new_rows)

# Convert the flattened DataFrame into a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(flattened_df)

# Optionally, save this dataset to disk for later use or inspection
hf_dataset.save_to_disk('./dataset')


Saving the dataset (1/1 shards): 100%|██████████| 49227/49227 [00:00<00:00, 666566.60 examples/s]


In [28]:
flattened_df.head()

Unnamed: 0,title,text,embeddings
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec,"[-0.05949667, -0.064553834, 0.014287822, 0.031..."
1,A Beginner’s Guide to Word Embedding with Gens...,Word2vec is one of the most popular technique ...,"[-0.012309051, -0.12891136, -0.02354605, 0.033..."
2,A Beginner’s Guide to Word Embedding with Gens...,Word embedding via word2vec can make natural l...,"[-0.010934148, -0.051672615, -0.05682393, 0.00..."
3,A Beginner’s Guide to Word Embedding with Gens...,"For instance, the words women, men, and human ...","[-0.09067625, 0.012690775, -0.06001801, -0.014..."
4,A Beginner’s Guide to Word Embedding with Gens...,There are two main training algorithms for wor...,"[-0.008359772, -0.013869865, -0.01232508, 0.00..."


In [29]:
# Save the DataFrame to a CSV file for RAG's use
flattened_df.to_csv('rag_dataset.csv', index=False)

In [30]:
# Convert the DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(flattened_df)

# Save the dataset to disk for RAG to use
dataset.save_to_disk("./dataset")

Saving the dataset (1/1 shards): 100%|██████████| 49227/49227 [00:00<00:00, 769413.95 examples/s]


## Instantiate the model

In [31]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")

# Initialize the question tokenizer
dpr_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# Initialize the retriever
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_name="custom",
    passages_path="./dataset",
    index_path="index.faiss"
)

# Initialize the RAG model
model = RagSequenceForGeneration.from_pretrained(
    "facebook/rag-token-nq",
    retriever=retriever
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

## Retrieval function

In [35]:
# Make sure to load your spaCy model
nlp = spacy.load("en_core_web_sm")

# Your preprocessing function
def preprocess_text(text):
    doc = nlp(text)
    processed_tokens = []
    for token in doc:
        if token.ent_type_:
            processed_tokens.append(token.text)
        elif token.pos_ in ['NOUN', 'PROPN'] and not token.is_stop and not token.is_punct:
            processed_tokens.append(token.text)
        else:
            if not token.is_stop and not token.is_punct:
                processed_tokens.append(token.lemma_.lower())
    return ' '.join(processed_tokens)

# Your embedding model
em_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to preprocess and embed the query
def get_query_embedding(query):
    processed_query = preprocess_text(query)
    return em_model.encode([processed_query])[0]  # Encode returns a list of embeddings

# Load your FAISS index
faiss_index = faiss.read_index("index.faiss")

# Load your Hugging Face dataset
hf_dataset = load_from_disk("./dataset")

# Function to search the index with the query embedding
def search(query, k=5):
    query_embedding = get_query_embedding(query)
    distances, indices = faiss_index.search(np.array([query_embedding]).astype("float32"), k)
    return distances, indices

# Function to retrieve chunks from the dataset
def get_retrieved_chunks(indices):
    return [hf_dataset[int(idx)] for idx in indices[0]]

In [42]:
def display_results(query, num_results=5, combine_chunks=True, max_combined_length=1024):
    distances, indices = search(query, k=num_results)
    retrieved_data = get_retrieved_chunks(indices)
    
    print(f"Query: {query}\n")
    for i, data in enumerate(retrieved_data):
        combined_text = data['text']
        next_idx = i + 1
        
        # Combine chunks from the same article if they are consecutive
        while combine_chunks and next_idx < len(retrieved_data) and len(combined_text) < max_combined_length:
            if retrieved_data[next_idx]['title'] == data['title']:
                combined_text += ' ' + retrieved_data[next_idx]['text']
                next_idx += 1
            else:
                break
        
        print(f"Result {i+1}: (Score: {1 - distances[0][i]:.4f})")
        print(f"Title: {data['title']}\nText: {combined_text}\n")
        if next_idx > i + 1:
            break  # Break the loop if we have combined chunks so we don't print the same chunks again

# Using the model

In [43]:
# Example usage
query = "Machine Learning"
number_of_articles_to_retrieve = 3
display_results(query, number_of_articles_to_retrieve, combine_chunks=True, max_combined_length=1024)

Query: Machine Learning

Result 1: (Score: 1.0000)
Title: On the Journey to Machine Learning / AI
Text: What is Machine Learning?

Result 2: (Score: 1.0000)
Title: So what is Machine Learning?
Text: You can easily find many popular use-cases of Machine Learning. I am sure you check Amazon for when you need to buy new clothes or shoes. And then you see a list of recommended items for you. This is, in fact, machine learning at play.

Result 3: (Score: 1.0000)
Title: Practical Machine Learning with C++ and GRT
Text: What is machine learning?



# Debugging

In [45]:
import nbformat

# Load the notebook
notebook_path = 'solution.ipynb'
with open(notebook_path, 'r', encoding='utf-8') as nb_file:
    nb_content = nbformat.read(nb_file, as_version=4)

# Extract and print the code cells' content for review
for cell in nb_content['cells']:
    if cell['cell_type'] == 'code':
        print(cell['source'])

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import spacy
import swifter
import faiss
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration, DPRQuestionEncoderTokenizer
from datasets import load_dataset
from statistics import median
from datasets import Dataset, load_from_disk


nltk.download('popular')
spacy.cli.download("en_core_web_sm")
# Load the dataset
df = pd.read_csv('medium.csv')
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Process the text through spaCy NLP pipeline
    doc = nlp(text)
    processed_tokens = []
    
    for token in doc:
        # Preserve named entities as they are
        if token.ent_type_:
            processed_tokens.append(token.text)
        # Preserve nouns and certain POS tags, exclude stopwords and punctuation
        elif token.pos_ in ['NOUN', 'PROPN'] and not token.is_st