In [1]:
import torch
import pandas as pd
import numpy as np

#### Text preprocessing

In [2]:
data = pd.read_csv("../data/medium.csv")
data.head()

Unnamed: 0,Title,Text
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec\n\nWord2vec is one...
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o..."
2,How to Use ggplot2 in Python,Introduction\n\nThanks to its strict implement...
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash\...
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...


In [3]:
def preprocess_text(data: pd.DataFrame) -> pd.DataFrame:
    data["Text"] = data["Text"].str.replace("\n", " ")
    data["Text"] = data["Text"].str.replace("\t", " ")
    data["Text"] = data["Text"].str.replace("\r", " ")
    data["Text"] = data["Text"].str.replace(r"\s+", " ", regex=True)
    data["Text"] = data["Text"].str.strip()

    return data


def sample_random_texts(data: pd.DataFrame, n=5) -> None:
    for text in data.sample(n)["Text"].values:
        print(text)
        print("\n\n")
        
        

In [4]:
data = preprocess_text(data)
data.head()

Unnamed: 0,Title,Text
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec Word2vec is one of...
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o..."
2,How to Use ggplot2 in Python,Introduction Thanks to its strict implementati...
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash ...
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...


#### Feature engineering for text data EDA

In [5]:
from spacy.lang.en import English
from tqdm import tqdm
tqdm.pandas()

nlp = English()
nlp.add_pipe("sentencizer")

data["text_words_num"] = data["Text"].apply(lambda x: len(x.split()))
data["text_length"] = data["Text"].apply(lambda x: len(x))
data["text_sentences_num"] = data["Text"].progress_apply(lambda x: len(list(nlp(x).sents)))
data["token_count"] = data["Text"].apply(lambda x: len(x) / 4)
data.head()

100%|██████████| 1391/1391 [00:03<00:00, 398.46it/s]


Unnamed: 0,Title,Text,text_words_num,text_length,text_sentences_num,token_count
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec Word2vec is one of...,1489,10432,64,2608.0
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o...",139,827,7,206.75
2,How to Use ggplot2 in Python,Introduction Thanks to its strict implementati...,953,5632,45,1408.0
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash ...,280,1776,16,444.0
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...,737,4744,28,1186.0


In [6]:
data.describe().round(2)

Unnamed: 0,text_words_num,text_length,text_sentences_num,token_count
count,1391.0,1391.0,1391.0,1391.0
mean,901.54,5530.81,44.46,1382.7
std,885.73,5521.48,44.17,1380.37
min,49.0,249.0,2.0,62.25
25%,315.0,1886.0,15.0,471.5
50%,516.0,3040.0,26.0,760.0
75%,1227.0,7577.5,60.0,1894.38
max,7657.0,46966.0,376.0,11741.5


#### Chunking articles into smaller sentences

In [7]:
for _, row in data.sample(5).iterrows():
    doc = nlp(row["Text"])
    sents = [sent.text for sent in doc.sents]
    print(sents)
    print("\n\n")

['Introduction Have you ever tried to use a movie recommender?', 'In theory, it is something useful that can help figure out what to watch next instead of browsing through Netflix for a few hours, but their results tend to be hit-or-miss.', 'This is a problem that most people can relate to, so I decided to create a homemade recommender system myself and share it in this blog post.', 'I will show you how to create 3 simple recommender models from scratch that accept a movie as input and return the “n” most similar movies as output, with “n” being provided by the user.', 'In general, recommender systems are either content-based or collaborative with the user’s history and interests.', 'I chose to create content-based models since they make predictions based on the specific input item (movie) and not based on the user.', 'Note that the recommendations for this blog are based entirely off of movie keywords.', 'As you will see, it is fairly simple to use other text information, such as plot

In [8]:
n_sentences_in_chunk = 10
CONTEXT_WINDOW = 384
TOKEN_SIZE = 4


def split_text_into_chunks(text: str, n_sentences_in_chunk: int, overlap=0) -> list:
    doc = nlp(text)
    sents = [sent.text for sent in doc.sents]
    chunks = []
    n_sentences_in_chunk = n_sentences_in_chunk - overlap
    remainder = 0
    for i in range(overlap, len(sents), n_sentences_in_chunk):
        if remainder == n_sentences_in_chunk:
            remainder = 0
        chunk = sents[i-overlap-remainder:i + n_sentences_in_chunk-remainder]
        remainder = 0
        while len(" ".join(chunk)) / TOKEN_SIZE > CONTEXT_WINDOW:
            remainder += 1
            chunk = chunk[:-remainder]
            
        chunk = " ".join(chunk)
        chunks.append(chunk)
    return chunks

In [9]:
dict_data = data.to_dict("records")  

for elem in tqdm(dict_data):
    elem["chunks"] = split_text_into_chunks(elem["Text"], n_sentences_in_chunk, overlap=1)

100%|██████████| 1391/1391 [00:02<00:00, 500.85it/s]


#### Example of chunking

In [10]:
for chunk in dict_data[0]["chunks"]:
    print(len(chunk) / 4)
    print(chunk)
    print("\n\n")

285.0
1. Introduction of Word2vec Word2vec is one of the most popular technique to learn word embeddings using a two-layer neural network. Its input is a text corpus and its output is a set of vectors. Word embedding via word2vec can make natural language computer-readable, then further implementation of mathematical operations on words can be used to detect their similarities. A well-trained set of word vectors will place similar words close to each other in that space. For instance, the words women, men, and human might cluster in one corner, while yellow, red and blue cluster together in another. There are two main training algorithms for word2vec, one is the continuous bag of words(CBOW), another is called skip-gram. The major difference between these two methods is that CBOW is using context to predict a target word while skip-gram is using a word to predict a target context. Generally, the skip-gram method can have a better performance compared with CBOW method, for it can captur

#### Creating a dataset with chunks

In [11]:
import re

chunks = pd.DataFrame(columns=["Title", "Chunk", "Chunk_length", "Chunk_words_num", "Chunk_sentences_num", "Token_count"])

for d in tqdm(dict_data):
    for chunk in d["chunks"]:
        chunk = chunk.strip()
        chunk = re.sub(r"\s+", " ", chunk)
        row = pd.DataFrame.from_dict(
            {
                "Title": d["Title"],
                "Chunk": chunk,
                "Chunk_length": len(chunk),
                "Chunk_words_num": len(chunk.split()),
                "Chunk_sentences_num": len(list(nlp(chunk).sents)),
                "Token_count": len(chunk) / 4
            },
            orient="index"
        )
        row = row.T
        chunks = pd.concat([chunks, row], axis=0)
        
numeric = ["Chunk_length", "Chunk_words_num", "Chunk_sentences_num", "Token_count"]
chunks[numeric] = chunks[numeric].astype(float)

chunks.head()

100%|██████████| 1391/1391 [00:03<00:00, 350.90it/s]


Unnamed: 0,Title,Chunk,Chunk_length,Chunk_words_num,Chunk_sentences_num,Token_count
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec Word2vec is one of...,1140.0,188.0,10.0,285.0
0,A Beginner’s Guide to Word Embedding with Gens...,"For instance, it will have two vector represen...",1174.0,194.0,10.0,293.5
0,A Beginner’s Guide to Word Embedding with Gens...,"pip install --upgrade gensim Or, alternatively...",1288.0,215.0,10.0,322.0
0,A Beginner’s Guide to Word Embedding with Gens...,"To be more specific, each make model is contai...",1364.0,204.0,4.0,341.0
0,A Beginner’s Guide to Word Embedding with Gens...,window: The maximum distance between a target ...,1438.0,185.0,10.0,359.5


In [12]:
chunks.sample(1)

Unnamed: 0,Title,Chunk,Chunk_length,Chunk_words_num,Chunk_sentences_num,Token_count
0,Data analytics with MODIS data,Variation of Combined AOD and Cloud Fraction o...,1085.0,186.0,10.0,271.25


In [13]:
chunks.describe().astype(float).round(2)

Unnamed: 0,Chunk_length,Chunk_words_num,Chunk_sentences_num,Token_count
count,7334.0,7334.0,7334.0,7334.0
mean,1052.25,173.26,8.8,263.06
std,331.35,54.58,2.16,82.84
min,0.0,0.0,0.0,0.0
25%,865.0,144.0,9.0,216.25
50%,1100.0,182.0,10.0,275.0
75%,1313.0,213.0,10.0,328.25
max,1536.0,291.0,10.0,384.0


In [14]:
chunks = chunks[chunks["Token_count"] > 0]

In [15]:
chunks.describe()

Unnamed: 0,Chunk_length,Chunk_words_num,Chunk_sentences_num,Token_count
count,7292.0,7292.0,7292.0,7292.0
mean,1058.311437,174.255623,8.854361,264.577859
std,322.506653,53.123813,2.061306,80.626663
min,9.0,2.0,1.0,2.25
25%,871.0,145.0,9.0,217.75
50%,1101.0,182.0,10.0,275.25
75%,1314.0,213.25,10.0,328.5
max,1536.0,291.0,10.0,384.0


#### Printing some chunks with low token count

In [16]:
MIN_TOKENS = 50

for ch in chunks[chunks["Token_count"] < 50].sample(5)["Chunk"].values:
    print(ch)
    print("\n\n")



If your course if making some of these mistakes you might be able to help save your education by encouraging some better practices.) Part 1 — Why Learning…



That day is coming. Trust me, I’m human. This post was adapted from my book: Trust Me, I’m A Bot — Building Digital Trust Using Conversational AI



After this, you just have to feed them and wait for finishing the learning. Thanks for reading, see you in the next article. — — — — — References



Also, follow me on Twitter and LinkedIn. Cheers!



This is sometimes called the ‘Bell Curve’ or the ‘Gaussian Curve’. A simple way to do this is to determine the normality of each variable separately using the Shapiro-Wilk Test.



As we can see most of these short sentences come from the end of the articles. We can safely remove them from the dataset as they bring little information to the table.

In [17]:
print(chunks.shape)
chunks = chunks[chunks["Token_count"] > MIN_TOKENS]
print(chunks.shape)

(7292, 6)
(7185, 6)


#### Embedding the chunks

Model test

In [18]:
from sentence_transformers import SentenceTransformer
import numpy as np

embeddings_model = SentenceTransformer("all-mpnet-base-v2")

sentences = ["This is a sample sentence", "I like to eat apples"]
embeddings = embeddings_model.encode(sentences)
cos_sim = np.dot(embeddings[0], embeddings[1]) / (np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]))
cos_sim

0.1260562

#### Enabling CUDA if possible

In [19]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(device)

mps


In [20]:
embeddings_model.to(device)
chunks_dict = chunks.to_dict("records")

for chunk in tqdm(chunks_dict):
    embedding = embeddings_model.encode(chunk["Chunk"])
    chunk["Embedding"] = embedding

100%|██████████| 7185/7185 [13:08<00:00,  9.11it/s]   


In [21]:
chunks = pd.DataFrame(chunks_dict)
chunks

Unnamed: 0,Title,Chunk,Chunk_length,Chunk_words_num,Chunk_sentences_num,Token_count,Embedding
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec Word2vec is one of...,1140.0,188.0,10.0,285.00,"[0.040338237, 0.012804469, -0.006745447, 0.042..."
1,A Beginner’s Guide to Word Embedding with Gens...,"For instance, it will have two vector represen...",1174.0,194.0,10.0,293.50,"[0.05746932, 0.0059202667, -0.01848462, 0.0432..."
2,A Beginner’s Guide to Word Embedding with Gens...,"pip install --upgrade gensim Or, alternatively...",1288.0,215.0,10.0,322.00,"[-0.008038099, 0.018741773, -0.010320838, 0.06..."
3,A Beginner’s Guide to Word Embedding with Gens...,"To be more specific, each make model is contai...",1364.0,204.0,4.0,341.00,"[0.028384276, 0.01910937, -0.023285013, 0.0194..."
4,A Beginner’s Guide to Word Embedding with Gens...,window: The maximum distance between a target ...,1438.0,185.0,10.0,359.50,"[0.028748535, -0.047673915, 0.012459073, 0.064..."
...,...,...,...,...,...,...,...
7180,Primer on The Importance of Mindful Data Colle...,"This is true of any research institution, even...",1387.0,227.0,10.0,346.75,"[0.04206838, 0.033875473, -0.034986444, -0.038..."
7181,Primer on The Importance of Mindful Data Colle...,The issues of differing standards can ultimate...,1435.0,226.0,9.0,358.75,"[0.03998078, 0.11374257, -0.031096485, -0.0056..."
7182,Primer on The Importance of Mindful Data Colle...,The above excerpt comes from the abstract to t...,915.0,153.0,10.0,228.75,"[0.04979147, 0.06545234, -0.033590827, -0.0121..."
7183,Primer on The Importance of Mindful Data Colle...,Show me your final data. This is all extremely...,981.0,167.0,10.0,245.25,"[0.01828431, 0.087652445, -0.06963481, 0.00219..."


In [22]:
chunks.to_csv("../data/chunks_embedded.csv", index=False)

In [23]:
text_chunks = [chunk["Chunk"] for chunk in chunks_dict]
text_chunks[0]

'1. Introduction of Word2vec Word2vec is one of the most popular technique to learn word embeddings using a two-layer neural network. Its input is a text corpus and its output is a set of vectors. Word embedding via word2vec can make natural language computer-readable, then further implementation of mathematical operations on words can be used to detect their similarities. A well-trained set of word vectors will place similar words close to each other in that space. For instance, the words women, men, and human might cluster in one corner, while yellow, red and blue cluster together in another. There are two main training algorithms for word2vec, one is the continuous bag of words(CBOW), another is called skip-gram. The major difference between these two methods is that CBOW is using context to predict a target word while skip-gram is using a word to predict a target context. Generally, the skip-gram method can have a better performance compared with CBOW method, for it can capture two

In [24]:
text_embeddings = embeddings_model.encode(text_chunks,
                                          batch_size=32,
                                          show_progress_bar=True,
                                          convert_to_tensor=True)
text_embeddings

Batches:   0%|          | 0/225 [00:00<?, ?it/s]

tensor([[ 0.0403,  0.0128, -0.0067,  ..., -0.0049, -0.0416, -0.0356],
        [ 0.0575,  0.0059, -0.0185,  ..., -0.0489, -0.0511, -0.0437],
        [-0.0080,  0.0187, -0.0103,  ..., -0.0336, -0.0822, -0.0343],
        ...,
        [ 0.0498,  0.0655, -0.0336,  ..., -0.0047, -0.0249, -0.0174],
        [ 0.0183,  0.0877, -0.0696,  ...,  0.0583,  0.0100,  0.0260],
        [-0.0078,  0.1351, -0.0372,  ...,  0.0125, -0.0042,  0.0256]],
       device='mps:0')

In [25]:
torch.save(text_embeddings, f"../data/text_embeddings_{device}.pt")

### RAG pipeline 
(Checkpoint 1) - work above is saved in the data folder

In [26]:
import numpy as np
import pandas as pd
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

text_chunks_df = pd.read_csv("../data/chunks_embedded.csv")
text_chunks_dict = text_chunks_df.to_dict("records")

text_embeddings = torch.load(f"../data/text_embeddings_{device}.pt")
text_embeddings[0]

tensor([ 4.0338e-02,  1.2804e-02, -6.7454e-03,  4.2980e-02, -4.3603e-02,
         1.6660e-02, -3.8313e-02,  3.4725e-02, -4.9667e-03, -3.8526e-02,
         4.6374e-02, -5.1384e-02,  4.2376e-02,  1.3069e-02,  4.0606e-02,
        -1.1125e-02,  7.3396e-02,  3.7244e-02, -5.2825e-02, -3.8347e-03,
         1.3996e-02,  1.2569e-02, -1.7039e-03,  4.6136e-02, -3.2836e-02,
         1.1327e-02,  2.5366e-02, -2.0369e-02,  2.4939e-03, -1.9886e-03,
         2.1593e-02, -2.6125e-03,  8.8577e-03,  7.1654e-03,  1.8332e-06,
        -2.1929e-02, -2.9749e-02, -1.1231e-03,  4.3468e-02, -1.9777e-02,
         6.6936e-02, -1.2584e-02,  4.0857e-03,  2.1768e-02, -3.9587e-02,
        -3.9380e-02,  6.4695e-02,  8.3795e-02,  2.7839e-02,  2.4199e-02,
        -2.0639e-02, -8.7842e-02,  7.2766e-05, -1.5067e-02, -8.0752e-03,
         5.4452e-03,  4.4952e-02, -2.2253e-02, -5.6358e-03,  5.0621e-03,
        -4.8909e-02,  1.9777e-02, -4.4482e-02,  5.6928e-03,  4.2703e-02,
         7.4282e-02, -5.1439e-02, -3.6327e-02,  1.5

#### Query embedding

In [27]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-mpnet-base-v2", device=device)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

#### Retrieval indexing

In [28]:
# query 
query = "What is machine learning?"

# embedding
query_embedding = model.encode(query, convert_to_tensor=True, device=device)

# similarity - dot because of a Normalize layer in the model
dot_products = util.dot_score(query_embedding, text_embeddings)

# best results
torch.topk(dot_products, k=5)

torch.return_types.topk(
values=tensor([[0.7937, 0.7876, 0.7609, 0.7162, 0.6967]], device='mps:0'),
indices=tensor([[1032,  393, 1982, 6746, 3390]], device='mps:0'))

In [29]:
text_chunks_df.iloc[torch.topk(dot_products, k=5).indices.cpu().numpy().ravel()]

Unnamed: 0,Title,Chunk,Chunk_length,Chunk_words_num,Chunk_sentences_num,Token_count,Embedding
1032,On the Journey to Machine Learning / AI,What is Machine Learning? There are millions o...,953.0,159.0,10.0,238.25,[ 2.16269530e-02 -2.55265869e-02 -6.68139756e-...
393,Microsoft Introduction to AI — Part 1,Learning from this data it can understand our ...,792.0,131.0,10.0,198.0,[ 2.41031330e-02 -5.80267282e-03 -4.96849753e-...
1982,So what is Machine Learning?,Photo by fabio on Unsplash I am sure by now yo...,1194.0,215.0,10.0,298.5,[ 3.63377593e-02 2.96800584e-02 -4.95566875e-...
6746,Why Machine Learning?,Image by the author In my previous post I talk...,1104.0,181.0,10.0,276.0,[ 3.04092318e-02 5.43406904e-02 -4.28669490e-...
3390,Machine Learning in Energy,What is machine learning? The business plans o...,1206.0,200.0,10.0,301.5,[ 6.98239403e-03 6.78827018e-02 -6.71653450e-...


In [30]:
text_embeddings.shape

torch.Size([7185, 768])

#### Helper functions for retrieval and printing

In [31]:
import textwrap


def retrieve_simillar_embeddings(query, embeddings, model=model, device=device, n=5):
    query_embedding = model.encode(query, convert_to_tensor=True, device=device)
    dot_products = util.dot_score(query_embedding, embeddings)
    top_results = torch.topk(dot_products, k=n)
    
    indices, scores = top_results
    
    return indices, scores


def search_text(query, embeddings, model=model, device=device, n=5):
    indices, scores = retrieve_simillar_embeddings(query, embeddings, model, device, n)
    indices = indices.cpu().numpy().ravel()
    scores = scores.cpu().numpy().ravel()
    
    results = text_chunks_df.iloc[indices]
    chunks = results["Chunk"].values
    titles = results["Title"].values
    
    return chunks, titles, scores
    
    
def print_text(chunks, titles, scores, query, width=100):
    print("Query:", query)
    print("=======")
    wrapper = textwrap.TextWrapper(width=width)

    for chunk, title, score in zip(chunks, titles, scores):
        print(f"Article title: {title} || Score: {score}\n")
        word_list = wrapper.wrap(text=chunk)
        for element in word_list:
            print(element)
        print("\n")
        print("=======")
        
query = "What is a convolutional neural net?"

chunks, titles, scores = search_text(query, text_embeddings)

print_text(chunks, titles, scores, query)


Query: What is a convolutional neural net?
Article title: A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model || Score: 2903

1. Introduction of Word2vec Word2vec is one of the most popular technique to learn word embeddings
using a two-layer neural network. Its input is a text corpus and its output is a set of vectors.
Word embedding via word2vec can make natural language computer-readable, then further implementation
of mathematical operations on words can be used to detect their similarities. A well-trained set of
word vectors will place similar words close to each other in that space. For instance, the words
women, men, and human might cluster in one corner, while yellow, red and blue cluster together in
another. There are two main training algorithms for word2vec, one is the continuous bag of
words(CBOW), another is called skip-gram. The major difference between these two methods is that
CBOW is using context to predict a target word while skip-gram is using a word to 

In [32]:
retrieve_simillar_embeddings("What is a convolutional neural net?", text_embeddings)

(tensor([[0.7434, 0.6638, 0.6285, 0.6077, 0.6074]], device='mps:0'),
 tensor([[2903, 6250, 7111, 2904,  691]], device='mps:0'))

### Local LLM generation

In [33]:
# model https://huggingface.co/google/gemma-2b-it
# huggingface-cli https://huggingface.co/docs/huggingface_hub/main/en/guides/cli

In [34]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# using 7b is possible if 24gb of VRAM on a GPU is available
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
llm = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it",
                                           attn_implementation="sdpa",
                                           torch_dtype=torch.float16,
                                           low_cpu_mem_usage=False)

llm.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRM

In [35]:
model.device

device(type='mps', index=0)

In [36]:
def print_model_data(model, device):
    if device == torch.device("mps"):
        print(f"Used memory (MPS GPU): {(torch.mps.current_allocated_memory() / 1024 ** 3):.2f} GB", )
    print(f"model size: {(sum([p.numel() for p in model.parameters()]) / 1e9):.2f}B")
    
    mem_params = sum([p.nelement() * p.element_size() for p in model.parameters()])
    mem_buffers = sum([b.nelement() * b.element_size() for b in model.buffers()])
    
    print(f"Model memory: {(mem_params + mem_buffers) / 1e9:.2f} GB")
    print(f"Model parameters: {mem_params + mem_buffers} bytes")


print_model_data(llm, device)

Used memory (MPS GPU): 6.31 GB
model size: 2.51B
Model memory: 5.55 GB
Model parameters: 5549215744 bytes


#### Generation

In [37]:
input_text = "What is a convolutional neural net?"
print(input_text)

llm_prompt_template = [{
    "role": "user",
    "content": input_text,
}]

prompt = tokenizer.apply_chat_template(llm_prompt_template, tokenize=False, add_generation_prompt=True)
prompt

What is a convolutional neural net?


'<bos><start_of_turn>user\nWhat is a convolutional neural net?<end_of_turn>\n<start_of_turn>model\n'

In [38]:
tokenized_prompt = tokenizer(prompt, return_tensors="pt").to(device)

output = llm.generate(**tokenized_prompt, max_new_tokens=256)
print(tokenizer.decode(output[0], skip_special_tokens=True))

KeyboardInterrupt: 

The output from the model looks fine, let's prompt engineer it

In [None]:
questions = [
    "What is the difference between supervised and unsupervised learning in machine learning?",
    "Can you explain the bias-variance tradeoff in machine learning?",
    "What are some common activation functions used in neural networks?",
    "What is the purpose of regularization in machine learning models?",
    "How does gradient descent optimize the parameters of a machine learning model?",
    "What is cross-validation and why is it used in machine learning?",
    "Explain the concept of feature engineering in machine learning.",
    "What is the role of hyperparameters in machine learning algorithms?"
]


def create_prompt(query, context_chunks):
    query_start = "Answer the question: " + query
    answer_requirements = """
Give yourself room to think by extracting relevant passages from the context before answering.
Return just the answer to the question.
Make sure the answer is as explanatory as possible.
Use the following reference questions and answers as a style guideline but answer only to the question above:
1. What is overfitting in machine learning?
   - Overfitting occurs when a model learns to memorize the training data instead of capturing the underlying patterns, leading to poor generalization on unseen data.

2. What is the purpose of a validation set in machine learning?
   - The validation set is used to evaluate the performance of a model during training and to tune hyperparameters to prevent overfitting.

3. What is the difference between precision and recall in binary classification?
   - Precision measures the proportion of true positives among all predicted positives, while recall measures the proportion of true positives among all actual positives.

4. What is the softmax function used for in neural networks?
   - The softmax function is used to convert the raw output of a neural network into probabilities, enabling it to make multi-class predictions.

5. What is transfer learning in deep learning?
   - Transfer learning involves using pre-trained neural network models as a starting point for training on a new task, often resulting in faster convergence and better performance with less data.

6. What is batch normalization in neural networks?
   - Batch normalization is a technique used to normalize the inputs of each layer in a neural network, stabilizing training and accelerating convergence.

7. What is the purpose of the Adam optimizer in deep learning?
   - The Adam optimizer is an adaptive learning rate optimization algorithm that combines the advantages of both AdaGrad and RMSProp, making it widely used in training deep neural networks.

8. What is the curse of dimensionality in machine learning?
   - The curse of dimensionality refers to the increased difficulty of learning and generalizing from data in high-dimensional spaces, leading to sparsity and increased computational complexity.
    """

    query_end = "Based on the following context."
    context = "- " + "\n- ".join([chunk for chunk in context_chunks])

    prompt = "\n".join([query_start, answer_requirements, context, query_end])

    llm_prompt_template = [{
    "role": "user",
    "content": prompt,
    }]

    prompt = tokenizer.apply_chat_template(llm_prompt_template,
                                           tokenize=False,
                                           add_generation_prompt=True)

    return prompt

#### Test the prompt generation

In [None]:
query_idx = np.random.randint(0, len(questions))
query = questions[query_idx]

chunks, titles, scores = search_text(query, text_embeddings)
prompt = create_prompt(query, chunks)

print(prompt)

In [None]:
llm_prompt_template = [{
    "role": "user",
    "content": prompt,
}]

tokenized_prompt = tokenizer(prompt, return_tensors="pt").to(device)

output = llm.generate(**tokenized_prompt, max_new_tokens=256, temperature=0.7, do_sample=True)
print(tokenizer.decode(output[0], skip_special_tokens=True))