### Embeddings using SentenceTransformer

In [1]:
from sentence_transformers import SentenceTransformer

st_model = SentenceTransformer('bert-base-nli-mean-tokens')

  return self.fget.__get__(instance, owner)()


In [2]:
state_of_the_union_path = '/Users/shivramamurthi/models/state_of_the_union.txt' 

In [3]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader(state_of_the_union_path)
unchunked_docs = loader.load()

In [4]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=0)
chunked_docs = text_splitter.split_documents(unchunked_docs)
sentences_txt = [ doc.page_content for doc in chunked_docs ]
print(len(sentences_txt))
sentences_txt[0]

88


'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.'

#### 0.6 s to encode 88 sentences

In [5]:
st_embeddings = st_model.encode(sentences_txt)
print(st_embeddings.shape)
print(sentences_txt[0], st_embeddings[0])

(88, 768)
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  

Last year COVID-19 kept us apart. This year we are finally together again. 

Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 

With a duty to one another to the American people to the Constitution. 

And with an unwavering resolve that freedom will always triumph over tyranny. [-5.95337510e-01  3.82333487e-01  3.65825891e-01  2.53147274e-01
  2.29489878e-01 -9.63458002e-01  6.49914980e-01 -5.62279046e-01
  3.65178548e-02 -7.56203771e-01  4.52899843e-01  9.13165212e-01
  1.33935973e-01  3.49942416e-01 -8.26504230e-01  5.22772014e-01
  5.52295260e-02 -1.10473216e-01  6.39623553e-02 -2.17773944e-01
 -1.99016690e-01  1.47524565e-01  4.18507278e-01  4.94993001e-01
  9.14852321e-01  6.15755975e-01 -3.44736949e-02 -5.21782458e-01
 -6.09696567e-01  7.11840391e-01 -1.42915547

### Embeddings using local llama2

In [6]:
from langchain_community.embeddings import LlamaCppEmbeddings

llama_model_path='/Users/shivramamurthi/src/llama.cpp/models/llama-2-7b/ggml-model-q4_0.bin'

embeddings = LlamaCppEmbeddings(
    model_path=llama_model_path
)

llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from /Users/shivramamurthi/src/llama.cpp/models/llama-2-7b/ggml-model-q4_0.bin (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_gate.

#### 28s to embed 88 sentences

In [7]:
llama2_embeddings = embeddings.embed_documents(sentences_txt)
len(llama2_embeddings)


llama_print_timings:        load time =     386.48 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    5038.28 ms /   124 tokens (   40.63 ms per token,    24.61 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    5054.32 ms

llama_print_timings:        load time =     386.48 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    4440.11 ms /   112 tokens (   39.64 ms per token,    25.22 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    4453.58 ms

llama_print_timings:        load time =     386.48 ms
l

KeyboardInterrupt: 

In [None]:
print(sentences_txt[0], llama2_embeddings[0])

Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  

Last year COVID-19 kept us apart. This year we are finally together again. 

Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 

With a duty to one another to the American people to the Constitution. 

And with an unwavering resolve that freedom will always triumph over tyranny. [0.0023242115053150585, 0.0002475537968620424, 0.011010420782432263, 0.000868643902896479, -0.007935231340261754, -0.006551973387163167, 0.026386503159583324, -0.01882366731567656, -0.012369685174550681, -0.007982018012675948, -0.011451162901610686, -0.010787027305656689, 0.008323412902447988, 0.00723071928731067, -0.008138714636107186, 0.010411274870956463, -0.01841826593737851, 0.005382793545801603, -0.0037687783300482287, -0.01778974388369275, 0.04035385768573307, -0.013299394773183962, -0.01636081144

#### Computing similarities

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import SentenceTransformerEmbeddings

In [4]:
embeddings = SentenceTransformerEmbeddings(model_name='all-mpnet-base-v2')

##### Without any context `bank` is not similar to `river`

In [5]:
e1 = embeddings.embed_query('bank')
e2 = embeddings.embed_query('river')

cosine_similarity([e1], [e2])

array([[0.35636227]])

##### With some context `bank of a river` similar to `river`

In [7]:
e1 = embeddings.embed_query('bank of a river')
e2 = embeddings.embed_query('river')

cosine_similarity([e1], [e2])

array([[0.72273722]])

##### With more context financial institutions are more similar

In [9]:
e1 = embeddings.embed_query('bank of england')
e2 = embeddings.embed_query('river')
e3 = embeddings.embed_query('bank of scotland')

cosine_similarity([e1], [e2, e3])

array([[0.23622898, 0.82905475]])

### [Word2Vec embeddings](https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/)

In [2]:
!pip install nltk



In [3]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.3 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.4-py3-none-any.whl.metadata (23 kB)
Downloading gensim-4.3.2-cp311-cp311-macosx_11_0_arm64.whl (24.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading smart_open-7.0.4-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-7.0.4


In [9]:
# Python program to generate word vectors using Word2Vec
import gensim

# importing all necessary modules
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action='ignore')


# Reads ‘alice.txt’ file
sample = open('/Users/shivramamurthi/models/state_of_the_union.txt')
s = sample.read()

# Replaces escape character with space
f = s.replace("\n", " ")

data = []

# iterate through each sentence in the file
for i in sent_tokenize(f):
	temp = []

	# tokenize the sentence into words
	for j in word_tokenize(i):
		temp.append(j.lower())

	data.append(temp)

print(data)

# Create CBOW model
model1 = gensim.models.Word2Vec(data, min_count=1,
								vector_size=100, window=5)

# Print results
print(model1.wv.similarity('ukraine', 'america'))
print(model1.wv.similarity('ukraine', 'union'))

# Create Skip Gram model
model2 = gensim.models.Word2Vec(data, min_count=1, vector_size=100,
								window=5, sg=1)

# Print results
print(model2.wv.similarity('ukraine', 'america'))
print(model2.wv.similarity('ukraine', 'union'))

sample.close()


[['madam', 'speaker', ',', 'madam', 'vice', 'president', ',', 'our', 'first', 'lady', 'and', 'second', 'gentleman', '.'], ['members', 'of', 'congress', 'and', 'the', 'cabinet', '.'], ['justices', 'of', 'the', 'supreme', 'court', '.'], ['my', 'fellow', 'americans', '.'], ['last', 'year', 'covid-19', 'kept', 'us', 'apart', '.'], ['this', 'year', 'we', 'are', 'finally', 'together', 'again', '.'], ['tonight', ',', 'we', 'meet', 'as', 'democrats', 'republicans', 'and', 'independents', '.'], ['but', 'most', 'importantly', 'as', 'americans', '.'], ['with', 'a', 'duty', 'to', 'one', 'another', 'to', 'the', 'american', 'people', 'to', 'the', 'constitution', '.'], ['and', 'with', 'an', 'unwavering', 'resolve', 'that', 'freedom', 'will', 'always', 'triumph', 'over', 'tyranny', '.'], ['six', 'days', 'ago', ',', 'russia', '’', 's', 'vladimir', 'putin', 'sought', 'to', 'shake', 'the', 'foundations', 'of', 'the', 'free', 'world', 'thinking', 'he', 'could', 'make', 'it', 'bend', 'to', 'his', 'menacing