# Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
import pickle
import re      
import openai
import os
from sklearn.manifold import TSNE
import nbformat
import plotly.graph_objects as go  
from sklearn.neighbors import NearestNeighbors

from dotenv import load_dotenv

In [2]:
load_dotenv()

True

# More advanced cases of tokenization

Let's experiment with tokenizer used by BERT

In [5]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [6]:
text = 'FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week, see more at'

In [7]:
encoded_input = tokenizer(text, return_tensors='pt')

In [8]:
encoded_input

{'input_ids': tensor([[  101,  3582, 27439,  4710,  1030,  2605,  1035, 20014,  2063,  1030,
          1052,  5283,  2818,  2135, 28311,  1030, 23689, 11514,  4747,  1035,
          3000,  2005,  2108,  2327,  5117,  2372,  1999,  2026,  2451,  2023,
          2733,  1010,  2156,  2062,  2012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Check tokenizer vocab size

In [9]:
tokenizer.decode([2000])

'to'

Analyze vocab values and tokens, reverse the vocab dictionary to match tokens to words

We can find all word-token pairs in tokenizer.vocab.items(), stored as tuples. Let's create a reversed dict that lets us map each token id back to its corresponding characters

In [10]:
tokenizer_vocab_inverted ={v: k for k, v in tokenizer.vocab.items()}

You can also use tokenizer.decode([token]) but I wanted to show a use case of reversed dictionary

In [11]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")['input_ids']

In [12]:
# To easily access token index you need to convert tensor to a numpy value
inputs[0].detach().numpy()

array([  101,  7592,  1010,  2026,  3899,  2003, 10140,   102])

In [13]:
tokenizer.decode([1789])

'古'

## Reverse-engineer the tokenization process

In [14]:
def tokenization_words(text, tokenizer):
    """Tokenize ``text`` and return the vocabulary string of every produced token.

    Parameters:
    - text (str): Text to tokenize.
    - tokenizer: Hugging Face-style tokenizer. It must be callable on ``text`` and
      return a mapping with an ``"input_ids"`` entry, and it must expose
      ``convert_ids_to_tokens``.

    Returns:
    - list[str]: One vocabulary string per token id, in order, including whatever
      special tokens the tokenizer adds on its own.
    """
    # Map the ids back with the tokenizer that produced them. Relying on a
    # module-level inverted vocabulary would silently give wrong strings (or a
    # KeyError) as soon as a different tokenizer is passed in.
    token_ids = tokenizer(text)["input_ids"]
    return list(tokenizer.convert_ids_to_tokens(token_ids))

In [15]:
text = 'Hello, my dog is cute'

In [16]:
tokenization_words(text, tokenizer)

['[CLS]', 'hello', ',', 'my', 'dog', 'is', 'cute', '[SEP]']

In [17]:
text2 = "I misspelled my dog while at the vet"

In [18]:
tokenization_words(text2, tokenizer)

['[CLS]',
 'i',
 'miss',
 '##pel',
 '##led',
 'my',
 'dog',
 'while',
 'at',
 'the',
 'vet',
 '[SEP]']

## Experiment with tokenizing a few sentences

- Find a word which is tokenized only as syllables
- Find 5 words, which are tokenized as 2+ tokens, do you see any pattern here?

# Word vectorization

In [19]:
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
# The context manager closes the file handle once the embeddings are loaded.
with open("../data/word_embeddings_subset.p", "rb") as embeddings_file:
    word_embeddings = pickle.load(embeddings_file)

In [20]:
word_embeddings.keys()

dict_keys(['country', 'city', 'China', 'Iraq', 'oil', 'town', 'Canada', 'London', 'England', 'Australia', 'Japan', 'Pakistan', 'Iran', 'gas', 'happy', 'Russia', 'Afghanistan', 'France', 'Germany', 'Georgia', 'Baghdad', 'village', 'Spain', 'Italy', 'Beijing', 'Jordan', 'Paris', 'Ireland', 'Turkey', 'Egypt', 'Lebanon', 'Taiwan', 'Tokyo', 'Nigeria', 'Vietnam', 'Moscow', 'Greece', 'Indonesia', 'sad', 'Syria', 'Thailand', 'Libya', 'Zimbabwe', 'Cuba', 'Ottawa', 'Tehran', 'Sudan', 'Kenya', 'Philippines', 'Sweden', 'Poland', 'Ukraine', 'Rome', 'Venezuela', 'Switzerland', 'Berlin', 'Bangladesh', 'Portugal', 'Ghana', 'Athens', 'king', 'Madrid', 'Somalia', 'Dublin', 'Qatar', 'Chile', 'Islamabad', 'Bahrain', 'Nepal', 'Norway', 'Serbia', 'Kabul', 'continent', 'Brussels', 'Belgium', 'Uganda', 'petroleum', 'Cairo', 'Denmark', 'Austria', 'Jamaica', 'Georgetown', 'Bangkok', 'Finland', 'Peru', 'Romania', 'Bulgaria', 'Hungary', 'Vienna', 'Kingston', 'Manila', 'Cyprus', 'Azerbaijan', 'Copenhagen', 'Fiji',

In [21]:
openai.api_key = os.getenv("OPENAI_API_KEY")

### Vectorize

Create a function which takes text and an OpenAI embedding model and returns the embedding

In [22]:
def get_embedding(text: str, model: str = "text-embedding-ada-002"):
    """Return the OpenAI embedding of ``text``, or None when it cannot be produced.

    None is returned without calling the API when ``text`` is empty or not a
    string. Any exception raised while calling the API or reading the response
    is printed and also results in None.
    """
    # Guard clause: only non-empty strings are sent to the API
    if not (text and isinstance(text, str)):
        return None

    try:
        api_response = openai.embeddings.create(input=text, model=model)
        first_item = api_response.data[0]
        return first_item.embedding
    except Exception as e:
        # Best effort: report the failure and let the caller handle the None
        print(f"Error getting embedding for text '{text}': {e}")
        return None


In [23]:
df = pd.DataFrame(word_embeddings.keys(), columns = ["word"])

In [24]:
embedding_model = "text-embedding-ada-002"

In [64]:
## Evaluate if function works
get_embedding("word")

[-0.007513395976275206,
 -0.019661467522382736,
 0.001773905591107905,
 -0.00998739805072546,
 -0.01530095562338829,
 0.014661239460110664,
 -0.004961060825735331,
 -0.03159412369132042,
 -0.00014452761388383806,
 -0.01624094508588314,
 0.02173727937042713,
 0.02692028135061264,
 -0.0279386043548584,
 0.0006499152514152229,
 -0.011018777266144753,
 0.01759870909154415,
 0.05713053047657013,
 -0.004882728215306997,
 0.02509252354502678,
 -0.02498807944357395,
 -5.056426380178891e-05,
 -0.0005548554472625256,
 -0.0028917761519551277,
 -0.009536986239254475,
 -0.022677268832921982,
 0.0005377202178351581,
 0.0047391182743012905,
 -0.018878141418099403,
 0.0024740025401115417,
 -0.002126401988789439,
 0.0030892393551766872,
 -0.008962547406554222,
 -0.026136957108974457,
 -0.01579706184566021,
 -0.023473650217056274,
 -0.00832935981452465,
 -0.009445598348975182,
 -0.029400812461972237,
 -0.0031430930830538273,
 -0.011103637516498566,
 0.0171287152916193,
 0.00016706861788406968,
 -0.01278

In [26]:
## Vectorize words in df
df["vector"] = df.word.apply(get_embedding)


In [27]:
# get_embedding returns a list on success and None on failure: lists become numpy
# arrays, anything else is replaced by a zero vector of length 1536.
# NOTE(review): 1536 is presumably the output width of the embedding model used
# above - confirm. The zero-vector placeholders are stacked together with the real
# embeddings and therefore take part in the t-SNE fit further down.
df["vector"] = df["vector"].apply(lambda x: np.array(x) if isinstance(x, list) else np.zeros(1536))

## Dimensionality reduction - TSNE 

We would like to see these vectors in 3D Space, let's experiment with TSNE.

Read the sklearn.manifold TSNE docs and shrink the embeddings to 3 dimensions

In [28]:
def reduce_dimensions_tsne(embeddings, n_components=3, perplexity=20, random_state=42):
    """
    Project high-dimensional embeddings into a low-dimensional space with t-SNE.

    Parameters:
    - embeddings: Array-like of embeddings, handed unchanged to TSNE.fit_transform.
    - n_components (int): Number of output dimensions (3 by default, for 3D plots).
    - perplexity (int): t-SNE perplexity, forwarded to the TSNE constructor.
    - random_state (int): Seed forwarded to TSNE so repeated runs are reproducible.

    Returns:
    - The array produced by TSNE.fit_transform, one row per input embedding with
      n_components columns.
    """
    tsne_settings = {
        "n_components": n_components,
        "perplexity": perplexity,
        "random_state": random_state,
    }
    reducer = TSNE(**tsne_settings)
    projected = reducer.fit_transform(embeddings)
    return projected


In [29]:
# Stack embeddings for t-SNE transformation
full_vectors = np.vstack(df["vector"].values)  # Change column as needed

# Reduce dimensionality
reduced_vectors = reduce_dimensions_tsne(full_vectors )

In [30]:
df["reduced_vec"] = reduced_vectors.tolist()

In [31]:
df_vectors_3d = pd.DataFrame(reduced_vectors, columns = ["x", "y", "z"])

In [65]:
df["reduced_vec"][0]

[18.590864181518555, 8.112115859985352, 41.68663024902344]

In [33]:
df = df.merge(df_vectors_3d, left_index=True, right_index=True)

Check how many word embeddings are present in our sample

In [34]:
df.to_parquet("embeddings_test.parquet")

In [35]:
df[["x","y", "z"]].describe()

Unnamed: 0,x,y,z
count,243.0,243.0,243.0
mean,-0.420687,-0.882041,-0.189521
std,25.961742,23.676033,23.17832
min,-53.229256,-59.044952,-49.872135
25%,-23.145083,-17.549101,-18.643853
50%,2.360036,-0.95667,-0.232658
75%,20.684137,16.93948,15.588058
max,45.205791,51.008347,54.444401


Use provided wireframe to create a 3D plot

In [69]:
df = pd.read_parquet("embeddings_test.parquet")

In [70]:
df.rename(columns = {"word":"label"}, inplace=True)

In [71]:
RANGE = 50

In [72]:

# One trace with every word as a marker; the word itself is the hover text
trace0 = go.Scatter3d(x=df.x, y=df.y, z=df.z, mode="markers", text=df.label)
data = [trace0]

# Each axis gets its own letter as title and the same fixed (-RANGE, RANGE) window
figure = go.Figure(
    data=data,
    layout=go.Layout(
        scene={
            f"{axis}axis": dict(title=axis, range=(-RANGE, RANGE))
            for axis in ("x", "y", "z")
        },
    ),
)

name = ''

# Camera: z axis points up, looking at the scene centre from eye=(0, -1, 1)
camera = {
    "up": {"x": 0, "y": 0, "z": 1},
    "center": {"x": 0, "y": 0, "z": 0},
    "eye": {"x": 0, "y": -1, "z": 1},
}

figure.update_layout(scene_camera=camera, title=name)

figure.show()

## Show country - capital vectors

Let's explore relations between a set of country-capital pairs

In [75]:
# (country, capital) pairs; every tuple keeps the same order
country_capitals_paris = [("France", "Paris"), ("England", "London"), ("Mali", "Bamako"), ("Italy", "Rome"),
                          ("Poland", "Warsaw"), ("Spain", "Madrid"), ("Kenya", "Nairobi"), ("Germany", "Berlin"),
                          ("Japan", "Tokyo"), ("China", "Beijing"), ("Jordan", "Amman")]

In [81]:
## Prepare pairs (subsets of df) of country and capital vectors

data = []
for country, capital in country_capitals_paris:
    # Rows of df whose label is either word of the pair (kept in df order);
    # words missing from df simply contribute no point
    df_pair = df.loc[df.label.isin([country, capital])]

    # One trace per pair: the two points joined by a line
    trace = go.Scatter3d(
        x=df_pair.x,
        y=df_pair.y,
        z=df_pair.z,
        mode="markers+lines",
        text=df_pair.label,
    )

    data.append(trace)

In [82]:
df_pair

Unnamed: 0,label,vector,reduced_vec,x,y,z
25,Jordan,"[0.004192049149423838, -0.024402255192399025, ...","[41.1163330078125, -11.857372283935547, 26.416...",41.116333,-11.857372,26.416384
171,Amman,"[-0.005985997151583433, -0.01016854215413332, ...","[15.295663833618164, -26.73141098022461, -4.19...",15.295664,-26.731411,-4.191685


In [83]:
# Same fixed (-RANGE, RANGE) window on every axis so the pair traces are comparable
figure = go.Figure(
    data=data,
    layout=go.Layout(
        scene=dict(
            xaxis=dict(title="x", range=(-RANGE, RANGE)),
            yaxis=dict(title="y", range=(-RANGE, RANGE)),
            zaxis=dict(title="z", range=(-RANGE, RANGE)),
        ),
    ),
)

name = ''
# Custom camera (not Plotly's default view): z axis points up and the scene
# centre is viewed from eye=(0, -1, 1)
camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=0, y=-1, z=1)
)

figure.update_layout(scene_camera=camera, title=name)

figure.show()

## Predict capital of Spain

In [45]:
# Create the country-capital vector by subtracting two vectors, e.g. France and Paris
country_capital_vector = word_embeddings["France"] - word_embeddings["Paris"]

Now we will use word vectorizations to predict a country's capital


In [86]:
country_capital_vector  = np.array(df.query("label == 'France'").reduced_vec.values[0]) - np.array(df.query("label == 'Paris'").reduced_vec.values[0])

In [87]:
country_capital_vector

array([-0.68396187,  5.94998884, -5.21219444])

In [89]:
## Calculate the expected vector for England's capital based on the previously calculated country_capital_vector
expected_capital_vector = np.array(df.query("label == 'England'").reduced_vec.values[0]) - country_capital_vector

## Leverage nearest neighbors search to find corresponding vector

Searching for similar neighbors in a 300D vector space can be tricky, and brute-force calculations cost a lot of computation. We can use the sklearn NearestNeighbors algorithm instead

In [90]:
# Fit nearest neighbors based on X, set n_neighbors to 3
nearest_neighbors = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(reduced_vectors )

In [91]:
# Find distance and idx of 3 nearest neighbors for expected capital vector
dist, idx = nearest_neighbors.kneighbors(expected_capital_vector.reshape(1, -1))

In [92]:
idx[0]

array([ 7,  8, 26])

In [93]:
dist

array([[3.35663304, 7.93959331, 9.22910559]])

In [95]:
#Let's see if any of nearest neighbors are correct
df.label.loc[idx[0]]

7      London
8     England
26      Paris
Name: label, dtype: object

Create a function which will predict a given country's capital

In [99]:
def get_capital(country, df_vec):
    """Predict the capital of ``country`` from the reduced word vectors.

    The country's ``reduced_vec`` is shifted by the module-level
    ``country_capital_vector`` and the closest word is looked up with the
    module-level, already fitted ``nearest_neighbors`` model.

    Parameters:
    - country (str): Value to look up in ``df_vec["label"]``.
    - df_vec (pandas.DataFrame): Frame with ``label`` and ``reduced_vec`` columns.
      Its row order must match the array ``nearest_neighbors`` was fitted on.

    Returns:
    - pandas.Series: Row of ``df_vec`` for the nearest neighbour that is not the
      country itself (read ``.label`` for the predicted word).

    Raises:
    - IndexError: If ``country`` is not present in ``df_vec["label"]``.
    """
    # Look the country up in the frame that was passed in. A boolean mask
    # (instead of a string-built query) also handles names containing quotes.
    matches = df_vec.loc[df_vec["label"] == country, "reduced_vec"]
    if matches.empty:
        # Same exception type the failed positional lookup used to raise
        raise IndexError(f"country {country!r} not found in df_vec['label']")

    # Compute the expected capital vector
    expected_capital_vector = np.asarray(matches.iloc[0]) - country_capital_vector

    # kneighbors returns row positions of the fitted array, hence iloc below
    _, idx = nearest_neighbors.kneighbors(expected_capital_vector.reshape(1, -1))

    # The country itself is often its own nearest neighbour - skip it
    for position in idx[0]:
        candidate = df_vec.iloc[position]
        if candidate["label"] != country:
            return candidate
    return df_vec.iloc[idx[0][0]]

In [102]:
# Print country capitals for the following list
countries = ["England", "Poland","Kenya", "Germany"]

for country in countries:
    capital = get_capital(country, df).label
    print(f"{capital} is the capital of {country}")

London is the capital of England
Warsaw is the capital of Poland
Nigeria is the capital of Kenya
Berlin is the capital of Germany


In [103]:
# Print country capitals for the following list
countries = ["Japan", "China"]

for country in countries:
    # Pass the notebook's dataframe `df`; no variable called `df_vec` exists at module level
    capital = get_capital(country, df).label
    print(f"{capital} is the capital of {country}")

Egypt is the capital of Japan
Japan is the capital of China


## Semantic similarity can be deceiving

In [106]:
### Prepare a function, to compare vector similarity between input sentence and a set of target sentences


def vector_similarity(vec1, vec2):
    """
    Returns the cosine similarity between two vectors.

    Parameters:
    - vec1, vec2: 1-D array-likes of equal length.

    Returns:
    - float: dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when either vector has
      zero norm (the cosine is undefined there).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Avoid 0/0 -> nan (plus a RuntimeWarning); nan would also break ranking by score
        return 0.0
    return np.dot(vec1, vec2) / norm_product

def compare_sentences(
    input_sentence: str,
    target_sentences: list,
    model: str = "text-embedding-ada-002"
):
    """
    Rank target sentences by cosine similarity to an input sentence and print the ranking.

    The input sentence and every target sentence are embedded with get_embedding.
    Targets whose embedding could not be obtained are reported and left out.
    The remaining ones are printed from most to least similar. Nothing is returned;
    if the input sentence itself cannot be embedded, a message is printed and the
    function stops.
    """
    source_vector = get_embedding(input_sentence, model=model)
    if source_vector is None:
        print("Failed to get embedding for the source sentence.")
        return

    # Collect (target_sentence, similarity) for every target that could be embedded
    scored = []
    for t_sentence in target_sentences:
        target_vector = get_embedding(t_sentence, model=model)
        if target_vector is not None:
            scored.append((t_sentence, vector_similarity(source_vector, target_vector)))
        else:
            print(f"Skipping sentence due to embedding error: {t_sentence}")

    # Highest similarity first
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    print(f"Source sequence: {input_sentence}\n")
    print("Target sentences ranked by similarity (descending):")

    idx = 0
    for sentence, sim_score in ranked:
        idx += 1
        print(f"{idx}. '{sentence}' (similarity: {sim_score:.4f})")


In [105]:
input_sentence = "I do not eat diary and animal products, I don't want to hurt cows"
target_sentence = ["We offer greek salad with plant based feta", 
                   "Here is some amazing steak from grass fed cows",
                    "Double cheese pizza is on promo today"] 


In [59]:
compare_sentences(input_sentence, target_sentence)

Source sequence: I do not eat diary and animal products, I don't want to hurt cows

Target sentences ranked by similarity (descending):
1. 'Here is some amazing steak from grass fed cows' (similarity: 0.8087)
2. 'We offer greek salad with plant based feta' (similarity: 0.7774)
3. 'Double cheese pizza is on promo today' (similarity: 0.7519)


In [60]:
input_sentence = "I am looking for a red Ferrari"
target_sentence = ["This Blue Lamborghini sounds great", 
                   "Here are red sport shoes", 
                   "Passat 1.9 TDI has the best engine"] 


In [61]:
compare_sentences(input_sentence, target_sentence)

Source sequence: I am looking for a red Ferrari

Target sentences ranked by similarity (descending):
1. 'Here are red sport shoes' (similarity: 0.8340)
2. 'This Blue Lamborghini sounds great' (similarity: 0.8261)
3. 'Passat 1.9 TDI has the best engine' (similarity: 0.7479)


In [62]:
input_sentence = "Aby odpowiedzieć na to pytanie musimy przeanalizować art. 10 pkt. 8 kodeksu podstępowania cywilnego"
target_sentence = ["Art. 100 kodeksu podstępowania cywilnego mówi o problemie zadłużenia", 
                   "W kodeksie cywilnym, artykule 8 jest mowa o karze 10 zł", 
                   "Chodzi tutaj o artykuł 10, pkt 8 kodeksu postępowania cywilnego, który wznawia postępowanie po 10 dniach"] 


In [63]:
compare_sentences(input_sentence, target_sentence)

Source sequence: Aby odpowiedzieć na to pytanie musimy przeanalizować art. 10 pkt. 8 kodeksu podstępowania cywilnego

Target sentences ranked by similarity (descending):
1. 'W kodeksie cywilnym, artykule 8 jest mowa o karze 10 zł' (similarity: 0.8884)
2. 'Art. 100 kodeksu podstępowania cywilnego mówi o problemie zadłużenia' (similarity: 0.8848)
3. 'Chodzi tutaj o artykuł 10, pkt 8 kodeksu postępowania cywilnego, który wznawia postępowanie po 10 dniach' (similarity: 0.8778)
