In [1]:
!pip install torch
!pip install fasttext



In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity # for evaluating bert model embedding

# 1. Embedding using Bert

In [3]:
df_total = pd.read_csv("cleaned_Ubuntu-fa.csv")
df_total = df_total[["Cleaned_Farsi","Cleaned_English"]] # needed columns
df = df_total.head(1000)                                 # because training all rows took too long

In [4]:
# turn string to list, for not getting an error from persian tokenizer!
english_texts = df['Cleaned_English'].astype(str).tolist()
persian_texts = df['Cleaned_Farsi'].astype(str).tolist()

In [5]:
# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Tokenize English and Persian texts
english_tokens = [tokenizer(text, return_tensors='pt', padding=True, truncation=True) for text in english_texts]
persian_tokens = [tokenizer(text, return_tensors='pt', padding=True, truncation=True) for text in persian_texts]

In [7]:
# Load pre-trained BERT model (weights)
model = BertModel.from_pretrained('bert-base-multilingual-cased')
model.eval()

# Function to get embeddings
def get_embeddings(tokens):
    with torch.no_grad():
        outputs = model(**tokens)
        last_hidden_states = outputs.last_hidden_state
        cls_embedding = last_hidden_states[:, 0, :]
    return cls_embedding

# Get embeddings for English and Persian texts
english_embeddings = [get_embeddings(tokens) for tokens in english_tokens]
persian_embeddings = [get_embeddings(tokens) for tokens in persian_tokens]

# Convert embeddings to lists or numpy arrays for further processing
english_embeddings = [embedding.numpy() for embedding in english_embeddings]
persian_embeddings = [embedding.numpy() for embedding in persian_embeddings]

# Saving the embeddings to a file
np.save('english_embeddings.npy', english_embeddings)
np.save('persian_embeddings.npy', persian_embeddings)

Blocks below are for evaluating the performance of Bert:

In [8]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

def get_word_embedding(word, context_sentence):
    """Get the embedding for a word in a specific context using BERT."""
    # Tokenize the context sentence
    inputs = tokenizer(context_sentence, return_tensors='pt', truncation=True)

    # Tokenize the word
    word_tokens = tokenizer.tokenize(word)
    word_ids = tokenizer.convert_tokens_to_ids(word_tokens)

    # Find the word indices in the tokenized sentence
    input_ids = inputs['input_ids'][0]
    word_indices = []
    for i in range(len(input_ids) - len(word_ids) + 1):
        if torch.equal(input_ids[i:i + len(word_ids)], torch.tensor(word_ids)):
            word_indices.extend(range(i, i + len(word_ids)))

    if not word_indices:
        print(f"Warning: Word '{word}' not found in the context sentence.")
        return None

    # Get the hidden states from BERT
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

    # Extract the embedding for the word (averaging the tokens if it consists of more than one sub-token)
    word_embedding = hidden_states[0, word_indices, :].mean(dim=0).numpy()

    # Check for NaN values in the embedding
    if np.isnan(word_embedding).any():
        print(f"Warning: NaN values detected in the embedding for '{word}'")
        return None  # or you can return a default vector, e.g., np.zeros_like(word_embedding)

    return word_embedding

def find_nearest_neighbors(target_word, target_context, candidate_words, top_n=5):
    """Find the nearest neighbors to the target word using BERT embeddings."""
    # Get the embedding for the target word in its context
    target_embedding = get_word_embedding(target_word, target_context)
    if target_embedding is None:
        raise ValueError(f"Embedding for the target word '{target_word}' is invalid.")

    # Compute similarities between the target word and each candidate word
    similarities = {}
    for word in candidate_words:
        word_embedding = get_word_embedding(word, target_context)
        if word_embedding is not None:
            similarity = cosine_similarity([target_embedding], [word_embedding])[0][0]
            similarities[word] = similarity

    # Sort the words by similarity and select the top N neighbors
    nearest_neighbors = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return nearest_neighbors

In [9]:
# Example evaluation
target_word = "file"
target_context = "keeping document and record and file of your report is important for the graphics theme"
candidate_words = ["document", "record", "report", "graphics", "theme"]

# change top_n value if you add more candidate words
nearest_neighbors = find_nearest_neighbors(target_word, target_context, candidate_words, top_n=5)

# Print the top 3 nearest neighbors
for word, similarity in nearest_neighbors:
    print(f"Word: {word}, Similarity Score: {similarity}")

Word: record, Similarity Score: 0.8624204397201538
Word: document, Similarity Score: 0.7298275828361511
Word: report, Similarity Score: 0.5209749937057495
Word: graphics, Similarity Score: 0.41326454281806946
Word: theme, Similarity Score: 0.2993508577346802


# 2. Embedding in FastText

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
df = df_total                 # fasttext is lighter and can handle all the data

# make a column that labels persian text for the parallel english text
df["prefix"] = "__label__"
df['data'] = df['prefix'] + df['Cleaned_Farsi'] + df['Cleaned_English']
df["data"].to_csv("data.txt",index=False)

In [12]:
import random

with open('data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

random.shuffle(lines)

In [13]:
# split data for train/test

split_ratio = 0.8

split_index = int(split_ratio * len(lines))

train_lines = lines[:split_index]
test_lines = lines[split_index:]

In [14]:
with open('train.txt', 'w', encoding='utf-8') as train_file:
    train_file.writelines(train_lines)

with open('test.txt', 'w', encoding='utf-8') as test_file:
    test_file.writelines(test_lines)

In [15]:
with open('train.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

lines = [line for line in lines if len(line.strip().split()) > 1]

with open('train_cleaned.txt', 'w', encoding='utf-8') as file:
    file.writelines(lines)

In [16]:
with open('test.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

lines = [line for line in lines if len(line.strip().split()) > 1]

with open('test_cleaned.txt', 'w', encoding='utf-8') as file:
    file.writelines(lines)

In [17]:
import fasttext

model = fasttext.train_supervised(input="train_cleaned.txt", epoch=100, lr=1.0, wordNgrams=2, verbose=2, minCount=1)
model.save_model("model.bin")

Blcoks below are for evaluating the performance of FastText

In [18]:
result = model.test("test_cleaned.txt")

print(f"Number of samples: {result[0]}")
print(f"Precision: {result[1]}")
print(f"Recall: {result[2]}")

Number of samples: 887
Precision: 0.5152198421645998
Recall: 0.5152198421645998


In [19]:
# Load your trained model
model = fasttext.load_model('model.bin')

In [20]:
word_vector = model.get_word_vector("authentication")
print(word_vector)

[-0.37016302 -0.5486845  -0.22444446  0.07106098  0.09364774 -0.12465325
  0.10660455 -0.4871416  -0.21357687  0.02793686  0.2102914   0.20017464
  0.3719689  -0.7714762  -0.29769063 -0.26139447 -0.44700542 -0.06228543
  0.09944005 -0.18963099 -0.08423263 -0.33257824 -0.13386889 -0.37877586
  0.30146214 -0.12518041 -0.02953938 -0.10176296  0.2351937  -0.17748615
  0.06994535 -0.07155325 -0.1190361   0.17922679  0.24027514  0.41006702
 -0.32520977 -0.30459452 -0.08801641 -0.4387047   0.04408648 -0.00471935
 -0.21851575  0.09336521 -0.49029195 -0.25932747  0.5399727  -0.00917213
  0.10246054  0.10208292  0.35060412 -0.10148465  0.1732219  -0.00169522
  0.4348695  -0.08302036  0.3834335  -0.29963663 -0.5304673  -0.42443818
  0.11796713 -0.4875522   0.04810935  0.33168545 -0.16425323 -0.04012479
  0.00511317 -0.00417817  0.53839684 -0.30340454 -0.23942363 -0.30235043
  0.0916883   0.17722626 -0.04170152 -0.36882523  0.28472888  0.03066347
 -0.05397275  0.10758166  0.39303797  0.20922975 -0

In [21]:
sentence_vector = model.get_sentence_vector("bad authentication response from server")
print(sentence_vector)

[-0.11008112 -0.02610775 -0.3189056   0.04961057  0.0407434   0.01536317
  0.09127185  0.10490729 -0.0500894  -0.11918949  0.02716932  0.35834017
  0.10750129 -0.347848   -0.17777717 -0.17380951 -0.02726148 -0.02123589
  0.07377699 -0.04268629 -0.03722494  0.1074599  -0.06218567 -0.01664681
 -0.07355169 -0.05608751  0.0018218   0.25083756 -0.01815252 -0.06909644
 -0.06544725  0.17526896  0.02741451 -0.01463091  0.15315399  0.16924553
 -0.06835879 -0.18213184 -0.13233447 -0.11862053  0.00150187  0.20361026
  0.09231488 -0.22798574 -0.34210882  0.06407432  0.27326524 -0.03773721
  0.02400308  0.10411638  0.06865181 -0.10726342  0.06456694 -0.12413567
  0.10071895  0.01451605  0.12047452 -0.09830119  0.08737831 -0.1426177
 -0.04545145 -0.17377996 -0.11803102 -0.03133576  0.0028674  -0.13329779
  0.30386424 -0.16214018  0.18414883 -0.10412005 -0.14516914 -0.10073404
 -0.07270758  0.01443606 -0.0624998  -0.0373597   0.02665921  0.12145304
 -0.01364399  0.23549755  0.14918913  0.15001759  0.

In [22]:
with open('word_vectors.txt', 'w') as f:
    for word in model.get_words():
        vector = model.get_word_vector(word)
        f.write(f"{word} {' '.join(map(str, vector))}\n")

print("Word vectors have been saved to word_vectors.txt")

Word vectors have been saved to word_vectors.txt


In [23]:
model.get_nearest_neighbors("disc", 5)

[(0.7846036553382874, 'drive'),
 (0.7521096467971802, 'گردان'),
 (0.7510446310043335, 'rewritable'),
 (0.7427465319633484, 'بازنویسی'),
 (0.7196008563041687, 'دیسکdisc')]