In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import sentencepiece as spm
import matplotlib.pyplot as plt
import multiprocessing
import time
from gensim.models import Word2Vec
from torch.utils.data import Dataset, DataLoader

## Load data

In [3]:
# Read the three dataset splits from parquet files in the working directory.
train, test, validate = (
    pd.read_parquet(split_path)
    for split_path in ('train.parquet', 'test.parquet', 'validate.parquet')
)

### Collect all texts into one dataset

In [None]:

# Disabled one-off corpus builder (presumably kept for reference once
# 'AllTexts.txt' had been generated). For every row of train / test / validate
# it writes the passage texts, the answers and the query, lower-cased and
# newline-separated, to 'AllTexts.txt'.
# NOTE(review): the file is opened here without an explicit encoding, while
# tokenize_file reads it back with encoding='utf-8' — confirm the platform
# default matches before re-enabling this cell.

# with open('AllTexts.txt', 'w') as f:
#     pass  # This just creates the file, immediately closing it

# with open('AllTexts.txt', 'a') as f:  # Open file in append mode
#     for _, row in train.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')
#     for _, row in test.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')
#     for _, row in validate.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')

### Train tokenizer

In [None]:
# spm.SentencePieceTrainer.train(
#     input = 'AllTexts.txt',
#     model_prefix='spm_AllTexts', 
#     vocab_size=30000,
# )

## Load tokenizer

In [9]:
# Create the SentencePiece processor and load the model file from the working
# directory. 'spm_AllTexts' is the model_prefix used by the commented-out
# trainer cell above.
sp = spm.SentencePieceProcessor()
sp.load('spm_AllTexts.model')

True

In [None]:
def tokenize_file(file_path, sp_processor):
    """Tokenize a UTF-8 text file line by line.

    Args:
        file_path: Path of the text file; every line is treated as one
            sentence/paragraph.
        sp_processor: Tokenizer object exposing ``encode_as_pieces(str)``.

    Returns:
        A list with one entry per line of the file, each entry being whatever
        ``sp_processor.encode_as_pieces`` returns for the stripped line.
    """
    with open(file_path, 'r', encoding='utf-8') as corpus:
        # Surrounding whitespace (including the trailing newline) is removed
        # before the line is handed to the tokenizer.
        return [sp_processor.encode_as_pieces(raw_line.strip()) for raw_line in corpus]

### Tokenize the whole dataset

In [None]:
# Tokenize every line of the corpus file; the result is one list of pieces per
# line and holds the whole corpus in memory. In this notebook it is consumed
# only by the commented-out JSON export, build_vocab and train cells below.
tokinized_sentences = tokenize_file("AllTexts.txt", sp)

### Export tokens to JSON

In [None]:
# import json
# with open("Tokens_AllText.json", 'w', encoding='utf-8') as file:
#     json.dump(tokinized_sentences, file, ensure_ascii=False, indent=4)

### Define W2V

In [None]:
# Embedding dimensionality; passed to Word2Vec(vector_size=...) in the next cell.
vector_size = 128

In [None]:
# Untrained Word2Vec instance carrying the training hyper-parameters.
# Further down the notebook this name is rebound to the model read from
# "word2vec.model" via Word2Vec.load.
w2v_model = Word2Vec(
    min_count=20,             # ignore pieces that occur fewer than 20 times
    window=10,                # max distance between current and predicted piece
    vector_size=vector_size,
    sample=6e-5,              # down-sampling threshold for very frequent pieces
    alpha=0.03,               # initial learning rate
    min_alpha=0.0007,         # learning rate decays linearly down to this value
    negative=20,              # number of noise pieces drawn per positive example
    # Leave one core free, but never request zero worker threads: on a
    # single-core machine cpu_count() - 1 would be 0.
    workers=max(1, multiprocessing.cpu_count() - 1),
)

In [None]:
# print(len(tokinized_sentences))
# w2v_model.build_vocab(tokinized_sentences)
# w2v_model.save("word2vec.model")

In [None]:
# with open("word2vec_vocab.txt", 'w') as vocab_file:
#     for word in w2v_model.wv.key_to_index.keys():
#         vocab_file.write(word + '\n')

In [None]:
# w2v_model.train(tokinized_sentences, total_examples=w2v_model.corpus_count, epochs=20, report_delay=1)
# w2v_model.save("word2vec.model")

In [None]:
# Rebind w2v_model to the model stored in "word2vec.model", replacing the
# untrained instance constructed above.
# NOTE: gensim's load() unpickles the file — only load model files from a
# trusted source.
w2v_model = Word2Vec.load("word2vec.model")

In [None]:
# Sanity check: nearest neighbours of the piece '▁hacker'.
similar_words = w2v_model.wv.most_similar('▁hacker', topn=4)
print(similar_words)

# Same check through the tokenizer. The (commented-out) corpus builder
# lower-cases all text before it is written to AllTexts.txt, so the probe is
# lower-cased as well. Pieces missing from the Word2Vec vocabulary are dropped
# because most_similar raises KeyError for unknown keys.
probe_pieces = [
    piece
    for piece in sp.encode_as_pieces('Hacker'.lower())
    if piece in w2v_model.wv
]
if probe_pieces:
    print(w2v_model.wv.most_similar(positive=probe_pieces))
else:
    print('No in-vocabulary pieces for the probe word.')

In [None]:
def to_embedding(sp, text, vector_size):
    """Embed ``text`` as the mean Word2Vec vector of its SentencePiece pieces.

    Pieces that are not in the Word2Vec vocabulary are ignored.

    Note:
        Reads the module-level ``w2v_model``; it must be defined before this
        function is called.
        NOTE(review): the commented-out corpus builder lower-cases text before
        training — callers probably need to lower-case ``text`` too; confirm.

    Args:
        sp: Tokenizer exposing ``encode_as_pieces(str)``.
        text: String to embed.
        vector_size: Length of the zero vector returned when no piece of
            ``text`` is in the vocabulary.

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of all
        in-vocabulary pieces, or zeros of length ``vector_size`` (with the
        same dtype as the model's vectors) when there are none.
    """
    keyed_vectors = w2v_model.wv
    known_vectors = [
        keyed_vectors[piece]
        for piece in sp.encode_as_pieces(text)
        if piece in keyed_vectors
    ]

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # np.zeros defaults to float64; use the model's dtype so the fallback can
    # be stacked with regular embeddings without a silent up-cast.
    return np.zeros(vector_size, dtype=keyed_vectors.vectors.dtype)

### Triples for training

In [11]:
def prepareTriplesTokens(dataframe, seed=None):
    """Build [query, relevant passage, irrelevant passage] training triples.

    For every passage attached to a row, one "irrelevant" passage is drawn
    uniformly at random from the passages of a *different* row.

    Rows are addressed by position throughout, so the frame's index labels do
    not matter (a filtered or re-indexed frame works as well as a RangeIndex).

    Args:
        dataframe: DataFrame with a 'query' column and a 'passages' column
            whose values are mappings containing a 'passage_text' sequence.
        seed: Optional seed for the NumPy random generator. Pass an int to get
            the same negatives on every run; ``None`` (default) draws fresh
            entropy.

    Returns:
        A list of ``[query, relevant, irrelevant]`` lists, ordered by row and
        then by passage. Rows without passages contribute nothing. The list is
        empty when fewer than two rows have passages, because no negative can
        be drawn in that case.
    """
    rng = np.random.default_rng(seed)

    queries = dataframe['query'].tolist()
    passage_lists = [entry['passage_text'] for entry in dataframe['passages']]

    # Positions of rows that own at least one passage; only these can be the
    # source of a positive or of a negative.
    donors = [pos for pos, texts in enumerate(passage_lists) if len(texts) > 0]
    other_donor_count = len(donors) - 1
    if other_donor_count < 1:
        return []

    triples = []
    for rank, row_pos in enumerate(donors):
        query = queries[row_pos]
        for relevant in passage_lists[row_pos]:
            # Uniform draw over all donors except the current row: sample from
            # one slot fewer and step over the current row's own slot.
            pick = int(rng.integers(other_donor_count))
            if pick >= rank:
                pick += 1
            negatives = passage_lists[donors[pick]]
            irrelevant = negatives[int(rng.integers(len(negatives)))]

            triples.append([query, relevant, irrelevant])

    return triples

# Build the triples for each split, in the order train, test, validate.
train_triplets, test_triplets, validate_triplets = map(
    prepareTriplesTokens, (train, test, validate)
)

In [18]:
# Peek at the first two training triples as a sanity check.
print(train_triplets[:2])

[['what is rba', "Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.", 'This report describes the typical weather at the Double Eagle II Airport (Albuquerque, New Mexico, United States) weather station over the course of an average year. It is based on the historical records from 2001 to 2012. Earlier records are either unavailable or unreliable. The daily average low (blue) and high (red) temperature with percentile bands (inner band from 25th to 75th percentile, outer band from 10th to 90th percentile). The warm season lasts from May 23 to September 9

In [20]:
# Wrap each list of triples in a DataFrame with named columns.
# The *_triplets names are rebound from lists to DataFrames here.
columns = ['query', 'relevant', 'irrelevant']
train_triplets, test_triplets, validate_triplets = (
    pd.DataFrame(triple_rows, columns=columns)
    for triple_rows in (train_triplets, test_triplets, validate_triplets)
)

# Persist every split as a parquet file (written with the pyarrow engine).
train_triplets.to_parquet('train_triplets.parquet', engine='pyarrow')
test_triplets.to_parquet('test_triplets.parquet', engine='pyarrow')
validate_triplets.to_parquet('validate_triplets.parquet', engine='pyarrow')