In [4]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(1, '../scripts/')

import embeddings
import csv
from tqdm.notebook import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Using CSV files

In [5]:
print('Load the embedding model...')
tokenizer, model = embeddings.load_model()

print('Get the number of papers to process...')
# Count parsed CSV records instead of physical lines: a quoted field may span
# several lines, which would inflate a raw line count. The file is opened with
# newline='' as the csv module requires for embedded newlines to parse correctly.
with open('../data/raw/papers_raw.csv', 'r', encoding='utf-8', newline='') as file:
    line_count = sum(1 for _ in csv.reader(file, delimiter='\t', quotechar='"'))

# Subtract 1 for the header row; clamp at zero so an empty file gives 0, not -1
total_papers = max(line_count - 1, 0)


print('Processing...')
with open('../data/raw/papers_raw.csv', 'r', encoding='utf-8', newline='') as reader, \
     open('../data/vectors/papers_vectors.csv', 'w', encoding='utf-8', newline='') as writer:

    csv_reader = csv.reader(reader, delimiter='\t', quotechar='"')
    csv_writer = csv.writer(writer, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # Output header: "PaperID" followed by 768 numbered vector columns
    csv_writer.writerow(["PaperID"] + list(range(0, 768)))

    # Skip header of the raw papers CSV (the default avoids StopIteration on an empty file)
    next(csv_reader, None)

    for line in tqdm(csv_reader, total=total_papers):
        # NOTE(review): columns 2 and 3 are joined with no separator between
        # them — confirm this is the intended input text for the model.
        text = line[2] + line[3]
        vector = embeddings.get_embedding(text, tokenizer, model)

        # Prefix the row with the paper identifier so every data row lines up
        # with the "PaperID" + 768-column header written above.
        # NOTE(review): assumes column 0 of the raw file is the paper ID — confirm.
        csv_writer.writerow([line[0], *vector])

Load the embedding model...
Get the number of papers to process...
Processing...


  0%|          | 0/79882 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Using a NumPy binary matrix

In [7]:
import numpy as np

# Define the shape of your data
num_vectors = total_papers
vector_length = 768

# Create the zero-filled .npy file directly on disk. open_memmap writes the
# .npy header and sizes the file without first materialising the whole
# (num_vectors, vector_length) matrix in RAM, which np.zeros + np.save did.
empty_data = np.lib.format.open_memmap(
    '../data/vectors/papers_vectors.npy',
    mode='w+',
    dtype=np.float64,  # same dtype np.zeros() uses by default
    shape=(num_vectors, vector_length),
)
empty_data.flush()
# Release the mapping so the file can be reopened for row updates
del empty_data

# Function to update a specific row in the .npy file
def update_npy_file(filename, row_idx, vector):
    """Overwrite one row of the array stored in an existing .npy file.

    The file is opened as a read/write memory map, so NumPy accounts for the
    .npy header and the stored dtype when locating the row. Seeking to
    ``row_idx * vector.nbytes`` and calling ``np.save`` there (the previous
    approach) ignored the header and wrote a second header plus payload into
    the middle of the file, corrupting it.

    Args:
        filename: Path to an existing .npy file.
        row_idx: Index of the row (first axis) to overwrite.
        vector: Array-like holding the new row values. It is flattened and
            cast to the dtype of the stored array on assignment.

    Raises:
        IndexError: If ``row_idx`` is outside the stored array.
        ValueError: If the flattened vector cannot be assigned to the row
            (for example, its length differs from the row length).
    """
    matrix = np.load(filename, mmap_mode='r+')
    try:
        # Flatten so a (1, N) embedding and an (N,) embedding both fit a row
        matrix[row_idx] = np.asarray(vector).reshape(-1)
        matrix.flush()
    finally:
        # Drop the mapping promptly; the file is reopened on every call
        del matrix

# Embed every paper and store its vector in the matching row of the .npy file.
# newline='' is what the csv module requires so that newlines embedded in
# quoted fields are parsed correctly.
with open('../data/raw/papers_raw.csv', 'r', encoding='utf-8', newline='') as reader:
    csv_reader = csv.reader(reader, delimiter='\t', quotechar='"')
    
    # Skip header of the raw papers CSV (the default avoids StopIteration on an empty file)
    next(csv_reader, None)

    # idx is the zero-based record number after the header; it doubles as the
    # row index in the vectors file.
    for idx, line in tqdm(enumerate(csv_reader), total=total_papers):
        # NOTE(review): columns 2 and 3 are joined with no separator between
        # them — confirm this is the intended input text for the model.
        text = line[2] + line[3]
        vector = embeddings.get_embedding(text, tokenizer, model)
        update_npy_file('../data/vectors/papers_vectors.npy', idx, vector)

  0%|          | 0/79882 [00:00<?, ?it/s]

KeyboardInterrupt: 