In [1]:
import io

import numpy as np
import pandas as pd
import plotly.express as px
from numpy.random import RandomState

from src.extraction.jsonl_data_reader import JsonlDataReader
from src.preprocessing.simple_preprocessor import SimplePreprocessor

# Fixed seed shared by the dedicated RandomState and NumPy's global RNG.
seed = 7
random_state = RandomState(seed=seed)
np.random.seed(seed)

# Read the raw train and test splits from their JSONL files.
train_data = JsonlDataReader(file_name='train.jsonl').read()
test_data = JsonlDataReader(file_name='test.jsonl').read()

# A single preprocessor instance, configured once, is applied to both splits.
# NOTE(review): the sample text displayed further down still contains bracketed
# numeric citations such as "[6,11,16]", so remove_citations=True apparently
# does not strip that style -- confirm whether this is intended.
preprocessor = SimplePreprocessor(remove_citations=True, remove_duplicates=True)
preprocessed_train = preprocessor.preprocess(train_data)
preprocessed_test = preprocessor.preprocess(test_data)

In [20]:
import sentencepiece

# Train a SentencePiece model entirely in memory: the corpus is supplied through
# one BytesIO and the serialized model is written into another.
# The training texts are joined with newlines and UTF-8 encoded; presumably the
# trainer consumes the stream line by line, so a text containing an embedded
# newline would be seen as several sentences -- TODO confirm.
text_input = io.BytesIO(
    '\n'.join(preprocessed_train.texts).encode('utf-8')
)
tokenizer_model = io.BytesIO()

sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=text_input,
    model_writer=tokenizer_model,
    vocab_size=10000,
)


In [21]:
# Build the tokenizer from the serialized model bytes held in `tokenizer_model`.
sp = sentencepiece.SentencePieceProcessor(
    model_proto=tokenizer_model.getvalue(),
)


In [22]:
# Display the first preprocessed training text as a reference for the tokenization below.
preprocessed_train.texts[0]

'However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS , IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).'

In [24]:
# Tokenize the first three training texts; out_type=str returns the subword
# pieces as strings so the segmentation can be inspected directly.
sp.encode(
    preprocessed_train.texts[0:3],
    out_type=str,
)

[['▁However',
  ',',
  '▁how',
  '▁frataxin',
  '▁interacts',
  '▁with',
  '▁the',
  '▁Fe',
  '-',
  'S',
  '▁cluster',
  '▁biosynthesi',
  's',
  '▁components',
  '▁remain',
  's',
  '▁unclear',
  '▁as',
  '▁direct',
  '▁one',
  '-',
  'to',
  '-',
  'one',
  '▁interactions',
  '▁with',
  '▁each',
  '▁component',
  '▁were',
  '▁reported',
  '▁(',
  'Isc',
  'S',
  '▁,',
  '▁',
  'Isc',
  'U',
  '/',
  'I',
  's',
  'u',
  '1',
  '▁[6,',
  '11',
  ',',
  '1',
  '6]',
  '▁or',
  '▁I',
  'SD',
  '11',
  '/',
  'I',
  's',
  'd',
  '11',
  '▁[14,15]',
  ').'],
 ['▁In',
  '▁the',
  '▁study',
  '▁by',
  '▁,',
  '▁spike',
  's',
  '▁were',
  '▁sampled',
  '▁from',
  '▁the',
  '▁field',
  '▁at',
  '▁the',
  '▁point',
  '▁of',
  '▁physiological',
  '▁ro',
  'b',
  'in',
  'son',
  '▁et',
  '▁al',
  '.',
  ':',
  '▁genomic',
  '▁regions',
  '▁influen',
  'c',
  'ing',
  '▁root',
  '▁traits',
  '▁in',
  '▁',
  'bar',
  'ley',
  '▁11',
  '▁of',
  '▁13',
  '▁m',
  'at',
  'ur',
  'ity',
  ',',
  '