In [None]:
import pandas as pd

from accelerate import Accelerator

In [None]:
from config.config import CONFIG

In [None]:
from src.models.encoder import Encoder
from transformers import AutoTokenizer

# Tokenizer for the checkpoint named in CONFIG['model']; CONFIG['tokenizer_use_fast']
# is forwarded as the `use_fast` flag.
tokenizer = AutoTokenizer.from_pretrained(
    CONFIG['model'],
    use_fast=CONFIG['tokenizer_use_fast'],
)

# Project encoder built from the same CONFIG, then moved to CONFIG['device'].
# NOTE(review): the encoder is not explicitly switched to eval mode here --
# confirm query_embedding handles that if the encoder has dropout layers.
encoder = Encoder(CONFIG).to(
    CONFIG['device']
)

In [None]:
# Read the training and development splits from the CSV paths given in CONFIG.
train_df = pd.read_csv(filepath_or_buffer=CONFIG['train_path'])
dev_df = pd.read_csv(filepath_or_buffer=CONFIG['dev_path'])

In [None]:
from src.models.utils import query_embedding

# Embed every query string in both splits and store the result in a new
# 'query_embed' column. For each row, query_embedding is called as
# query_embedding(query, encoder, tokenizer, CONFIG).
train_df['query_embed'] = train_df['query'].apply(
    query_embedding,
    args=(encoder, tokenizer, CONFIG),
)
dev_df['query_embed'] = dev_df['query'].apply(
    query_embedding,
    args=(encoder, tokenizer, CONFIG),
)

In [None]:
import json

# Load the document embeddings from the JSON file at CONFIG['doc_embeds_path'].
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's locale default.
with open(CONFIG['doc_embeds_path'], encoding='utf-8') as f:
    doc_embeds = json.load(f)

In [None]:
from src.dataloaders.utils import get_train_val_dataloaders

# Build the training and validation dataloaders from the two query frames
# (which now carry the 'query_embed' column) and the loaded document embeddings.
train_dl, val_dl = get_train_val_dataloaders(
    CONFIG,
    train_df,
    dev_df,
    doc_embeds,
)

In [None]:
# Accelerate handle for the training run; the gradient accumulation step count
# is taken from CONFIG.
accelerator = Accelerator(
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
)

In [None]:
from src.training.trainner import Trainer
from src.models.dpr import DPRModel

# Instantiate the DPR model with its default arguments and move it to CONFIG['device'].
# NOTE(review): the model is placed on a device manually while an Accelerator is
# also handed to the Trainer -- confirm the Trainer does not expect to do the
# device placement itself.
model = DPRModel().to(
    CONFIG['device']
)

# The Trainer receives the model, the (train, validation) dataloader pair,
# the shared CONFIG and the accelerator.
trainer = Trainer(
    model,
    (train_dl, val_dl),
    CONFIG,
    accelerator,
)

In [None]:
# Run the training loop. Later cells read trainer.train_losses and
# trainer.val_losses, so this must finish before they are executed.
trainer.train()

In [None]:
# Collect the recorded training/validation losses into one table and persist it
# as CSV at CONFIG['losses_path'].
# The epoch index is derived from the number of recorded training losses rather
# than from CONFIG['epochs'], so a run that recorded fewer epochs than configured
# (e.g. stopped early) can still be saved instead of raising a length-mismatch
# ValueError after training has already finished.
losses_df = pd.DataFrame({
    'epoch': list(range(1, len(trainer.train_losses) + 1)),
    'train_loss': trainer.train_losses,
    'val_loss': trainer.val_losses,
})
losses_df.to_csv(CONFIG['losses_path'], index=False)

In [None]:
from matplotlib import pyplot as plt

# Plot the training and validation loss curves on shared, labelled axes.
fig, ax = plt.subplots()

# Number the x-axis from 1 so it agrees with the 1-based 'epoch' column used
# when the losses are saved to CSV. Each line carries its own label so the
# legend does not depend on the order of the plot calls.
ax.plot(range(1, len(trainer.train_losses) + 1), trainer.train_losses,
        color='red', label='Train')
ax.plot(range(1, len(trainer.val_losses) + 1), trainer.val_losses,
        color='orange', label='Validation')

ax.set_title('Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend(loc='upper right')

# Render the figure explicitly so the cell does not echo the Legend object's repr.
plt.show()