In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from tqdm import tqdm

from transformers import CamembertForQuestionAnswering, CamembertTokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Loading the data

In [2]:
train_df = pd.read_json("../data/train.json")
valid_df = pd.read_json("../data/valid.json")

In [3]:
model = CamembertForQuestionAnswering.from_pretrained("illuin/camembert-base-fquad", return_dict = True)
tokenizer = CamembertTokenizer.from_pretrained("illuin/camembert-base-fquad")

In [4]:
def create_context(df):
    contexts = []
    for i in range(len(df)):
        for j in range(len(df.iloc[i]['data']['paragraphs'])):
            contexts.append(df.iloc[i]['data']['paragraphs'][j]['context'])
            
    return contexts

# Create a list of documents
train_contexts = create_context(train_df)
valid_contexts = create_context(valid_df)

In [5]:
def create_qa(df):
    questions, answers = [], []
    id_to_ans = {}
    for i in range(len(df)):
        for j in range(len(df.iloc[i]['data']['paragraphs'])):
            for k in range(len(df.iloc[i]['data']['paragraphs'][j]['qas'])):
                questions.append(df.iloc[i]['data']['paragraphs'][j]['qas'][k]['question'])
                answers.append((i,j))
            id_to_ans[(i,j)] = len(id_to_ans)
            
    return questions, answers, id_to_ans

train_q, train_a, train_id_to_ans = create_qa(train_df)
valid_q, valid_a, valid_id_to_ans = create_qa(valid_df)

In [6]:
inputs = tokenizer(train_q[0], train_contexts[0], return_tensors='pt')
outputs = model(**inputs)

In [7]:
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [21]:
start_scores

tensor([[-5.1259, -5.8077, -5.0375, -5.2039, -5.1382, -5.1136, -5.1713, -5.0823,
         -5.5343, -5.1002, -5.1096, -5.0973, -5.0348, -5.3269, -5.0947, -5.0843,
         -5.0613, -5.1262, -5.0773, -5.1468, -5.1316, -5.0586, -5.1459, -5.1058,
         -5.1489, -5.0493, -5.0796, -4.3796, -5.4184, -4.9754, -4.9541, -5.0099,
         -4.5929, -4.7575, -5.0295, -4.9471, -5.1131, -5.0331, -5.1148, -5.0050,
         -5.2137, -5.1609, -5.0595, -5.1975, -5.2424, -5.0931, -4.9423, -5.0292,
         -4.9916, -5.0120, -5.1879, -4.6993,  6.2430, -3.5208, -5.3572, -3.6778,
          0.6845, -3.9091, -4.0111, -5.9940, -6.1293, -4.8997, -4.9923, -5.0891,
         -5.0626, -5.0536, -4.9941, -4.9502, -5.0293, -4.8972, -5.0754, -5.0710,
         -5.0961, -5.1919, -5.2464, -5.0446, -5.0639, -5.0774, -5.0891, -5.0658,
         -5.0487, -4.8058, -5.0543, -5.0040, -5.1911, -5.0978, -5.0203, -5.1460,
         -5.1644, -5.2201, -5.0333, -5.0953, -5.0949, -5.4429, -4.9164, -4.8618,
         -5.0500, -5.0580, -

In [22]:
end_scores

tensor([[-5.4799, -4.3274, -5.5605, -5.4238, -5.4702, -5.5019, -5.4590, -5.5253,
         -5.0705, -5.5151, -5.5045, -5.5173, -5.5660, -5.3068, -5.5209, -5.5218,
         -5.5457, -5.4945, -5.5321, -5.4736, -5.4853, -5.5301, -5.4734, -5.4974,
         -5.4690, -5.5513, -5.5254, -5.9392, -5.2123, -5.5980, -5.6238, -5.5823,
         -5.8408, -5.7297, -5.5585, -5.6334, -5.4911, -5.5595, -5.4949, -5.5758,
         -5.4033, -5.4541, -5.5054, -5.4253, -5.3456, -5.5043, -5.6256, -5.5698,
         -5.5895, -5.5656, -5.4197, -5.7629,  0.9700, -5.7229, -2.8482, -5.3262,
          6.1737, -6.0650, -6.0564, -2.1825, -3.1971, -5.6617, -5.5950, -5.5251,
         -5.5421, -5.5458, -5.5864, -5.6193, -5.5573, -5.6554, -5.5210, -5.5247,
         -5.5101, -5.4168, -5.3878, -5.5588, -5.5398, -5.5364, -5.5256, -5.5453,
         -5.5533, -5.7167, -5.5457, -5.5847, -5.4174, -5.5187, -5.5762, -5.4745,
         -5.4612, -5.4145, -5.5670, -5.5201, -5.5191, -5.1960, -5.6496, -5.6847,
         -5.5497, -5.5444, -

In [11]:
np.argmax(list(start_scores[0]))

52

In [12]:
np.argmax(list(end_scores[0]))

56

In [18]:
tokenizer.decode(list(inputs['input_ids'][0])[52:56])

'Johann Elert Bod'

In [20]:
start_scores[0,52], end_scores[0,56]

(tensor(6.2430, grad_fn=<SelectBackward>),
 tensor(6.1737, grad_fn=<SelectBackward>))