## Import packages

In [1]:
from collections import defaultdict
from copy import deepcopy
from tqdm import tqdm
from pathlib import Path
import random
import json
from sklearn.model_selection import train_test_split

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from matplotlib.cm import ScalarMappable
from matplotlib.lines import Line2D
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from textwrap import wrap

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="darkgrid")



## Read CSV, Split train-test

In [2]:
df = pd.read_csv('../data/otquad-v1.0 - Sheet1.csv')[['title', 'context',
                                                      'question', 'answer']]

print(len(df))
print(f"Number of Unique Contexts   : {df['context'].nunique()}")
print(f"Ratio of QA Pair-to-Context : {len(df)/df['context'].nunique()}")

df['context_length'] = df['context'].apply(len)
df['question_length'] = df['question'].apply(len)
df['answer_length'] = df['answer'].apply(len)


1097
Number of Unique Contexts   : 190
Ratio of QA Pair-to-Context : 5.773684210526316


In [3]:
# df = df.head(800)

# print(df.shape)

# print(f"Number of Unique Contexts   : {df['context'].nunique()}")
# print(f"Ratio of QA Pair-to-Context : {len(df)/df['context'].nunique()}")

# df['context_length'] = df['context'].apply(len)
# df['question_length'] = df['question'].apply(len)
# df['answer_length'] = df['answer'].apply(len)


In [4]:
# dff = df.sample(100, random_state=42)
# dff.to_csv('../data/sample_data.csv', index=False)

In [60]:
df_copy = deepcopy(df)
df_copy.drop_duplicates(subset='context', keep="last", inplace = True)
df_copy.reset_index(drop=True)

n = 19
test_fold_lists  = [df_copy.iloc[n*(ct-1):n*ct] for ct in range(1,11)]
train_fold_lists = []
for i in range(10):
  a_index = df_copy.set_index(df_copy.columns.tolist()).index
  b_index = test_fold_lists[i].set_index(df_copy.columns.tolist()).index
  mask = ~a_index.isin(b_index)
  result = df_copy.loc[mask]
  train_fold_lists.append(result)

# train_unique_context, test_unique_context = train_test_split(df_copy, 
#                                                              test_size=0.2,
#                                                              random_state=42)
train_list, test_list = [], []
for i in range(10):
  train = pd.merge(train_fold_lists[i], df, on="context")
  test = pd.merge(test_fold_lists[i], df, on="context")
  train_list.append(train)
  test_list.append(test)

# assert np.array_equal(sorted(train_unique_context['context'].values),
#                       sorted(np.unique(train['context'].values)))
# assert np.array_equal(sorted(test_unique_context['context'].values),
#                       sorted(np.unique(test['context'].values)))

In [61]:
for x in test_fold_lists:
  print(x.shape)

(19, 7)
(19, 7)
(19, 7)
(19, 7)
(19, 7)
(19, 7)
(19, 7)
(19, 7)
(19, 7)
(19, 7)


In [62]:
for x in train_fold_lists:
  print(x.shape)

(171, 7)
(171, 7)
(171, 7)
(171, 7)
(171, 7)
(171, 7)
(171, 7)
(171, 7)
(171, 7)
(171, 7)


In [63]:
# train_fold_lists[1]

In [64]:
# test_fold_lists[0]

In [65]:
# for i in range(len(train)):
#   for j in range(len(train)):
#     if i!=j:
#       if train['question_y'].values[i]==train['question_y'].values[j]:
#         print(train['question_y'].values[i])

In [78]:
def df_to_json(df, data_path, json_filename):
  start_index = [df['context'].values[i].lower().find(df['answer_y'].values[i].lower())
                 for i in range(len(df))]
  df['start_index_y'] = start_index

  for i in range(len(df)):
#       print(i, start_index[i])
#       print('Original : ',df['answer_y'].values[i].lower())
#       print('Extracted: ', df['context'].values[i].lower()[start_index[i]:start_index[i]+len(df['answer_y'].values[i])])
      assert df['answer_y'].values[i].lower() == df['context'].values[i].lower()[
        start_index[i]:start_index[i]+len(df['answer_y'].values[i])]

  print(f'Unique Contexts: {len(np.unique(df.context.values))}\n Ratio of QA Pair-to-Contexts: {len(df)/len(np.unique(df.context.values))}')

  squad_entries= []
  for i in tqdm(range(len(df))):
    id_num      = i
    context     = df['context'].values[i]
    question    = df['question_y'].values[i]
    answer_text = df['answer_y'].values[i]
    start       = df['start_index_y'].values[i]
    title       = df['title_y'].values[i]

    if start!=-1:
      answer={"text":answer_text,"answer_start":int(start)}
    else:
      continue
    new_entry={
               "qas": [
                 {
                  "id": int(id_num),
                  "question": question,
                  "answers": [answer]
                 }
                 ],
               "context": context
               }
    squad_entries.append({"title":title,
                          "paragraphs":[new_entry]})
        

  squad_json={"data": squad_entries,
            "version":1.0}

  with open(data_path/json_filename, "w") as writer:
      writer.write(json.dumps(squad_json, indent=4) + "\n")



In [80]:
# df_to_json(df, data_path, f'otquad.json')


In [71]:
data_path = Path('/net/kdinxidk03/opt/NFS/75y/data/qa/dataset_pos/otquad/')
# data_path = Path('../data/')
for i in range(10):
  print(i)
#   print(len(set(train_list[i+1]['context'].values.tolist()).intersection(
#     set(test_list[i]['context'].values.tolist()))))
  df_to_json(train_list[i], data_path, f'train_otquad_{i+1}.json')
  df_to_json(test_list[i], data_path, f'test_otquad_{i+1}.json')

100%|██████████| 880/880 [00:00<00:00, 38022.41it/s]
100%|██████████| 217/217 [00:00<00:00, 50841.47it/s]
100%|██████████| 998/998 [00:00<00:00, 66200.37it/s]

0
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 5.146198830409356
Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 11.421052631578947
1
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 5.83625730994152



100%|██████████| 99/99 [00:00<00:00, 64000.63it/s]
100%|██████████| 912/912 [00:00<00:00, 80132.50it/s]
100%|██████████| 185/185 [00:00<00:00, 74739.57it/s]
100%|██████████| 958/958 [00:00<00:00, 87174.70it/s]
100%|██████████| 139/139 [00:00<00:00, 70496.77it/s]

Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 5.2105263157894735
2
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 5.333333333333333
Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 9.736842105263158
3
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 5.60233918128655
Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 7.315789473684211



100%|██████████| 997/997 [00:00<00:00, 86517.17it/s]
100%|██████████| 100/100 [00:00<00:00, 56058.59it/s]
100%|██████████| 1035/1035 [00:00<00:00, 61569.84it/s]
100%|██████████| 62/62 [00:00<00:00, 55048.02it/s]

4
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 5.830409356725146
Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 5.2631578947368425
5
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 6.052631578947368
Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 3.263157894736842



100%|██████████| 992/992 [00:00<00:00, 84497.67it/s]
100%|██████████| 105/105 [00:00<00:00, 62247.62it/s]
100%|██████████| 983/983 [00:00<00:00, 70891.88it/s]
100%|██████████| 114/114 [00:00<00:00, 70617.44it/s]

6
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 5.8011695906432745
Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 5.526315789473684
7
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 5.748538011695906
Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 6.0



100%|██████████| 1059/1059 [00:00<00:00, 85796.45it/s]
100%|██████████| 38/38 [00:00<00:00, 57852.47it/s]
100%|██████████| 1059/1059 [00:00<00:00, 13408.14it/s]

8
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 6.192982456140351
Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 2.0
9
Unique Contexts: 171
 Ratio of QA Pair-to-Contexts: 6.192982456140351



100%|██████████| 38/38 [00:00<00:00, 48137.59it/s]

Unique Contexts: 19
 Ratio of QA Pair-to-Contexts: 2.0





In [72]:
for i in range(10):
  print(test_list[i].shape)

(217, 14)
(99, 14)
(185, 14)
(139, 14)
(100, 14)
(62, 14)
(105, 14)
(114, 14)
(38, 14)
(38, 14)


In [None]:
# print('Train total length: ', len(train))
# print('Test total length: ', len(test))


## Plot q-type frequency

In [None]:
a = json.load(open('/net/kdinxidk03/opt/NFS/75y/data/qa/dataset_pos/squad/train_squad.json'))

questions = []
contexts  = []
answers   = []
idxs      = []
for d in a['data']:
  for p in d['paragraphs']:
    for qa in p['qas']:
      idxs.append(qa['id'])
      questions.append(qa['question'])
      answers.append(qa['answers'][0]['text'])
      contexts.append(p['context'])
    
print(len(contexts))
print(len(np.unique(contexts)))

In [None]:
df_squad = pd.DataFrame()
df_squad['question'] = questions
df_squad['answer'] = answers
df_squad['context'] = contexts

df_squad['context_length'] = df_squad['context'].apply(len)
df_squad['question_length'] = df_squad['question'].apply(len)
df_squad['answer_length'] = df_squad['answer'].apply(len)

In [None]:
def get_qtype_freq(df):
  q_list = df['question'].values.tolist()
  q_type = ['what', 'when', 'where', 'who', 'whom', 
            'which', 'why', 'how'] #, 'name', 'whose', 
  
  q_freq = {}
  for ques in q_list:
    q_tokens = ques.lower().split()
    for typ in q_type:
      if typ in q_tokens[0]:
        q_freq.setdefault(typ, []).append(ques)

  freq_qtype = [len(x) for x in list(q_freq.values())]
  print(freq_qtype, sum(freq_qtype))
  
  df_sorted = pd.DataFrame()
  df_sorted['QTYPE'] = list(q_freq.keys())
  df_sorted['QFREQ'] = freq_qtype
  df_sorted['n'] = freq_qtype
  df_sorted = df_sorted.sort_values("QFREQ", ascending=False)

  print(df_sorted.shape)
  
  return df_sorted
  

In [None]:
def plot_qtype_frequency(df_sorted, dataset_name):
  
  # Values for the x axis
  ANGLES = np.linspace(0.05, 2 * np.pi - 0.05, len(df_sorted), endpoint=False)

  # Cumulative length
  QFREQ = df_sorted["QFREQ"].values

  # # Mean gain length
  # MEAN_GAIN = df_sorted["mean_gain"].values

  # Region label
  QTYPE = df_sorted["QTYPE"].values

  # Number of tracks per region
  N = df_sorted["n"].values

  GREY12 = "#1f1f1f"

  # Set default font color to GREY12
  plt.rcParams["text.color"] = GREY12

  # The minus glyph is not available in Bell MT
  # This disables it, and uses a hyphen
  plt.rc("axes", unicode_minus=False)

  # Colors
  COLORS = ["#6C5B7B","#C06C84","#F67280","#F8B195"]

  # Colormap
  cmap = mpl.colors.LinearSegmentedColormap.from_list("my color", COLORS, N=256)

  # Normalizer
  norm = mpl.colors.Normalize(vmin=N.min(), vmax=N.max())

  # Normalized colors. Each number of tracks is mapped to a color in the 
  # color scale 'cmap'
  COLORS = cmap(norm(N))

  # Some layout stuff ----------------------------------------------
  # Initialize layout in polar coordinates
  fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={"projection": "polar"})

  # Set background color to white, both axis and figure.
  # fig.patch.set_facecolor("white")
  # ax.set_facecolor("white")

  ax.set_theta_offset(100 * np.pi / 3)
  ax.set_ylim(-100, 350)

  # Add geometries to the plot -------------------------------------
  # See the zorder to manipulate which geometries are on top

  # Add bars to represent the cumulative track lengths
  ax.bar(ANGLES, QFREQ, color=COLORS, alpha=1, width=0.6, zorder=11) #, ls=(0, (4, 4))

  # Add dashed vertical lines. These are just references
  ax.vlines(ANGLES, 0, 350, alpha=0.5, color=GREY12, zorder=11)

  # # Add dots to represent the mean gain
  # ax.scatter(ANGLES, MEAN_GAIN, s=60, color=GREY12, zorder=11)

  # Add labels for the regions -------------------------------------
  # Note the 'wrap()' function.
  # The '5' means we want at most 5 consecutive letters in a word, 
  # but the 'break_long_words' means we don't want to break words 
  # longer than 5 characters.
  QTYPE = ["\n".join(wrap(r, 5, break_long_words=False)) for r in QTYPE]

  # Set the labels
  ax.set_xticks(ANGLES)
  ax.set_xticklabels(QTYPE, size=15);

  plt.show()
  
  sns.set(style="whitegrid")
  colors = sns.color_palette('pastel')[0:len(df_sorted)]

  plt.subplots(figsize=(8, 15))
  explode = (0, 0.02, 0.03, 0.05, 0.1, 0.15, 0.4, 0.6)
  plt.pie(df_sorted['QFREQ'].values, labels=df_sorted['QTYPE'].values,
          explode=explode, colors = colors, autopct='%1.1f%%', shadow=False,
          textprops={'fontsize': 14})
  plt.xticks(size=25)
  plt.title(f"Distribution of 'type of questions'\nin {dataset_name}", size=15)
  plt.show()


In [None]:
def density_plot_data(df, col_name_list, which_attr, dataset_name, xlimit, figsize=(6,4)):
  plt.figure(figsize=figsize)
  for col_name in col_name_list:
    print(len(df[col_name]))
    fig = sns.kdeplot(df[col_name], shade=True, label = col_name)
  plt.xlim(-100, xlimit)
  plt.xticks(size=14)
  plt.yticks(size=14)
  plt.xlabel('Length', size=15)
  plt.ylabel('Density\n', size=15)
  plt.legend(loc='best', fontsize=14)
  plt.title(f"Distribution of '{which_attr}'\nin {dataset_name}", size=15)
  plt.show()

In [None]:
df_sorted = get_qtype_freq(df_squad)
plot_qtype_frequency(df_sorted, 'SQuAD')

In [None]:
density_plot_data(df, ['context_length'], 'context length', 'OTQuAD', 4500)
density_plot_data(df, ['question_length', 'answer_length'], 'question and answer length', 'OTQuAD', 400, figsize=(6,4))
