In [20]:
#imports for loading the doyle dataset
import numpy as np
import pandas as pd

In [21]:
# Open the .npz archive and pull out the two training arrays it stores.
data = np.load("model_2_doyle/doyle_training_sequences.npz")
X_train, y_train = data["X_train"], data["y_train"]

# The id -> word mapping lives in a JSON file; read it as a Series, then turn it into a plain dict.
id_to_word = pd.read_json("model_2_doyle/doyle_id_to_word.json", typ="series").to_dict()

# Turn every input sequence back into a space-separated sentence.
# Token id 0 is skipped (presumably the padding id -- confirm against the preprocessing step).
X_words = [
    " ".join(id_to_word[token_id] for token_id in sequence if token_id != 0)
    for sequence in X_train
]

# Each target is a single token id, so it maps to exactly one word.
y_words = [id_to_word[token_id] for token_id in y_train]

# One row per training example: decoded input text next to the word to predict.
df = pd.DataFrame({"input": X_words, "target": y_words})

In [22]:
# Inspect the raw token-id array loaded from the .npz archive (the bare last expression is what the cell displays).
X_train

array([[    2,     0,     0, ...,     0,     0,     0],
       [    2,   219,     0, ...,     0,     0,     0],
       [    2,   219, 13403, ...,     0,     0,     0],
       ...,
       [    2, 27985, 11377, ...,     0,     0,     0],
       [    2, 27985, 11377, ...,     0,     0,     0],
       [    2, 27985, 11377, ...,     0,     0,     0]],
      shape=(3672672, 20))

In [23]:
# Show the decoded frame with its "input" and "target" columns (the bare last expression is what the cell displays).
df

Unnamed: 0,input,target
0,<SOS>,the
1,<SOS> the,croxley
2,<SOS> the croxley,master
3,<SOS> the croxley master,<EOS>
4,<SOS>,mr
...,...,...
3672667,<SOS> and ever be your majesty most dutiful,and
3672668,<SOS> and ever be your majesty most dutiful and,obedient
3672669,<SOS> and ever be your majesty most dutiful an...,servant
3672670,<SOS> and ever be your majesty most dutiful an...,monmouth


In [24]:
# Keep only the examples whose target word is the <EOS> token.
df_eos = df.loc[df["target"].eq("<EOS>")]
print(f"{len(df_eos)} of {len(df)} rows ({len(df_eos) / len(df) * 100:.2f}%) have <EOS> as target")
# Preview the first few matching rows.
df_eos.head()

189298 of 3672672 rows (5.15%) have <EOS> as target


Unnamed: 0,input,target
3,<SOS> the croxley master,<EOS>
24,<SOS> mr robert montgomery be seat at his desk...,<EOS>
38,<SOS> before he be the open ledger with the lo...,<EOS>
73,<SOS> at his elbow lie the wooden tray with th...,<EOS>
82,<SOS> but his spirit be too low for work,<EOS>


In [25]:
# Keep only the examples whose input text starts with the <SOS> token.
starts_with_sos = df["input"].str.startswith("<SOS>")
df_sos = df.loc[starts_with_sos]
print(f"{len(df_sos)} of {len(df)} rows ({len(df_sos) / len(df) * 100:.2f}%) start with <SOS>")
# Preview the first few matching rows.
df_sos.head()

3672672 of 3672672 rows (100.00%) start with <SOS>


Unnamed: 0,input,target
0,<SOS>,the
1,<SOS> the,croxley
2,<SOS> the croxley,master
3,<SOS> the croxley master,<EOS>
4,<SOS>,mr


In [26]:
# Count the examples whose target word is the <UNK> token.
df_unk = df.loc[df["target"].eq("<UNK>")]
print(f"{len(df_unk)} of {len(df)} rows ({len(df_unk) / len(df) * 100:.2f}%) have <UNK> as target")

0 of 3672672 rows (0.00%) have <UNK> as target


In [27]:
# Find examples made of nothing but <SOS> followed by <EOS>:
# the input is exactly "<SOS>" and the target is "<EOS>".
df_sos_eos = df.loc[df["input"].eq("<SOS>") & df["target"].eq("<EOS>")]
print(f"{len(df_sos_eos)} of {len(df)} rows ({len(df_sos_eos) / len(df) * 100:.2f}%) have only <SOS> and <EOS>")
# Preview the first few matching rows.
df_sos_eos.head()

17 of 3672672 rows (0.00%) have only <SOS> and <EOS>


Unnamed: 0,input,target
81967,<SOS>,<EOS>
1211925,<SOS>,<EOS>
1234961,<SOS>,<EOS>
1694148,<SOS>,<EOS>
1694154,<SOS>,<EOS>
