In [242]:
import pandas as pd
import string
import nltk
from tqdm import tqdm
# Register tqdm's pandas integration so Series/DataFrame gain progress_apply.
tqdm.pandas()
# Display settings: do not truncate cell text, rows, or columns when rendering frames.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the session.
# NOTE(review): this silences the warning rather than removing its cause; taking an
# explicit .copy() of filtered frames before adding columns would make it unnecessary.
pd.options.mode.chained_assignment = None

In [243]:
# Make sure the NLTK stopword corpus is available locally (downloads it if missing).
nltk.download('stopwords')
# English stopwords held in a set for fast membership tests in filter_stopwords.
nltk_stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/omar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [244]:
# Read the train split (storal_en_label_train.jsonl, one JSON record per line) into a pandas DataFrame.
# NOTE(review): the variables are named roc_stories_* but every path points at
# STORAL files (storal_en_label_*.jsonl) — confirm which dataset is intended.
roc_stories_train_path = "../data/moral_stories/storal_label/mo2st_data/storal_en/storal_en_label_train.jsonl"
roc_stories_train_df = pd.read_json(path_or_buf=roc_stories_train_path, lines=True)

# Read the validation split (storal_en_label_valid.jsonl) into a pandas DataFrame.
roc_stories_valid_path = "../data/moral_stories/storal_label/mo2st_data/storal_en/storal_en_label_valid.jsonl"
roc_stories_valid_df = pd.read_json(path_or_buf=roc_stories_valid_path, lines=True)

# Read the test split (storal_en_label_test.jsonl) into a pandas DataFrame.
roc_stories_test_path = "../data/moral_stories/storal_label/mo2st_data/storal_en/storal_en_label_test.jsonl"
roc_stories_test_df = pd.read_json(path_or_buf=roc_stories_test_path, lines=True)

In [245]:
# Stack the train/valid/test splits into one frame. ignore_index=True assigns a
# fresh 0..N-1 index; without it each split keeps its own row labels, so the
# same label can appear up to three times in the combined frame.
roc_stories_df = pd.concat(
    [roc_stories_train_df, roc_stories_valid_df, roc_stories_test_df],
    ignore_index=True,
)

In [246]:
# Build the full story text: the 'beginning' column, a single space, then the 'story' column.
roc_stories_df['text'] = roc_stories_df['beginning'] + ' ' + roc_stories_df['story']

In [247]:
# Keep only the combined text column. The explicit .copy() makes this an
# independent frame, so the columns added in later cells are written to it
# directly rather than to a slice of the concatenated frame.
roc_stories_df = roc_stories_df[['text']].copy()

In [248]:
def n_sentences(text):
    """Return the number of period-delimited segments in ``text``.

    This is a rough sentence count: it is the number of '.' characters plus
    one, so a text ending in '.' contributes a trailing empty segment, and
    '!' / '?' do not end a segment.
    """
    return text.count('.') + 1

In [249]:
# Per-story segment count from n_sentences, computed row by row with a tqdm progress bar.
roc_stories_df['n_sentences'] = roc_stories_df['text'].progress_apply(n_sentences)

100%|██████████| 1779/1779 [00:00<00:00, 225475.68it/s]


In [250]:
# Keep only short stories (n_sentences of at most 10). The boolean-mask selection
# is copied explicitly because later cells add columns to this frame; without
# .copy() those writes target a slice and only work because the
# chained-assignment warning is disabled.
roc_stories_df = roc_stories_df[roc_stories_df['n_sentences'] <= 10].copy()

In [251]:
# Number of stories left after the n_sentences <= 10 filter.
len(roc_stories_df)

602

In [252]:
# Set of ASCII punctuation characters to strip from text.
# string.punctuation contains no space, so no special-casing is needed.
punctuation = set(string.punctuation)

In [253]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``punctuation`` dropped.

    Characters are deleted, not replaced by a space, so "goat's" becomes
    "goats" and "a,b" becomes "ab".
    """
    return ''.join(char for char in text if char not in punctuation)

In [254]:
def filter_stopwords(bigram):
    """Return True when none of the whitespace-separated words is a stopword.

    Used as a keep-predicate: an n-gram is kept only if every one of its
    words is absent from ``nltk_stopwords``. An empty string yields True.
    """
    return all(token not in nltk_stopwords for token in bigram.split())

In [255]:
def extract_ngrams(text, n):
    """Return the set of stopword-free word n-grams in ``text``.

    The text is stripped of punctuation, lower-cased and split on whitespace;
    every run of ``n`` consecutive words is joined with a single space, and
    runs containing a stopword (per ``filter_stopwords``) are discarded.

    Args:
        text: The raw text (typically one sentence).
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A set of n-gram strings. Empty when ``text`` has fewer than ``n``
        words or every n-gram contains a stopword.

    Raises:
        ValueError: If ``n`` is less than 1. Previously n == 0 produced the
            set {""} (which makes any two sentences "overlap") and negative
            n produced arbitrary word runs.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # Punctuation is removed before lower-casing and splitting, as in the
    # original pipeline, so tokens like "goat's" become "goats".
    words = remove_punctuation(text).lower().split()
    candidates = (" ".join(words[i:i + n]) for i in range(len(words) - n + 1))
    return {gram for gram in candidates if filter_stopwords(gram)}

In [256]:
def extract_bigrams_story(story):
    """Return one bigram set per '.'-separated segment of ``story``.

    The result is a list aligned with ``story.split('.')``; segments with no
    stopword-free bigram (including a trailing empty segment) contribute an
    empty set.
    """
    return [extract_ngrams(sentence, 2) for sentence in story.split('.')]

In [257]:
def bigram_overlap(set1, set2):
    """Return True if the two sets have at least one element in common."""
    return not set1.isdisjoint(set2)

In [258]:
def get_overlaps(bigrams, d):
    """Return index pairs of sentences whose bigram sets intersect.

    Args:
        bigrams: List of bigram sets, one per sentence.
        d: Minimum index distance between the two sentences of a pair.

    Returns:
        A list of ``(i1, i2)`` tuples with ``i2 - i1 >= d`` for which
        ``bigram_overlap`` is true, ordered by ``i1`` and then ``i2``.
    """
    total = len(bigrams)
    return [
        (first, second)
        for first in range(total - d)
        for second in range(first + d, total)
        if bigram_overlap(bigrams[first], bigrams[second])
    ]

In [259]:
# Per-story list of bigram sets, one set per '.'-separated segment of the text.
# (Plain string key: the original f-string had no placeholders.)
roc_stories_df['bigrams'] = roc_stories_df['text'].progress_apply(extract_bigrams_story)

100%|██████████| 602/602 [00:00<00:00, 6821.63it/s]


In [260]:
# Index pairs of sentences at least 2 positions apart (i.e. non-adjacent) that
# share a bigram. (Plain string keys: the original f-strings had no placeholders.)
roc_stories_df['overlaps'] = roc_stories_df['bigrams'].progress_apply(
    lambda sentence_bigrams: get_overlaps(sentence_bigrams, 2)
)

100%|██████████| 602/602 [00:00<00:00, 99631.89it/s]


In [261]:
# Number of overlapping sentence pairs per story. len is passed directly instead
# of being wrapped in a lambda, and the placeholder-free f-strings are plain strings.
roc_stories_df['n_overlaps'] = roc_stories_df['overlaps'].progress_apply(len)

100%|██████████| 602/602 [00:00<00:00, 458169.30it/s]


In [262]:
# Keep only stories with at least one pair of non-adjacent sentences sharing a bigram.
# (Plain string key: the original f-string had no placeholders.)
pruned_ds = roc_stories_df[roc_stories_df['n_overlaps'] > 0]

In [263]:
# Number of stories that have at least one overlapping sentence pair.
len(pruned_ds)

90

In [264]:
# Preview the first rows of the pruned frame.
pruned_ds.head()

Unnamed: 0,text,n_sentences,bigrams,overlaps,n_overlaps
4,"Two travellers, walking in the noonday sun, sought the shade of a widespreading tree to rest. As they lay looking up among the pleasant leaves, they saw that it was a plane tree. ""How useless is the plane!"" Said one of them. ""It bears no fruit whatever, and only serves to litter the ground with leaves.""""Ungrateful creatures!"" Said a voice from the plane tree. ""You lie here in my cooling shade, and yet you say I am useless! thus ungratefully, o jupiter, do men receive their blessings!""",6,"[{noonday sun, widespreading tree, travellers walking, two travellers, sun sought}, {lay looking, plane tree, pleasant leaves}, {plane said, said one}, {fruit whatever}, {plane tree, creatures said, ungrateful creatures}, {useless thus, cooling shade, thus ungratefully, men receive}]","[(1, 4)]",1
10,"A fox fell into a well, and though it was not very deep, he found that he could not get out again. After he had been in the well a long time, a thirsty goat came by. The goat thought the fox had gone down to drink, and so he asked if the water was good. ""The finest in the whole country,"" Said the crafty fox, ""Jump in and try it. there is more than enough for both of us.""The thirsty goat immediately jumped in and began to drink. The fox just as quickly jumped on the goat's back and leaped from the tip of the goat's horns out of the wellthe foolish goat now saw what a plight he had got into, and begged the fox to help him out. But the fox was already on his way to the woods""If you had as much sense as you have beard, old fellow,"" He said as he ran, ""You would have been more cautious about finding a way to get out again before you jumped in.""",9,"[{fox fell}, {goat came, long time, thirsty goat}, {goat thought}, {whole country, crafty fox, fox jump, country said}, {}, {immediately jumped, goat immediately, thirsty goat}, {foolish goat, quickly jumped, goats back, wellthe foolish, goats horns}, {much sense, beard old, old fellow}, {}]","[(1, 5)]",1
23,"An ass was being driven along a road leading down the mountain side, when he suddenly took it into his silly head to choose his own path. He could see his stall at the foot of the mountain, and to him the quickest way down seemed to be over the edge of the nearest cliff. Just as he was about to leap over, his master caught him by the tail and tried to pull him back, but the stubborn ass would not yield and pulled with all his might. ""Very well,"" Said his master, ""Go your way, you willful beast, and see where it leads you.""With that he let go, and the foolish ass tumbled head over heels down the mountain side.",6,"[{road leading, driven along, mountain side, silly head, suddenly took}, {nearest cliff, quickest way, could see}, {ass would, stubborn ass, master caught}, {willful beast, well said, master go}, {let go, mountain side, foolish ass, ass tumbled, tumbled head}, {}]","[(0, 4)]",1
53,"There was once a countryman who possessed the most wonderful goose you can imagine, for every day when he visited the nest, the goose had laid a beautiful, glittering, golden egg. The countryman took the eggs to market and soon began to get rich. But it was not long before he grew impatient with the goose because she gave him only a single golden egg a day. He was not getting rich fast enough. Then one day, after he had finished counting his money, the idea came to him that he could get all the golden eggs at once by killing the goose and cutting it open. But when the deed was done, not a single golden egg did he find, and his precious goose was dead.",7,"[{every day, beautiful glittering, golden egg, glittering golden, wonderful goose}, {soon began, countryman took, get rich}, {golden egg, grew impatient, single golden}, {getting rich, fast enough, rich fast}, {one day, finished counting, golden eggs, could get, idea came}, {golden egg, precious goose, single golden}, {}]","[(0, 2), (0, 5), (2, 5)]",3
54,"One cold stormy day a goatherd drove his goats for shelter into a cave, where a number of wild goats had also found their way. The shepherd wanted to make the wild goats part of his flock; so he fed them well. But to his own flock, he gave only just enough food to keep them alive. When the weather cleared, and the shepherd led the goats out to feed, the wild goats scampered off to the hills. ""Is that the thanks I get for feeding you and treating you so well?"" Complained the shepherd. ""Do not expect us to join your flock,"" Replied one of the wild goats. ""We know how you would treat us later on, if some strangers should come as we did.""",8,"[{stormy day, wild goats, cold stormy, goatherd drove, also found, one cold}, {shepherd wanted, goats part, wild goats}, {enough food}, {goats scampered, weather cleared, wild goats, shepherd led}, {well complained}, {expect us, flock replied, wild goats, replied one}, {would treat, us later, treat us}, {}]","[(0, 3), (0, 5), (1, 3), (1, 5), (3, 5)]",5
