In [125]:
import pandas as pd
import string
import re
from tqdm import tqdm
# Register tqdm's pandas integration so that `progress_apply` is available.
tqdm.pandas()
# Display options: never truncate cell contents, rows, or columns when rendering frames.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None

In [126]:
# Load the ROCStories corpus: comma-separated, column names taken from the first row.
# NOTE(review): the loaded frame shows an 'Unnamed: 0' column, which may be a saved
# index from an earlier export — confirm whether `index_col=0` is wanted here.
roc_stories_path = "../data/rocstories/ROCStories.csv"
roc_stories_df = pd.read_csv(
    roc_stories_path,
    header=0,
    sep=',',
)

In [127]:
# Preview the first rows to check which columns were loaded.
roc_stories_df.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,9a51198e-96f1-42c3-b09d-a3e1e067d803,Overweight Kid,Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a change.,They got themselves and Dan on a diet.
1,617e7ada-3878-488d-bd56-40695b91f053,The Bike Accident,Carrie had just learned how to ride a bike.,She didn't have a bike of her own.,Carrie would sneak rides on her sister's bike.,She got nervous on a hill and crashed into a wall.,The bike frame bent and Carrie got a deep gash on her leg.
2,79b0da1f-e460-4173-ba58-8c9e2553c53a,Beach,Morgan enjoyed long walks on the beach.,She and her boyfriend decided to go for a long walk.,"After walking for over a mile, something happened.",Morgan decided to propose to her boyfriend.,Her boyfriend was upset he didn't propose to her first.
3,d173b7de-4611-4cdf-934c-912834755e41,The bad customer.,Jane was working at a diner.,"Suddenly, a customer barged up to the counter.",He began yelling about how long his food was taking.,Jane didn't know how to react.,"Luckily, her coworker intervened and calmed the man down."
4,af0fd5a4-de36-47ba-8aa2-e99d10986d7a,Being Patient,I was talking to my crush today.,She continued to complain about guys flirting with her.,I decided to agree with what she says and listened to her patiently.,"After I got home, I got a text from her.",She asked if we can hang out tomorrow.


In [128]:
# Set of characters treated as punctuation (for O(1) membership tests).
# `string.punctuation` contains no space character, so no filtering is needed.
punctuation = set(string.punctuation)

In [129]:
def remove_punctuation(text):
    """Return `text` with every character found in `punctuation` removed."""
    kept_chars = [ch for ch in text if ch not in punctuation]
    return ''.join(kept_chars)

In [130]:
def extract_bigrams(text):
    """Return the set of distinct word bigrams found in `text`.

    Punctuation is stripped with `remove_punctuation`, the result is
    lower-cased and split on whitespace, and every pair of adjacent words is
    joined with a single space.

    Args:
        text: Sentence to process.

    Returns:
        set[str]: Unique bigrams such as ``"decided to"``; empty when the
        text contains fewer than two words.
    """
    words = remove_punctuation(text).lower().split()
    # Pair each word with its successor; zip stops at the shorter sequence,
    # so inputs with fewer than two words produce no bigrams.
    return {f"{first} {second}" for first, second in zip(words, words[1:])}

In [131]:
# Add a `bigrams_<n>` column holding the bigram set of each `sentence<n>` (n = 1..5).
for n in range(1, 6):
    sentence_col, bigram_col = f'sentence{n}', f'bigrams_{n}'
    roc_stories_df[bigram_col] = roc_stories_df[sentence_col].progress_apply(extract_bigrams)

100%|██████████| 98161/98161 [00:00<00:00, 128541.20it/s]
100%|██████████| 98161/98161 [00:00<00:00, 119974.95it/s]
100%|██████████| 98161/98161 [00:00<00:00, 122658.63it/s]
100%|██████████| 98161/98161 [00:00<00:00, 121032.10it/s]
100%|██████████| 98161/98161 [00:00<00:00, 113602.54it/s]


In [132]:
def bigram_overlap(set1, set2):
    """Report whether two bigram sets have at least one bigram in common.

    Args:
        set1: Set of bigrams.
        set2: Collection of bigrams to compare against `set1`.

    Returns:
        bool: True if any element appears in both, False otherwise
        (including when either one is empty).
    """
    # isdisjoint stops at the first shared element instead of materialising
    # the full intersection just to test it for emptiness.
    return not set1.isdisjoint(set2)

In [133]:
# (source, target) sentence-number pairs whose bigram sets are tested for overlap.
overlapping_sentences = [(1,3), (1,4), (2,4), (3,5)]

In [134]:
# For each (src, tgt) pair, add a boolean `overlap_<src>_<tgt>` column that is
# True when the two sentences of a story share at least one bigram.
for src, tgt in overlapping_sentences:
    src_bigrams = roc_stories_df[f'bigrams_{src}']
    tgt_bigrams = roc_stories_df[f'bigrams_{tgt}']
    # Walk the two columns in lockstep rather than using a row-wise
    # DataFrame.apply(axis=1), which constructs a Series object for every row.
    roc_stories_df[f'overlap_{src}_{tgt}'] = [
        bigram_overlap(src_set, tgt_set)
        for src_set, tgt_set in tqdm(zip(src_bigrams, tgt_bigrams), total=len(roc_stories_df))
    ]

100%|██████████| 98161/98161 [00:01<00:00, 83242.89it/s]
100%|██████████| 98161/98161 [00:05<00:00, 16753.98it/s]
100%|██████████| 98161/98161 [00:01<00:00, 81317.80it/s]
100%|██████████| 98161/98161 [00:01<00:00, 83061.92it/s]


In [135]:
# Keep stories where sentence 1 shares a bigram with sentence 3 or 4, or
# sentence 2 shares one with sentence 4.
# NOTE(review): `overlap_3_5` is computed but not part of this filter — confirm intended.
overlap_flags = roc_stories_df[['overlap_1_3', 'overlap_1_4', 'overlap_2_4']]
pruned_ds = roc_stories_df[overlap_flags.any(axis=1)]

In [136]:
# Number of stories retained in `pruned_ds`.
len(pruned_ds)

25067

In [137]:
# Keep stories whose sentence 4 shares a bigram with sentence 1 or sentence 2.
shares_with_sentence_4 = roc_stories_df['overlap_1_4'] | roc_stories_df['overlap_2_4']
sentence_4_pruned_ds = roc_stories_df[shares_with_sentence_4]

In [138]:
# Number of stories retained in `sentence_4_pruned_ds`.
len(sentence_4_pruned_ds)

18643

In [139]:
# Preview the first retained stories together with their bigram sets and overlap flags.
pruned_ds.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5,bigrams_1,bigrams_2,bigrams_3,bigrams_4,bigrams_5,overlap_1_3,overlap_1_4,overlap_2_4,overlap_3_5
2,79b0da1f-e460-4173-ba58-8c9e2553c53a,Beach,Morgan enjoyed long walks on the beach.,She and her boyfriend decided to go for a long walk.,"After walking for over a mile, something happened.",Morgan decided to propose to her boyfriend.,Her boyfriend was upset he didn't propose to her first.,"{the beach, enjoyed long, on the, morgan enjoyed, walks on, long walks}","{decided to, go for, and her, a long, to go, long walk, her boyfriend, boyfriend decided, she and, for a}","{over a, after walking, walking for, mile something, something happened, a mile, for over}","{to her, decided to, morgan decided, her boyfriend, propose to, to propose}","{to her, didnt propose, her first, was upset, her boyfriend, propose to, upset he, he didnt, boyfriend was}",False,False,True,False
7,a4a9aaca-d3d4-46b4-807c-ef75aea68c56,Too sunny for Sunny,Sunny enjoyed going to the beach.,"As she stepped out of her car, she realized she forgot something.",It was quite sunny and she forgot her sunglasses.,Sunny got back into her car and heading towards the mall.,Sunny found some sunglasses and headed back to the beach.,"{sunny enjoyed, the beach, to the, going to, enjoyed going}","{of her, realized she, she forgot, car she, stepped out, she stepped, her car, as she, forgot something, out of, she realized}","{was quite, quite sunny, she forgot, sunny and, and she, it was, forgot her, her sunglasses}","{into her, heading towards, the mall, sunny got, got back, back into, and heading, towards the, her car, car and}","{the beach, headed back, found some, and headed, sunny found, to the, back to, some sunglasses, sunglasses and}",False,False,True,False
17,25dfd390-d56a-4dca-82bd-508c6ba6116d,Not Quite A Fairytale,A die hard shopper was waiting in the long line outside.,It was miserably cold.,The shopper saw a homeless man shivering in the alleyway.,He gave up his place in the line and brought a gift back from his car.,The shopper gave the homeless man a nice warm blanket.,"{hard shopper, waiting in, the long, long line, shopper was, die hard, line outside, was waiting, in the, a die}","{was miserably, it was, miserably cold}","{shivering in, a homeless, shopper saw, the alleyway, homeless man, man shivering, the shopper, saw a, in the}","{his car, from his, he gave, and brought, place in, gift back, back from, in the, up his, gave up, a gift, line and, brought a, his place, the line}","{shopper gave, gave the, a nice, the homeless, homeless man, the shopper, warm blanket, nice warm, man a}",True,True,False,True
18,0a9e9018-c1dc-485d-adbe-c1e18f5e022f,Board game night,Jeff invited his friends over to play board games on Saturday night.,They arrived at his house early that evening.,The six of them sat around a big table.,They took turns deciding which game to play.,They spent six hours playing different board games.,"{invited his, over to, saturday night, jeff invited, play board, to play, on saturday, board games, games on, his friends, friends over}","{early that, at his, arrived at, that evening, his house, they arrived, house early}","{six of, them sat, sat around, around a, the six, a big, big table, of them}","{took turns, to play, deciding which, game to, turns deciding, which game, they took}","{they spent, spent six, hours playing, board games, playing different, different board, six hours}",False,True,False,False
24,daed2b60-8579-4de5-b608-c8bdd1529692,Jeff moving,Jeff wanted to move out of his house.,He had no money to pay for a new one.,One day he bought a scratching ticket.,He won enough money for a down payment.,Jeff ended up moving to a new house.,"{to move, of his, move out, jeff wanted, his house, wanted to, out of}","{a new, pay for, for a, new one, no money, money to, had no, he had, to pay}","{he bought, bought a, one day, a scratching, scratching ticket, day he}","{he won, money for, enough money, down payment, a down, won enough, for a}","{up moving, a new, jeff ended, new house, to a, moving to, ended up}",False,False,True,False
