In [1]:
import os
from pathlib import Path
import pickle
from tqdm.notebook import tqdm
import re
from rouge import Rouge 
import pandas as pd

In [2]:
# Directory that holds the raw story files (input) and directory for pickled
# artefacts (output). Both are relative paths and keep their trailing slash.
data_path = "data/cnn/stories/"
pickle_path = "pickle/"

In [3]:
# Check the raw format of one story: read the first entry that os.listdir
# returns for the data directory and print its full text.
file_name = os.listdir(data_path)[0]
story = Path(data_path, file_name).read_text()

print(story)

At the start of a big week for the Higgs boson, the most sought-after particle in all of physics, scientists in Illinois said Monday that they had crept closer to proving that the particle exists but had been unable to reach a definitive conclusion.

The scientists outlined their final analysis based on more than 10 years of research and 500 trillion particle collisions using the U.S. Department of Energy's Fermilab Tevatron collider near Batavia, Illinois, whose budgetary woes shut it down last year.

What is the Higgs boson and why is it important?

Their announcement came two days before researchers at the Large Hadron Collider under the Alps are due to unveil their latest results at an eagerly awaited seminar at the CERN particle physics laboratory in Geneva, Switzerland.

"Our data strongly point toward the existence of the Higgs boson," Rob Roser, a spokesman for one of two independent experiments at the Tevatron, said in a statement. "But it will take results from the experiment

In [4]:
def split_story(story):
    """Split the raw text of one story file into article text and highlights.

    Everything before the first ``@highlight`` marker is the article; every
    segment after a marker is one highlight (whitespace-stripped).

    Args:
        story: Full text of a story file.

    Returns:
        A dict ``{'story': <article text>, 'highlights': <list of str>}``.
        When the text contains no ``@highlight`` marker, the whole text is
        returned as the article and the highlight list is empty.
    """
    marker = "@highlight"
    article, found, remainder = story.partition(marker)
    if not found:
        # str.find would return -1 here, which previously chopped the last
        # character off the article and produced a bogus highlight slice.
        return {'story': story, 'highlights': []}
    highlights = [segment.strip() for segment in remainder.split(marker)]
    return {'story': article, 'highlights': highlights}
    
def get_stories_highlights_data(path):
    """Read every story file under ``path`` and split it into story/highlights.

    Args:
        path: Directory containing one story per file.

    Returns:
        A list with one ``split_story`` result (a dict with ``'story'`` and
        ``'highlights'``) per file, ordered by file name.
    """
    stories_highlights = []
    # os.listdir returns names in arbitrary order; sorting makes the position
    # of each story in the returned list reproducible between runs.
    for name in tqdm(sorted(os.listdir(path))):
        # Decode explicitly as UTF-8 instead of the platform default encoding.
        text = Path(os.path.join(path, name)).read_text(encoding='utf-8')
        stories_highlights.append(split_story(text))
    return stories_highlights

# Load and split every story file found under data_path.
stories = get_stories_highlights_data(data_path)

  0%|          | 0/92579 [00:00<?, ?it/s]

In [5]:
# Report how many stories were loaded, then display the first parsed record.
print('There are {} stories:'.format(len(stories)))
stories[0]

There are 92579 stories:


{'story': 'At the start of a big week for the Higgs boson, the most sought-after particle in all of physics, scientists in Illinois said Monday that they had crept closer to proving that the particle exists but had been unable to reach a definitive conclusion.\n\nThe scientists outlined their final analysis based on more than 10 years of research and 500 trillion particle collisions using the U.S. Department of Energy\'s Fermilab Tevatron collider near Batavia, Illinois, whose budgetary woes shut it down last year.\n\nWhat is the Higgs boson and why is it important?\n\nTheir announcement came two days before researchers at the Large Hadron Collider under the Alps are due to unveil their latest results at an eagerly awaited seminar at the CERN particle physics laboratory in Geneva, Switzerland.\n\n"Our data strongly point toward the existence of the Higgs boson," Rob Roser, a spokesman for one of two independent experiments at the Tevatron, said in a statement. "But it will take results

In [6]:
# Store the split stories in a pickle file.
# Create the output directory first so a fresh checkout does not fail in open().
os.makedirs(pickle_path, exist_ok=True)
with open(os.path.join(pickle_path, "stories.pkl"), 'wb') as file:
    pickle.dump(stories, file)

In [7]:
# Characters removed from every line: ASCII punctuation, tab, newline and digits.
# Written as a raw string with '-', '[', ']' and the backslash escaped, so each
# one is an explicit literal instead of an accidental range or escape.
_STRIP_CHARS = re.compile(r'[!"#$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~\t\n\'\d]')


def _clean_lines(lines):
    """Return the cleaned, non-empty versions of ``lines`` in their original order."""
    cleaned = []
    for line in lines:
        # Drop the '(CNN)' tag before lower-casing, then strip punctuation/digits.
        line = _STRIP_CHARS.sub('', line.replace('(CNN)', '').lower())
        if line:
            cleaned.append(line)
    return cleaned


def preprocess_story(story):
    """Pre-process one story: remove punctuation and digits, convert to lower case.

    The article text is split on newlines and each line is cleaned; each
    highlight is cleaned the same way. Lines that end up empty are dropped.

    Args:
        story: Dict with ``'story'`` (a single string) and ``'highlights'``
            (a list of strings).

    Returns:
        A new dict with the same keys where ``'story'`` and ``'highlights'``
        are lists of cleaned strings. The input dict is left unmodified, so
        the function can safely be called again on the same raw data.
    """
    result = dict(story)
    result['story'] = _clean_lines(story['story'].split('\n'))
    result['highlights'] = _clean_lines(story['highlights'])
    return result

def preprocess_stories(stories):
    """Run ``preprocess_story`` over every story, showing a progress bar.

    Returns the per-story results as a list, in input order.
    """
    return [preprocess_story(item) for item in tqdm(stories)]


In [8]:
def get_top_five_highlights(story):
    """Return the five story sentences that best match the highlights.

    Each sentence is scored against every highlight with the ROUGE-1 F-score
    and keeps its best score; the five highest-scoring distinct sentences
    are returned.

    Args:
        story: Dict with ``'story'`` and ``'highlights'``, each a list of
            strings.

    Returns:
        A tuple ``(sentences, scores)`` of two parallel lists, best first,
        each holding at most five entries. Both are empty when the story has
        no sentences or no highlights.
    """
    # Only the ROUGE-1 F-score is read below, so do not compute the others.
    rouge = Rouge(metrics=['rouge-1'])

    # Best score per distinct sentence. Ranking raw (sentence, highlight)
    # pairs would let one sentence occupy several of the five slots.
    best_scores = {}
    for hypothesis in story['story']:
        for reference in story['highlights']:
            score = rouge.get_scores(hypothesis, reference)[0]['rouge-1']['f']
            if score > best_scores.get(hypothesis, -1.0):
                best_scores[hypothesis] = score

    ranked = sorted(best_scores.items(), key=lambda pair: pair[1], reverse=True)[:5]
    return [sentence for sentence, _ in ranked], [score for _, score in ranked]

In [11]:
def get_dataframe(stories):
    """Build one dataframe row per story sentence with a binary label.

    Columns are ``story_id``, ``sentence_id``, ``sentence`` and ``label``.
    ``label`` is 1 when the sentence is among those returned by
    ``get_top_five_highlights`` for its story, otherwise 0. A progress line
    is printed after every 5000 stories.
    """
    rows = []
    for story_id, story in enumerate(stories):
        top_five_sentences, _ = get_top_five_highlights(story)
        for sentence_id, sentence in enumerate(story['story']):
            rows.append({
                'story_id': story_id,
                'sentence_id': sentence_id,
                'sentence': sentence,
                'label': 1 if sentence in top_five_sentences else 0,
            })
        finished = story_id + 1
        if finished % 5000 == 0:
            print("completed:", finished)

    return pd.DataFrame(rows)

In [10]:
# Preprocess the stories. The result is only kept in memory here; no pickle
# file is written by this cell.
preprocessed_stories = preprocess_stories(stories)

  0%|          | 0/92579 [00:00<?, ?it/s]

In [13]:
# Convert to a dataframe (a progress line is printed every 5000 stories),
# then preview the first rows.
df = get_dataframe(preprocessed_stories)
df.head()

completed: 5000
completed: 10000
completed: 15000
completed: 20000
completed: 25000
completed: 30000
completed: 35000
completed: 40000
completed: 45000
completed: 50000
completed: 55000
completed: 60000
completed: 65000
completed: 70000
completed: 75000
completed: 80000
completed: 85000
completed: 90000


Unnamed: 0,story_id,sentence_id,sentence,label
0,0,0,at the start of a big week for the higgs boson...,0
1,0,1,the scientists outlined their final analysis b...,0
2,0,2,what is the higgs boson and why is it important,0
3,0,3,their announcement came two days before resear...,0
4,0,4,our data strongly point toward the existence o...,0


In [14]:
# Look at one story in the dataframe: all sentence rows with story_id 0.
df[df['story_id'] == 0]

Unnamed: 0,story_id,sentence_id,sentence,label
0,0,0,at the start of a big week for the higgs boson...,0
1,0,1,the scientists outlined their final analysis b...,0
2,0,2,what is the higgs boson and why is it important,0
3,0,3,their announcement came two days before resear...,0
4,0,4,our data strongly point toward the existence o...,0
5,0,5,read more the woman at the edge of physics,1
6,0,6,finding the higgs boson would help explain the...,1
7,0,7,the particle has been so difficult to pin down...,0
8,0,8,more science news from cnn light years,0
9,0,9,the results from the tevatron stemming from th...,1


In [None]:
# Now store the labelled dataframe.
# Create the output directory first so the result is not lost to a missing folder.
os.makedirs(pickle_path, exist_ok=True)
with open(os.path.join(pickle_path, 'stories_df.pkl'), 'wb') as file:
    pickle.dump(df, file)