# Data Exploration

In [1]:
import os, os.path
import subprocess
from stanfordcorenlp import StanfordCoreNLP

# Location of the unpacked Stanford CoreNLP distribution. Set the CORENLP_HOME
# environment variable to point at a different install; the original path is
# kept as the fallback so existing setups behave exactly as before.
CORENLP_HOME = os.environ.get('CORENLP_HOME', r'/home/ubuntu/stanford-corenlp-full-2018-02-27')

nlp = StanfordCoreNLP(CORENLP_HOME)

## Test Stanford NLP

Ref: https://github.com/Lynten/stanford-corenlp

In [2]:
# Smoke-test the CoreNLP wrapper on a single sentence: tokenization and
# part-of-speech tagging only.
try:
    sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
    print('Tokenize:', nlp.word_tokenize(sentence))
    print('Part of Speech:', nlp.pos_tag(sentence))
    # The remaining annotators are left disabled for this check.
    #print('Named Entities:', nlp.ner(sentence))
    #print('Constituency Parsing:', nlp.parse(sentence))
    #print('Dependency Parsing:', nlp.dependency_parse(sentence))
except Exception as e:
    # Failures are printed rather than re-raised, so this cell always completes.
    print(e)
finally:    
    # Close the wrapper whether or not the calls above succeeded.
    # NOTE(review): `nlp` is closed here, so re-running this cell presumably
    # requires re-creating it first — confirm against the wrapper's behaviour.
    nlp.close()

Tokenize: ['Guangdong', 'University', 'of', 'Foreign', 'Studies', 'is', 'located', 'in', 'Guangzhou', '.']
Part of Speech: [('Guangdong', 'NNP'), ('University', 'NNP'), ('of', 'IN'), ('Foreign', 'NNP'), ('Studies', 'NNPS'), ('is', 'VBZ'), ('located', 'JJ'), ('in', 'IN'), ('Guangzhou', 'NNP'), ('.', '.')]


## Function definitions

In [3]:
def get_file_count(DIR):
    """Return how many entries ``os.listdir`` reports for the directory ``DIR``.

    Every directory entry is counted, not only regular files.
    """
    entries = os.listdir(DIR)
    return len(entries)


def read_story(text_file, encoding="utf-8"):
    """Read a story file and return its lines with surrounding whitespace removed.

    Args:
        text_file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII characters (e.g. typographic quotes) decode the
            same way regardless of the platform's locale default.

    Returns:
        A list with one stripped string per line of the file; blank lines are
        kept as empty strings.
    """
    with open(text_file, "r", encoding=encoding) as f:
        return [line.strip() for line in f]


def split(lines):
    """Partition story lines into article lines and highlight lines.

    Empty strings are dropped. Lines starting with "@highlight" are markers
    and are never emitted themselves. Every non-empty, non-marker line that
    comes after the first marker is a highlight; everything before it is part
    of the article.

    Returns:
        A tuple ``(article_lines, highlights)`` of two lists of strings.
    """
    body = []
    summary_points = []
    marker_seen = False
    for text in lines:
        if text == "":
            continue
        if text.startswith("@highlight"):
            marker_seen = True
            continue
        # Once a marker has been seen, all remaining content is highlight text.
        bucket = summary_points if marker_seen else body
        bucket.append(text)
    return body, summary_points


# Typographic closing quotes (U+2019 and U+201D).
dm_single_close_quote = u'\u2019'
dm_double_close_quote = u'\u201d'

# Endings after which fix_missing_period() leaves a line unchanged.
END_TOKENS = [
    '.', '!', '?', '...',
    "'", "`", '"',
    dm_single_close_quote, dm_double_close_quote,
    ")",
]

# Markers wrapped around each highlight sentence when the abstract is built.
SENTENCE_START, SENTENCE_END = '<s>', '</s>'


def fix_missing_period(line):
    """Append " ." to a line that does not already end in an accepted token.

    The line is returned unchanged when it contains "@highlight", when it is
    empty, or when its last character is one of END_TOKENS.
    """
    needs_period = (
        "@highlight" not in line
        and line != ""
        and line[-1] not in END_TOKENS
    )
    return line + " ." if needs_period else line

def create_paragraphs(story):
    """Turn the lines of one story into an article string and an abstract string.

    Each line is lower-cased and passed through fix_missing_period(), then the
    lines are divided with split(). Article lines are joined with single
    spaces; each highlight is wrapped in SENTENCE_START / SENTENCE_END before
    the highlights are joined with single spaces.

    Returns:
        A tuple ``(article, abstract)`` of two strings.
    """
    prepared = [fix_missing_period(raw.lower()) for raw in story]
    body_lines, highlight_lines = split(prepared)

    tagged = []
    for sentence in highlight_lines:
        tagged.append("{} {} {}".format(SENTENCE_START, sentence, SENTENCE_END))

    return ' '.join(body_lines), ' '.join(tagged)

## Data summary

Ref: https://cs.nyu.edu/%7Ekcho/DMQA/

In [4]:
# Folder containing the CNN story files; print how many entries it holds.
cnn_dir = './data/cnn/stories/'
print(get_file_count(cnn_dir))

92579


In [5]:
# Folder containing the Daily Mail story files; print how many entries it holds.
dm_dir = './data/dailymail/stories'
print(get_file_count(dm_dir))

219506


## Explore CNN Data

In [6]:
# Path of the single CNN story file that is loaded as the example below.
cnn_story_1 = './data/cnn/stories/166fe0c3d3f400a48ec5fd716c675384512f152c.story'

In [7]:
# Load the example story as a list of whitespace-stripped lines and display it.
out = read_story(cnn_story_1)
out

['HAVANA, Cuba (CNN) -- Cuban President Raul Castro is taking over leadership of a country whose government believes its citizens are not working hard enough.',
 '',
 "Raul Castro was chosen Sunday to take over Cuba's presidency from his brother, Fidel Castro.",
 '',
 'The state-run newspaper recently ran an article headlined "Work: Option or necessity?"',
 '',
 "The writer pointed out that, judging by the number of people in the streets during the day, many Cubans don't seem to be on the job.",
 '',
 "They have few motivations to buckle down: Salaries average about $15 per month on the island, and Cubans get monthly food rations even if they don't work.  Watch a report on the realities in Cuba »",
 '',
 '"There is a strong desire to protect and to gradually increase the incomes and savings of the population, particularly of those least favored," said Raul Castro, 76.',
 '',
 'The black market is so widespread that Cubans have coined a special term for breaking the law to make ends mee

## Separate summary and article

In [8]:
# Divide the story into article lines and the highlight lines that follow the "@highlight" markers.
article_lines,highlights = split(out)

In [9]:
article_lines

['HAVANA, Cuba (CNN) -- Cuban President Raul Castro is taking over leadership of a country whose government believes its citizens are not working hard enough.',
 "Raul Castro was chosen Sunday to take over Cuba's presidency from his brother, Fidel Castro.",
 'The state-run newspaper recently ran an article headlined "Work: Option or necessity?"',
 "The writer pointed out that, judging by the number of people in the streets during the day, many Cubans don't seem to be on the job.",
 "They have few motivations to buckle down: Salaries average about $15 per month on the island, and Cubans get monthly food rations even if they don't work.  Watch a report on the realities in Cuba »",
 '"There is a strong desire to protect and to gradually increase the incomes and savings of the population, particularly of those least favored," said Raul Castro, 76.',
 'The black market is so widespread that Cubans have coined a special term for breaking the law to make ends meet: "resolver" -- literally, "t

In [10]:
highlights

['Cuban President Raul Castro says the country must become more productive',
 'Castro has promised to improve efficiency by cutting some red tape',
 'Expectations rise as a new president leads Cuba for the first time in 49 years',
 'Rare public displays of discontent show frustrations faced by Cubans']

## Join Lines to form paragraphs

In [11]:
# Lower-case the story, add missing end punctuation, and join it into one
# article string and one summary string whose sentences are wrapped in <s> ... </s>.
article,summary = create_paragraphs(out)

In [12]:
article

'havana, cuba (cnn) -- cuban president raul castro is taking over leadership of a country whose government believes its citizens are not working hard enough. raul castro was chosen sunday to take over cuba\'s presidency from his brother, fidel castro. the state-run newspaper recently ran an article headlined "work: option or necessity?" the writer pointed out that, judging by the number of people in the streets during the day, many cubans don\'t seem to be on the job. they have few motivations to buckle down: salaries average about $15 per month on the island, and cubans get monthly food rations even if they don\'t work.  watch a report on the realities in cuba » . "there is a strong desire to protect and to gradually increase the incomes and savings of the population, particularly of those least favored," said raul castro, 76. the black market is so widespread that cubans have coined a special term for breaking the law to make ends meet: "resolver" -- literally, "to resolve."  see cub

In [13]:
summary

'<s> cuban president raul castro says the country must become more productive . </s> <s> castro has promised to improve efficiency by cutting some red tape . </s> <s> expectations rise as a new president leads cuba for the first time in 49 years . </s> <s> rare public displays of discontent show frustrations faced by cubans . </s>'

## Dailymail Data

In [14]:
# Run the same read -> create_paragraphs steps on one Daily Mail story.
# Note: this rebinds `article` and `summary`, replacing the CNN values
# computed earlier in the notebook.
dm_story_1 = './data/dailymail/stories/ffffd563a96104f5cf4493cfa701a65f31b06abf.story'
story = read_story(dm_story_1)
article,summary = create_paragraphs(story)

In [15]:
article

"under fire australian defence minister david johnston has stopped short of apologising or withdrawing his controversial 'wouldn't trust them to build a canoe' comments against australian submarine corporation (asc), saying on wednesday that he regrets his 'rhetorical flourish'. amid calls for his sacking, mr johnston told the senate that he never intended to cause offence. 'regrettably, in rhetorical flourish, i did express my frustrations in the past performance of asc,' he said. scroll down for video . 'regrettably, in rhetorical flourish, i did express my frustrations in the past performance of asc,' defence minister david johnston said of his 'canoe' comments against an australian submarine company . under fire. senator johnston has not withdrawn his comments but regretted his rhetorical flourish in which he claimed he would not trust asc to even 'build a canoe' there are fears the naval build program, worth more than $40 billion, could be lost to south australia, with claims that

In [16]:
summary

"<s> defence minister david johnston said his 'i wouldn't trust them to build a canoe' outburst against australian submarine corporation (asc) was a 'rhetorical flourish' </s> <s> prime minister abbott effectively cut loose his minister, defending the asc and claiming it played a vital role in supporting the navy . </s> <s> opposition leader bill shorten has called for mr johnston's sacking . </s> <s> there are fears the $40 billion program will be taken away from south australia and submarines bought from overseas . </s>"