In [28]:
# Natural Language Tool Kit (NLTK)
import re
import os
import nltk
import pandas as pd
import numpy as np
from tqdm import tqdm
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
from google.colab import files
uploaded = files.upload()

Saving 0a000f521734500aa360bf6bb2bb31446e4d66cd.story to 0a000f521734500aa360bf6bb2bb31446e4d66cd.story
Saving 0a0a4c90d59df9e36ffec4ba306b4f20f3ba4acb.story to 0a0a4c90d59df9e36ffec4ba306b4f20f3ba4acb.story
Saving 0a0aa464d262b903f44b0f8eaa67f13dd1946cfd.story to 0a0aa464d262b903f44b0f8eaa67f13dd1946cfd.story
Saving 0a0adc84ccbf9414613e145a3795dccc4828ddd4.story to 0a0adc84ccbf9414613e145a3795dccc4828ddd4.story
Saving 0a0b44620d0dec6fdfc1aa139ff51bcb36c56c51.story to 0a0b44620d0dec6fdfc1aa139ff51bcb36c56c51.story
Saving 0a0b59738a88e97ef66322a8c866d22ebf079289.story to 0a0b59738a88e97ef66322a8c866d22ebf079289.story
Saving 0a0c2f4e07ba05f3226e8afec1350ac82161cd2e.story to 0a0c2f4e07ba05f3226e8afec1350ac82161cd2e.story
Saving 0a0d11c125e4e4b061061f44b08f60cb5b8bf177.story to 0a0d11c125e4e4b061061f44b08f60cb5b8bf177.story
Saving 0a0f56ebc5a0a67ed18de79d99b40a42d8058d04.story to 0a0f56ebc5a0a67ed18de79d99b40a42d8058d04.story
Saving 0a1ad82d161d90d758240407cb8c8fcebff4a212.story to 0a1ad82

In [29]:
Top_N = 4
stop_words = nltk.corpus.stopwords.words('english')

def get_word_count_dict(input_doc_text):
    # create an empty dictionary to house the word count
    word_count_dict = {}
    for this_word in nltk.word_tokenize(input_doc_text):
        if this_word in stop_words:
            continue
        if this_word not in word_count_dict.keys():
            word_count_dict[this_word] = 1
        else:
            word_count_dict[this_word] += 1
    return word_count_dict

def clean_sentence(input_text):
    # output_text = input_text.copy()
    output_text = input_text
    output_text = re.sub(r'\[[0-9]*\]',' ',output_text) # Replace Numbers
    output_text = re.sub(r'\s+',' ',output_text) # avoiding double+ spaces
    output_text = output_text.lower()
    # replace characters other than [a-zA-Z0-9],
    # digits & one or more spaces with single space
    regex_patterns = [r'\W',r'\d',r'\s+']
    for regex in regex_patterns:
        output_text = re.sub(regex,' ',output_text)
    return output_text

def get_sentence_score(input_sentence, word_count_dict):
    word_list = nltk.word_tokenize(input_sentence.lower())
    score = 0
    if len(word_list) > 30:
        return score
    for this_word in word_list:
        score += word_count_dict.get(this_word, 0)
    return score


def get_extractive_summary(input_doc, n=Top_N):
    cleaned_doc = clean_sentence(input_doc)
    word_dict = get_word_count_dict(cleaned_doc)
    input_sentences = nltk.sent_tokenize(input_doc)
    sentence_df = pd.DataFrame(input_sentences, columns=['Sentence'])
    sentence_df['CleanedSentence'] = sentence_df['Sentence'].apply(clean_sentence)
    sentence_df['Score'] = sentence_df['CleanedSentence'].apply(lambda x: get_sentence_score(x, word_dict))
    sentence_df.sort_values(by='Score', ascending = False, inplace=True)
    summary = '\n'.join(sentence_df['Sentence'].iloc[0:n].tolist())
    return summary

def read_text_file(text_file_path):
    print('Reading file in ', text_file_path)
    with open(text_file_path, 'r') as f:
        text_data = f.read()
    return text_data


def save_text_file(text_data, output_file_path):
    print('Saving file in ', output_file_path)
    with open(output_file_path, 'w') as f:
        f.write(text_data)


In [32]:
folder_path = './data/test'
output_path = './data/test_summary'
TOP_N = 4

all_files_data = []
for this_file in tqdm(os.listdir(folder_path)):
    this_file_data = read_text_file(os.path.join(folder_path, this_file))
    this_summary = get_extractive_summary(this_file_data, Top_N)
    save_text_file(this_summary, os.path.join(output_path, this_file))

100%|██████████| 10/10 [00:00<00:00, 57.94it/s]

Reading file in  ./data/test/0a0a4c90d59df9e36ffec4ba306b4f20f3ba4acb.story
Saving file in  ./data/test_summary/0a0a4c90d59df9e36ffec4ba306b4f20f3ba4acb.story
Reading file in  ./data/test/0a0f56ebc5a0a67ed18de79d99b40a42d8058d04.story
Saving file in  ./data/test_summary/0a0f56ebc5a0a67ed18de79d99b40a42d8058d04.story
Reading file in  ./data/test/0a1ad82d161d90d758240407cb8c8fcebff4a212.story
Saving file in  ./data/test_summary/0a1ad82d161d90d758240407cb8c8fcebff4a212.story
Reading file in  ./data/test/0a000f521734500aa360bf6bb2bb31446e4d66cd.story
Saving file in  ./data/test_summary/0a000f521734500aa360bf6bb2bb31446e4d66cd.story
Reading file in  ./data/test/0a0d11c125e4e4b061061f44b08f60cb5b8bf177.story
Saving file in  ./data/test_summary/0a0d11c125e4e4b061061f44b08f60cb5b8bf177.story
Reading file in  ./data/test/0a0b44620d0dec6fdfc1aa139ff51bcb36c56c51.story
Saving file in  ./data/test_summary/0a0b44620d0dec6fdfc1aa139ff51bcb36c56c51.story
Reading file in  ./data/test/0a0adc84ccbf94146




In [33]:
print(this_summary)

On Pi Day, one number 'reeks of mystery'

Math may be scary, but pi is not -- as evidenced by the widespread revelry on Pi Day.
The parade ends at the "pi shrine" -- a pi symbol with digits spiraling around it embedded in the sidewalk, which was unveiled last year.
On Pi Day, is 'pi' under attack?
But if you happen to live in a particularly pi-happy place, you might be able to take part in some larger-scale, pi-inspired activities.
