In [9]:
import os
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import nltk
import re
import difflib

from IPython.display import display, HTML
# !pip install cdifflib
# Fetch the NLTK data packages this notebook needs: 'stopwords' backs the
# stopwords.words('english') call in generate_summary, and 'punkt' is
# presumably the sentence-tokenizer data that sent_tokenize in read_file
# relies on (confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/peshmerge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/peshmerge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Define required methods

The helper functions below are taken from `text_simplification.ipynb` and `simplified_summary_pipeline.ipynb`.

## For simplification

In [10]:
# preprocess test_data/raw test_data/tokenized test_data/binarized
def preprocess_data(raw_data_dir, tokenized_data_dir, binarized_data_dir):
    """Tokenize and binarize the test split through shell commands.

    Runs `simplification/preprocess/anonymize_wordpiece.py` on `test.src`
    and `test.dst` from `raw_data_dir`, writing `test.tok.src` and
    `test.tok.dst` into `tokenized_data_dir`, and then runs
    `simplification/preprocess.py` on the `test.tok` prefix with
    `binarized_data_dir` as `--destdir`.

    Only the test split is processed; the equivalent train/valid commands
    are kept below as commented-out lines.

    Args:
        raw_data_dir: directory containing `test.src` and `test.dst`.
        tokenized_data_dir: directory that receives the `test.tok.*` files.
        binarized_data_dir: destination directory of the binarized dataset.

    Returns:
        None. All effects are files written by the invoked scripts.
    """
    # Each `!` line is an IPython shell escape; `$name` is replaced with the
    # value of the Python variable `name` before the command is executed.
    !python simplification/preprocess/anonymize_wordpiece.py --input $raw_data_dir/test.src --vocab simplification/preprocess/vocab.txt --output  $tokenized_data_dir/test.tok.src
    !python simplification/preprocess/anonymize_wordpiece.py --input $raw_data_dir/test.dst --vocab simplification/preprocess/vocab.txt --output  $tokenized_data_dir/test.tok.dst 

    # Disabled: tokenization of the validation split.
    # !python simplification/preprocess/anonymize_wordpiece.py --input $raw_data_dir/valid.src --vocab simplification/preprocess/vocab.txt --output  $tokenized_data_dir/valid.tok.src 
    # !python simplification/preprocess/anonymize_wordpiece.py --input $raw_data_dir/valid.dst --vocab simplification/preprocess/vocab.txt --output  $tokenized_data_dir/valid.tok.dst 


    # Disabled: tokenization of the training split.
    # !python simplification/preprocess/anonymize_wordpiece.py --input $raw_data_dir/train.src --vocab simplification/preprocess/vocab.txt --output  $tokenized_data_dir/train.tok.src 
    # !python simplification/preprocess/anonymize_wordpiece.py --input $raw_data_dir/train.dst --vocab simplification/preprocess/vocab.txt --output  $tokenized_data_dir/train.tok.dst 


    # Creates binarized fairseq dataset
    # Disabled variant that also binarizes the train and valid splits:
    #  !python simplification/preprocess.py --workers 5 --source-lang src --target-lang dst --trainpref $tokenized_data_dir/train.tok --validpref $tokenized_data_dir/valid.tok --testpref $tokenized_data_dir/test.tok --destdir  $binarized_data_dir --padding-factor 1 --joined-dictionary --srcdict simplification/preprocess/vocab_count.txt
    # Active variant: test split only.
    !python simplification/preprocess.py --workers 5 --source-lang src --target-lang dst --testpref $tokenized_data_dir/test.tok --destdir  $binarized_data_dir --padding-factor 1 --joined-dictionary --srcdict simplification/preprocess/vocab_count.txt


In [11]:
def generate_simplified_text(binarized_data_dir, output_file, checkpoint_file, gpu_id="", split="test"):
    !export CUDA_VISIBLE_DEVICES=$gpu_id
    !python simplification/generate.py $binarized_data_dir --path $checkpoint_file --batch-size 32  --beam 1 --nbest 1 --user-dir simplification/my_model/ --print-alignment --gen-subset $split > $output_file'.aner'

    !python simplification/postprocess/bpe.py  --out_anon $output_file'.aner' --denon $output_file --ignore_lines 5 --wp 1

    !rm $output_file'.aner'

## For summarization

In [12]:
def read_file(file_name):
    """Read a text file and split its content into sentences.

    Args:
        file_name: path of the text file to read.

    Returns:
        The list of sentence strings produced by
        `nltk.tokenize.sent_tokenize` for the whole file content.
    """
    # The context manager closes the handle even when reading fails; the
    # previous version left the file open.
    with open(file_name, "r") as file:
        text = file.read()
    return nltk.tokenize.sent_tokenize(text)

In [13]:

def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of two sentences' word-count vectors.

    Args:
        sent1: first sentence, either a string or a sequence of word tokens.
        sent2: second sentence, in the same form as `sent1`.
        stopwords: optional collection of lowercase words to ignore.

    Returns:
        A float between 0 and 1. The result is 0.0 when either sentence has
        no countable word left (empty input or only stopwords).
    """
    if stopwords is None:
        stopwords = []

    def count_words(sentence):
        # Iterating a plain string yields characters, which made the measure
        # compare letters instead of words; split strings into words first.
        if isinstance(sentence, str):
            tokens = re.findall(r"\w+", sentence)
        else:
            tokens = sentence
        counts = {}
        for token in tokens:
            word = token.lower()
            if word in stopwords:
                continue
            counts[word] = counts.get(word, 0) + 1
        return counts

    counts1 = count_words(sent1)
    counts2 = count_words(sent2)

    norm1 = sum(value * value for value in counts1.values()) ** 0.5
    norm2 = sum(value * value for value in counts2.values()) ** 0.5
    # A zero vector has no direction; dividing by its norm would yield NaN,
    # so treat such a sentence as dissimilar to everything.
    if not norm1 or not norm2:
        return 0.0

    dot_product = sum(value * counts2.get(word, 0) for word, value in counts1.items())
    return dot_product / (norm1 * norm2)
def build_similarity_matrix(sentences, stop_words):
    """Compute the pairwise similarity matrix of a list of sentences.

    Args:
        sentences: the sentences to compare with each other.
        stop_words: words handed to `sentence_similarity` to be ignored.

    Returns:
        A square numpy array whose entry (i, j) is the similarity of
        sentence i and sentence j; the diagonal stays at zero.
    """
    total = len(sentences)
    matrix = np.zeros((total, total))
    for row, first in enumerate(sentences):
        for column, second in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != column:
                matrix[row, column] = sentence_similarity(first, second, stop_words)
    return matrix

In [14]:
def generate_summary(file_name):
    """Build an extractive summary of a text file.

    The sentences of the file are scored by running PageRank on their
    similarity graph. The best `(n // 2) + 1` of the `n` sentences are kept
    and returned in the order in which they appear in the file.

    Args:
        file_name: path of the text file to summarize.

    Returns:
        The selected sentences joined with newline characters.
    """
    stop_words = stopwords.words('english')

    # Step 1 - Read the text and split it into sentences
    sentences = read_file(file_name)

    # The number of sentences in the summarization
    ranked_sentences_total = (len(sentences) // 2) + 1

    # Step 2 - Generate the similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in the similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort sentence positions by score (ties broken by sentence
    # text) and pick the top ones. Ranking positions rather than building a
    # dict keyed by score keeps sentences with equal scores, and keeps
    # repeated sentences, which the dict-based version silently dropped.
    ranked_positions = sorted(
        range(len(sentences)),
        key=lambda position: (scores[position], sentences[position]),
        reverse=True,
    )

    # Step 5 - Restore the original sentence order
    selected_positions = sorted(ranked_positions[:ranked_sentences_total])

    return "\n".join(sentences[position] for position in selected_positions)

# Pipeline 1 - Simplification -> summarization

In [15]:
%%time
data_dir = 'data/wiki-auto'
checkpoint_file = 'checkpoints/checkpoint_wiki_auto.pt'
# count = 2

# base data/wiki-auto
for folder in os.listdir(data_dir):
	# raw data = data/wiki-auto/{id}
	base_data_dir = os.path.join(data_dir, folder)

	# pipeline outputs = data/wiki-auto/{id}/simplify_summary
	output_dir = os.path.join(base_data_dir, 'simplify_summary')
	os.makedirs(output_dir, exist_ok=True)
	
	# raw data = data/wiki-auto/{id}/simplify_summary/raw
	# tokenized data = data/wiki-auto/{id}/simplify_summary/tokenized
	# binarized data = data/wiki-auto/{id}/simplify_summary/binarized
	raw_data_dir = os.path.join(output_dir, 'raw')
	tokenized_data_dir = os.path.join(output_dir, 'tokenized')
	binarized_data_dir = os.path.join(output_dir, 'binarized')

	!rm -rf $raw_data_dir
	!rm -rf $tokenized_data_dir
	!rm -rf $binarized_data_dir

	os.makedirs(raw_data_dir, exist_ok=True)
	os.makedirs(tokenized_data_dir, exist_ok=True)
	os.makedirs(binarized_data_dir, exist_ok=True)

	# copy source.txt to simplify_summary/raw
	!cp $base_data_dir/source.txt $raw_data_dir/train.src
	!cp $base_data_dir/source.txt $raw_data_dir/train.dst
	!cp $base_data_dir/source.txt $raw_data_dir/test.src
	!cp $base_data_dir/source.txt $raw_data_dir/test.dst
	!cp $base_data_dir/source.txt $raw_data_dir/valid.src
	!cp $base_data_dir/source.txt $raw_data_dir/valid.dst
	
	# outputs in data/wiki-auto/{id}/simplify_summary/*.txt
	simplification_output_file = os.path.join(output_dir, 'simplified.txt')
	simplified_summary_file = os.path.join(output_dir, 'simplified_summary.txt')

	preprocess_data(raw_data_dir, tokenized_data_dir, binarized_data_dir)
	generate_simplified_text(binarized_data_dir, simplification_output_file, checkpoint_file, 0, "test")

	simplified_summary = generate_summary(simplification_output_file)
	with open(simplified_summary_file, 'w') as f:
		f.write(simplified_summary)
	
	# count -= 1
	# if count == 0:
	# 	break


Namespace(alignfile=None, cpu=False, criterion='cross_entropy', dataset_impl='cached', destdir='data/wiki-auto/1004/simplify_summary/binarized', fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=True, log_format=None, log_interval=1000, lr_scheduler='fixed', memory_efficient_fp16=False, min_loss_scale=0.0001, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer='nag', padding_factor=1, seed=1, source_lang='src', srcdict='simplification/preprocess/vocab_count.txt', target_lang='dst', task='translation', tbmf_wrapper=False, tensorboard_logdir='', testpref='data/wiki-auto/1004/simplify_summary/tokenized/test.tok', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, trainpref=None, user_dir=None, validpref=None, workers=5)
| [src] Dictionary: 30525 types
| [src] data/wiki-auto/1004/simplify_summary/tokenized/test.tok.src: 33 sents, 1288 tokens, 0.0% replaced by <unk>
| [dst] Dictionary: 30525

# Pipeline 2 - Summarization -> simplification

In [6]:
data_dir = 'data/wiki-auto'
checkpoint_file = 'checkpoints/checkpoint_wiki_auto.pt'
# count = 2

for folder in os.listdir(data_dir):
	# raw data = data/wiki-auto/{id}
	base_data_dir = os.path.join(data_dir, folder)

	# pipeline outputs = data/wiki-auto/{id}/summary_simplify
	output_dir = os.path.join(base_data_dir, 'summary_simplify')
	os.makedirs(output_dir, exist_ok=True)
	
	# raw data = data/wiki-auto/{id}/summary_simplify/raw
	# tokenized data = data/wiki-auto/{id}/summary_simplify/tokenized
	# binarized data = data/wiki-auto/{id}/summary_simplify/binarized
	raw_data_dir = os.path.join(output_dir, 'raw')	
	tokenized_data_dir = os.path.join(output_dir, 'tokenized')
	binarized_data_dir = os.path.join(output_dir, 'binarized')

	!rm -rf $raw_data_dir
	!rm -rf $tokenized_data_dir
	!rm -rf $binarized_data_dir
	
	os.makedirs(raw_data_dir, exist_ok=True)
	os.makedirs(tokenized_data_dir, exist_ok=True)
	os.makedirs(binarized_data_dir, exist_ok=True)
	
	# outputs in data/wiki-auto/{id}/summary_simplify/*.txt
	summary_output_file = os.path.join(output_dir, 'summary.txt')
	simplified_summary_file = os.path.join(output_dir, 'simplified_summary.txt')
	
	summary = generate_summary(os.path.join(base_data_dir, 'source.txt'))
	with open(summary_output_file, 'w') as src_f:
		src_f.write(summary)
	
	# copy summary.txt as raw/test.src and raw/test.dst
	!cp $summary_output_file $raw_data_dir/test.src
	!cp $summary_output_file $raw_data_dir/test.dst
	!cp $summary_output_file $raw_data_dir/train.src
	!cp $summary_output_file $raw_data_dir/train.dst
	!cp $summary_output_file $raw_data_dir/valid.src
	!cp $summary_output_file $raw_data_dir/valid.dst

	preprocess_data(raw_data_dir, tokenized_data_dir, binarized_data_dir)
	generate_simplified_text(binarized_data_dir, simplified_summary_file, checkpoint_file, 0, "test")
	
	# count -= 1
	# if count == 0:
	# 	break

NameError: name 'read_file' is not defined