### PIP Installs

In [None]:
!pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 torchaudio===0.8.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
!pip install transformers sentencepiece
!pip install beautifulsoup4 nltk scipy
!pip install -U scikit-learn
!pip install langdetect==1.0.9 googletrans==3.1.0a0

### Abstractive

In [1]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [3]:
# Tokenizer for the "google/pegasus-xsum" checkpoint; reused by every abstractive
# summarisation step further down in the notebook.
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

In [4]:
# Pegasus sequence-to-sequence model for the same "google/pegasus-xsum" checkpoint
# as the tokenizer; its generate() method produces the abstractive summaries.
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

In [5]:
text = """
Once upon a time, there lived a rabbit and tortoise. The rabbit could run fast. He was very proud of his speed. While the turtle was slow and consistent. 

One day that tortoise came to meet him. The tortoise was walking very slow as usual. The rabbit looked and laughed at him. 

The tortoise asked “what happened?”

The rabbit replied, “You walk so slowly! How can you survive like this?”. 

The turtle listened to everything and felt humiliated by the rabbit’s words. 

The tortoise replied, “Hey friend! You are very proud of your speed. Let’s have a race and see who is faster”. 
The rabbit was surprised by the challenge of the tortoise. But he accepted the challenge as he thought it would be a cakewalk for him.

So, the tortoise and rabbit started the race. The rabbit was as usual very fast and went far away. While the tortoise was left behind. 

After a while, the rabbit looked behind. 

He said to himself, “The slow turtle will take ages to come near me. I should rest a bit”. 

The rabbit was tired from running fast. The sun was high too. He ate some grass and decided to take a nap. 

He said to himself, “I am confident; I can win even if the tortoise passes me. I should rest a bit”. With that thought, he slept and lost the track of time.

Meanwhile, the slow and steady turtle kept on moving. Although he was tired, he didn’t rest. 

Sometime later, he passed the rabbit when the rabbit was still sleeping. 

The rabbit suddenly woke up after sleeping for a long time. He saw that the tortoise was about to cross the finishing line. 

He started running very fast with his full energy. But it was too late. 

The slow turtle had already touched the finishing line. He has already won the race. 

The rabbit was very disappointed with himself while the tortoise was very happy to win the race with his slow speed. He could not believe his eyes. He was shocked by the end results.

At last, the tortoise asked the rabbit “Now who is faster”. The rabbit had learned his lesson. He could not utter a word. The tortoise said bye to the rabbit and left that place calmly and happily.

“Failure of one time is not a failure of always, provided, one should take the lesson and correct the mistakes”

Slow and steady always wins the race. Never give up. Always keep going. Even if you are slow, your steadiness and consistency will let you win in any situation. Like the tortoise did. 

"""

# Encode the story as PyTorch tensors; truncation=True clips over-long input.
tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
summary = model.generate(**tokens)
# skip_special_tokens=True keeps marker tokens (padding / end-of-sequence) out of
# the decoded summary string.
tokenizer.decode(summary[0], skip_special_tokens=True)

'Slow and steady always wins the race. “Failure of one time is not a failure of always, provided, one should take the lesson and correct the mistakes” Slow and steady always wins the race.'

### Extractive

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse.linalg import svds

[nltk_data] Downloading package stopwords to /home/naman/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/naman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Reduce a piece of text to lower-cased, stop-word-free tokens.

    Every character that is not an ASCII letter or whitespace is removed, the
    text is lower-cased and stripped, tokenised with ``nltk.word_tokenize`` and
    filtered against the module-level ``stop_words`` list.

    Args:
        doc: The text (typically one sentence) to normalise.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so the previous call treated re.I|re.A (== 258) as
    # "replace at most 258 matches" and silently ignored the flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

def low_rank_svd(matrix, singular_count):
    """Run a truncated singular value decomposition via ``scipy``'s ``svds``.

    Args:
        matrix: The matrix to decompose.
        singular_count: Number of singular values/vectors to compute
            (forwarded to ``svds`` as ``k``).

    Returns:
        The ``(u, s, vt)`` triple exactly as produced by ``svds``.
    """
    factors = svds(matrix, k=singular_count)
    left_vectors, sigma, right_vectors_t = factors
    return left_vectors, sigma, right_vectors_t

In [8]:
def _summary_sentence_count(sentence_count):
    """Return how many sentences the summary keeps for a document of this length."""
    if sentence_count < 10:
        return 3
    if sentence_count < 100:
        return sentence_count // 3
    if sentence_count < 500:
        return sentence_count // 8
    if sentence_count < 1000:
        return sentence_count // 14
    return 72


def extractive_summariser(DOCUMENT):
    """Build an extractive summary by ranking sentences with TF-IDF + truncated SVD.

    The document is flattened to one line, split into sentences with
    ``nltk.sent_tokenize``, and each sentence is normalised with
    ``normalize_document``. A TF-IDF term/sentence matrix is decomposed with
    ``low_rank_svd`` (2 components); each sentence gets a salience score from
    the retained singular values, and the highest-scoring sentences are
    returned in their original order.

    The NLTK ``punkt`` data must already be available; the notebook's import
    cell downloads it once, so it is no longer re-downloaded on every call.

    Args:
        DOCUMENT: The raw text to summarise.

    Returns:
        The selected sentences joined by newlines. Documents with at most two
        sentences (including empty input) are returned sentence-per-line
        without ranking. When the text is too sparse to rank (no usable
        vocabulary, or a matrix too small for a 2-component SVD) the leading
        sentences are returned instead.
    """
    DOCUMENT = re.sub(r'\n|\r', ' ', DOCUMENT)
    DOCUMENT = re.sub(r' +', ' ', DOCUMENT)
    DOCUMENT = DOCUMENT.strip()

    sentences = nltk.sent_tokenize(DOCUMENT)
    sentence_count = len(sentences)

    # Decide this before any vectorising: nothing to rank for tiny documents,
    # and the vectoriser cannot handle an empty sentence list.
    if sentence_count <= 2:
        return "\n".join(sentences)

    num_sentences = _summary_sentence_count(sentence_count)

    norm_sentences = [normalize_document(sentence) for sentence in sentences]

    tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
    try:
        dt_matrix = tv.fit_transform(norm_sentences).toarray()
    except ValueError:
        # Raised when no usable term survives normalisation (empty vocabulary);
        # fall back to the leading sentences instead of aborting the caller.
        return "\n".join(sentences[:num_sentences])
    td_matrix = dt_matrix.T

    # A 2-component truncated SVD needs both matrix dimensions to exceed 2.
    if min(td_matrix.shape) <= 2:
        return "\n".join(sentences[:num_sentences])

    _, singular_values, topic_document_mat = low_rank_svd(td_matrix, 2)

    # remove singular values below threshold
    sv_threshold = 0.5
    min_sigma_value = max(singular_values) * sv_threshold
    singular_values[singular_values < min_sigma_value] = 0

    # Per-sentence salience: sigma-weighted length of each sentence's topic vector.
    salience_scores = np.sqrt(np.dot(np.square(singular_values),
                                     np.square(topic_document_mat)))

    # Take the best-scoring sentences, then restore document order.
    top_sentence_indices = (-salience_scores).argsort()[:num_sentences]
    top_sentence_indices.sort()

    return "\n".join(np.array(sentences)[top_sentence_indices])

In [9]:
DOCUMENT = """
Once upon a time, there lived a rabbit and tortoise. The rabbit could run fast. He was very proud of his speed. While the turtle was slow and consistent. 

One day that tortoise came to meet him. The tortoise was walking very slow as usual. The rabbit looked and laughed at him. 

The tortoise asked “what happened?”

The rabbit replied, “You walk so slowly! How can you survive like this?”. 

The turtle listened to everything and felt humiliated by the rabbit’s words. 

The tortoise replied, “Hey friend! You are very proud of your speed. Let’s have a race and see who is faster”. 
The rabbit was surprised by the challenge of the tortoise. But he accepted the challenge as he thought it would be a cakewalk for him.

So, the tortoise and rabbit started the race. The rabbit was as usual very fast and went far away. While the tortoise was left behind. 

After a while, the rabbit looked behind. 

He said to himself, “The slow turtle will take ages to come near me. I should rest a bit”. 

The rabbit was tired from running fast. The sun was high too. He ate some grass and decided to take a nap. 

He said to himself, “I am confident; I can win even if the tortoise passes me. I should rest a bit”. With that thought, he slept and lost the track of time.

Meanwhile, the slow and steady turtle kept on moving. Although he was tired, he didn’t rest. 

Sometime later, he passed the rabbit when the rabbit was still sleeping. 

The rabbit suddenly woke up after sleeping for a long time. He saw that the tortoise was about to cross the finishing line. 

He started running very fast with his full energy. But it was too late. 

The slow turtle had already touched the finishing line. He has already won the race. 

The rabbit was very disappointed with himself while the tortoise was very happy to win the race with his slow speed. He could not believe his eyes. He was shocked by the end results.

At last, the tortoise asked the rabbit “Now who is faster”. The rabbit had learned his lesson. He could not utter a word. The tortoise said bye to the rabbit and left that place calmly and happily.

“Failure of one time is not a failure of always, provided, one should take the lesson and correct the mistakes”

Slow and steady always wins the race. Never give up. Always keep going. Even if you are slow, your steadiness and consistency will let you win in any situation. Like the tortoise did. 

"""

# Run the extractive summariser on the sample story and show the chosen sentences.
# Note: this rebinds `summary`, which the abstractive cell used for the generated
# token ids.
summary = extractive_summariser(DOCUMENT)
print(summary)

Once upon a time, there lived a rabbit and tortoise.
The rabbit could run fast.
He was very proud of his speed.
While the turtle was slow and consistent.
The tortoise was walking very slow as usual.
The rabbit looked and laughed at him.
You are very proud of your speed.
The rabbit was surprised by the challenge of the tortoise.
So, the tortoise and rabbit started the race.
After a while, the rabbit looked behind.
I should rest a bit”.
The rabbit was tired from running fast.
I should rest a bit”.
The slow turtle had already touched the finishing line.
The rabbit was very disappointed with himself while the tortoise was very happy to win the race with his slow speed.
At last, the tortoise asked the rabbit “Now who is faster”.


[nltk_data] Downloading package punkt to /home/naman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Translate

In [10]:
from langdetect import detect
from googletrans import Translator

### Summary

In [11]:
import os
from bs4 import BeautifulSoup

In [15]:
# Collect the site folders to summarise. Only directories are kept, and the
# "summary_<site>" folders written by the summarisation cell are skipped so a
# re-run does not try to summarise its own output.
# NOTE: the location is hard-coded to one machine; adjust it before running elsewhere.
path = "/home/naman/Documents/projects/mtase/dark_web/testing"
dir_list = [
    entry for entry in os.listdir(path)
    if os.path.isdir(os.path.join(path, entry)) and not entry.startswith("summary_")
]
print(dir_list)

['l5satz5qzvht225nogtlv42gsq7hruzgi5rw2whal6xt5iq2t5gncjyd.onion', 'wubugnkrzwfyq5oj.onion', 'fr-be.wordpress.org']


In [None]:
# Map every site folder to the names of the .html files directly inside it.
html_files = {}
# Kept in its own name so the global `path` is not overwritten while looping.
testing_dir = "/home/naman/Documents/projects/mtase/dark_web/testing/"

for site in dir_list:
    site_dir = os.path.join(testing_dir, site)
    # Plain files would make os.listdir fail, and "summary_*" folders are output
    # of the summarisation cell, not input.
    if not os.path.isdir(site_dir) or site.startswith("summary_"):
        continue
    html_files[site] = [name for name in os.listdir(site_dir) if name.endswith(".html")]

print(html_files)

In [None]:
# For every collected HTML page: extract the visible text, translate it to English
# when needed, build an abstractive and an extractive summary, and write both to
# "<path>/summary_<folder>/<page name>.txt".
folders = html_files.keys()
path = "/home/naman/Documents/projects/mtase/dark_web/testing/"

# One translator client is enough for the whole run.
translator = Translator()

for folder in folders:
    files = html_files[folder]
    new_path = path + folder + "/"
    save_path = path + f"summary_{folder}" + "/"
    # exist_ok=True lets the cell be re-run without failing on existing folders.
    os.makedirs(save_path, exist_ok=True)
    print(f'\nIn folder {folder}')
    for file in files:
        # get contents: read bytes so BeautifulSoup detects the page encoding itself
        # instead of relying on the locale default; `with` closes the handle.
        with open(new_path + file, "rb") as page:
            soup = BeautifulSoup(page, "html.parser")
        # drop non-visible content before extracting text
        for script in soup(["script", "style"]):
            script.extract()

        # parse content: one non-empty, stripped chunk per line
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        # Language detection cannot work on empty text, so skip blank pages.
        if not text:
            print(f'Skipping {new_path+file}: no visible text')
            continue

        # translate to english
        result_lang = detect(text)
        if result_lang != "en":
            translation = translator.translate(text, src=result_lang, dest='en')
            text = translation.text

        # extractive summary
        extractive = extractive_summariser(text)

        # abstractive summary (special marker tokens are left out of the text)
        tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
        summary = model.generate(**tokens)
        abstractive = tokenizer.decode(summary[0], skip_special_tokens=True)
        fin_text = f'Abstractive: {abstractive}\n\n\nExtractive: {extractive}'

        # write to file: strip the ".html" extension properly. The previous
        # file.split()[0] split on whitespace, so it kept the extension and
        # truncated names containing spaces.
        out_name = os.path.splitext(file)[0] + ".txt"
        with open(save_path + out_name, "w", encoding="utf-8") as f:
            f.write(fin_text)

        print(f'Summary for {new_path+file} done')