In [1]:
import os
import sys
import re
import pandas as pd
from transformers import pipeline

from ngrams import *
from ldatops import *
from parse import *
from run_bert import *
from run_spacy import *

# Enter path to desired text file below

In [2]:
# Path of the text file to analyse; it is handed to preprocess() in the following cell.
pth = "test2.txt"

# Performing pre-processing of the book and displaying the first 50 characters of the text

In [3]:
# preprocess() yields two values: `book`, the text as one string (sliced below),
# and `sents`, which is later passed to get_grams(), get_lda() and get_bert().
# NOTE(review): `sents` is presumably a list of sentences — confirm against preprocess().
book, sents = preprocess(pth)
# Bare last expression so the notebook displays the first 50 characters of the text.
book[:50]

'PSYCHOLOGY AND LIFE I Due world of science and lea'

# Extracting unigrams, bigrams, and trigrams from text and displaying the first 5 terms and their associated confidence scores

In [4]:
# Extract the n-gram terms together with their scores, then show the leading
# five entries of each list, one list per output line.
grams, gprobs = get_grams(sents)
print(grams[:5], gprobs[:5], sep="\n")

['psychology', 'must', 'facts', 'may', 'life']
[0.06937426713569982, 0.07654767828744867, 0.05709753695529087, 0.032849376399104575, 0.03151618519702946]


# Extracting topics using Latent Dirichlet Allocation (LDA), and displaying the first 5 terms with associated confidence scores

In [5]:
# Extract the LDA topic terms together with their scores, then show the leading
# five entries of each list, one list per output line.
ldatops, ldaprobs = get_lda(sents)
print(ldatops[:5], ldaprobs[:5], sep="\n")

['psychological', 'facts', 'psychology', 'one', 'every']
[0.036000000000000004, 0.041999999999999996, 0.118, 0.039, 0.014]


# Extracting topics using BERTopic, and displaying the first 5 terms with associated confidence scores

In [6]:
# Extract the BERTopic terms together with their scores, then show the leading
# five entries of each list, one list per output line.
btops, bprob = get_bert(sents)
print(btops[:5], bprob[:5], sep="\n")

['art', 'artistic', 'arts', 'painting', 'drawings']
[0.6334439516067505, 0.6284656524658203, 0.591050386428833, 0.558627724647522, 0.5397982001304626]


# Extracting named entities using spaCy, and displaying the first 5 term categories

In [7]:
# Run entity extraction over the full text held in `book`.
# NOTE(review): the captured output suggests `ents` is a list of lists of entity
# strings, one inner list per entity category — confirm against get_spacy().
ents = get_spacy(book)
# Only the first five entries are printed to keep the cell output manageable.
print(ents[:5])

[['PSYCHOLOGY AND LIFE', 'The Pedagogical Seminary', 'Napoleon', 'PSYCHOLOGY', 'microPSYCHOLOGY', 'Filling', 'PSYCHOLOGY AND LIFE', 'The “ Pedagogical Seminary', 'PSYCHOLOGY', 'Hamlet', 'Grooves', 'TIL', 'Napoleon', 'Society for Psychical Research', 'Thucydides', 'Neptune', 'PSYCHOLOGY', 'Cromwell', 'Bacon', 'PSYCHOLOGY AND LIFE 7', 'Lies', 'PSYCHOLOGY AND LIFE 5', 'Want and Feel, Luther and Bismarck', 'Hamlet', 'PSYCHOLOGY AND LIFE', 'Emerson', 'Irony', 'PSYCHOLOGY', 'Byron', 'Meredith', 'the Fellow Took', 'Harvard', 'Renaissance', 'Baldwin, Fully', 'Napoleon', 'generaPSYCHOLOGY', 'PSYCHOLOGY', 'Truth', 'PSYCHOLOGY', 'the “ Pedagogical Seminary', 'influPSYCHOLOGY', 'Beardsley', 'VIT', 'hypotheti3 PSYCHOLOGY', 'Byron', 'PSYCHOLOGY', 'PSYCHOLOGY AND LIFE 19'], ['Washington', 'Egypt', 'Greece', 'Germany', 'Germany', 'Newton', 'Athens', 'Thou', 'LIBRARY', 'Zoellner', 'Newton', 'Richer', 'Moscow', 'Germany', 'Fichte', 'Hello', 'Italy', 'China', 'Label', 'India'], ['Herbert', 'Stanley All',

# Performing hierarchical summarization using HuggingFace's summarization pipeline, and displaying final summary

# !!!!! WARNING, THIS TAKES A LONG TIME TO PROCESS !!!!!

In [8]:
# Number of 1024-character segments needed to cover `book`.
# Ceiling division is used instead of `int(len / 1024) + 1` so that a text whose
# length is an exact multiple of 1024 does not get an extra, empty trailing
# segment; at least one segment is always kept.
segs = max(1, (len(book) + 1023) // 1024)
# Accumulator for the per-segment summaries produced by the loop in the next cell.
summary = ""
# Summarization pipeline backed by the google-t5/t5-large checkpoint.
summarizer = pipeline("summarization", model="google-t5/t5-large")

In [None]:
# Hierarchical summarization: repeatedly cut `book` into fixed-size character
# chunks, summarize each chunk, and join the partial summaries into a shorter
# text, until the text splits into at most TARGET_PIECES pieces on ".".
#
# Fixes relative to the previous version of this cell:
#   * the first chunk was summarized twice per round (an `if` that should have
#     been an `elif`), duplicating content in every intermediate summary;
#   * the last chunk passed its *character* count as `max_length`, which the
#     pipeline interprets as a *token* budget;
#   * the loop could spin forever if a round failed to shorten the text;
#   * the round counter no longer shadows the builtin `iter`.
#
# NOTE: `book` is overwritten with the condensed text, so re-running this cell
# continues from the already-summarized text rather than the original.
CHUNK_CHARS = 1024         # characters of text handed to the summarizer per call
MAX_SUMMARY_TOKENS = 128   # generation cap per call (tokens, not characters)
TARGET_PIECES = 15         # stop once book.split(".") has at most this many pieces

round_idx = 0
while len(book.split(".")) > TARGET_PIECES:
    # Slicing on a fixed stride covers the text exactly once and never yields
    # an empty chunk, even when len(book) is a multiple of CHUNK_CHARS.
    chunks = [
        book[start:start + CHUNK_CHARS]
        for start in range(0, len(book), CHUNK_CHARS)
    ]

    partials = []
    for chunk in chunks:
        outputs = summarizer(
            chunk,
            max_length=MAX_SUMMARY_TOKENS,
            clean_up_tokenization_spaces=True,
        )
        partials.append(outputs[0]['summary_text'])
    condensed = " ".join(partials)

    print("\n***************************\n\n", round_idx, len(condensed))

    # Guard against an endless loop: if this round did not make the text
    # shorter, further rounds are not expected to converge either.
    if len(condensed) >= len(book):
        print("Summarization stopped shrinking the text; keeping the previous round.")
        break

    book = condensed
    round_idx += 1

print(book)