In [9]:
import os
import sys
import re
import pandas as pd
from transformers import pipeline

from ngrams import *
from ldatops import *
from parse import *
from run_bert import *
from run_spacy import *

# Enter path to desired text file below

In [2]:
# Path to the text file to analyse; it is handed to preprocess() in the next cell.
# A relative path like this one is resolved against the kernel's working directory.
pth = "test2.txt"

# Performing pre-processing of the book and displaying the first 50 characters of the text

In [3]:
# Run preprocess() (provided by one of the star imports in the first cell) on
# the input path and unpack its two return values.
# NOTE(review): `book` looks like the full text as a single string (it is sliced
# here and later split on "."), and `sents` is presumably the sentence list fed
# to the topic extractors -- confirm against the preprocess() implementation.
book, sents = preprocess(pth)
# Bare last expression: the notebook displays the first 50 characters of `book`.
book[:50]

# Extracting unigrams, bigrams, and trigrams from text and displaying the first 5 terms and their associated confidence scores

In [4]:
# Extract n-gram terms and their scores from `sents`, then preview the first
# five of each: terms on the first output line, scores on the second.
grams, gprobs = get_grams(sents)
print(grams[:5], gprobs[:5], sep="\n")

['psychology', 'must', 'facts', 'may', 'life']
[0.06937426713569982, 0.07654767828744867, 0.05709753695529087, 0.032849376399104575, 0.03151618519702946]


# Extracting topics using Latent Dirichlet Allocation (LDA), and displaying the first 5 terms with associated confidence scores

In [5]:
# Extract LDA topic terms and their scores from `sents`, then preview the first
# five of each: terms on the first output line, scores on the second.
ldatops, ldaprobs = get_lda(sents)
print(ldatops[:5], ldaprobs[:5], sep="\n")

['psychology', 'facts', 'psychological', 'one', 'different']
[0.102, 0.041, 0.044, 0.039, 0.008]


# Extracting topics using BERTopic, and displaying the first 5 terms with associated confidence scores

In [6]:
# Extract BERTopic terms and their scores from `sents`, then preview the first
# five of each: terms on the first output line, scores on the second.
btops, bprob = get_bert(sents)
print(btops[:5], bprob[:5], sep="\n")

['art', 'artistic', 'arts', 'painting', 'drawings']
[0.6334439516067505, 0.6284656524658203, 0.591050386428833, 0.558627724647522, 0.5397981405258179]


# Extracting named entities using spaCy, and displaying the first 5 term categories

In [7]:
# Run get_spacy() (provided by one of the star imports in the first cell) on
# the full preprocessed text.
# NOTE(review): judging by the printed output, `ents` is a list of lists with
# one inner list of entity strings per category -- confirm against run_spacy.
ents = get_spacy(book)
# Prints up to the first five elements of `ents`. Each element is itself a whole
# list of entities, so this output can be very long.
print(ents[:5])

[['PSYCHOLOGY AND LIFE', 'Byron', 'Emerson', 'Grooves', 'Truth', 'PSYCHOLOGY', 'Lies', 'Society for Psychical Research', 'PSYCHOLOGY', 'microPSYCHOLOGY', 'Filling', 'PSYCHOLOGY AND LIFE', 'TIL', 'Napoleon', 'Baldwin, Fully', 'Napoleon', 'Bacon', 'PSYCHOLOGY', 'Napoleon', 'PSYCHOLOGY', 'the Fellow Took', 'Hamlet', 'Thucydides', 'PSYCHOLOGY', 'Want and Feel, Luther and Bismarck', 'hypotheti3 PSYCHOLOGY', 'Hamlet', 'Byron', 'Neptune', 'generaPSYCHOLOGY', 'The Pedagogical Seminary', 'The “ Pedagogical Seminary', 'PSYCHOLOGY AND LIFE 7', 'Meredith', 'influPSYCHOLOGY', 'Harvard', 'PSYCHOLOGY', 'Irony', 'Beardsley', 'VIT', 'Renaissance', 'Cromwell', 'the “ Pedagogical Seminary', 'PSYCHOLOGY AND LIFE 5', 'PSYCHOLOGY AND LIFE 19', 'PSYCHOLOGY', 'PSYCHOLOGY AND LIFE'], ['India', 'Egypt', 'Washington', 'China', 'Italy', 'Greece', 'Label', 'Germany', 'Thou', 'Fichte', 'Newton', 'LIBRARY', 'Germany', 'Moscow', 'Germany', 'Richer', 'Hello', 'Newton', 'Zoellner', 'Athens'], ['Herbert', 'Henry James',

# Performing hierarchical summarization using HuggingFace's summarization pipeline, and displaying final summary

# !!!!! WARNING, THIS TAKES A LONG TIME TO PROCESS !!!!!

In [10]:
# Build the Hugging Face summarization pipeline backed by the T5-large checkpoint.
summarizer = pipeline("summarization", model="google-t5/t5-large")

# Accumulator for the chunk summaries produced during one summarization pass.
summary = ""

# Number of 1024-character segments the text is cut into: one more than the
# count of complete 1024-character blocks in `book`.
segs = len(book) // 1024 + 1

In [None]:
# Hierarchical summarization: cut the text into fixed-size chunks, summarize
# each chunk, join the chunk summaries, and repeat on the joined result until
# it is short enough.
#
# NOTE: this cell overwrites `book` with progressively shorter summaries, so
# re-run the preprocessing cell before running it a second time.
#
# Fixes relative to the previous version of this cell:
#   * the first chunk was summarized twice on every pass (an `if` that should
#     have been `elif`), and the whole text twice when there was one chunk;
#   * a text whose length was an exact multiple of the chunk size produced an
#     empty trailing chunk that was sent to the model with max_length=0;
#   * the final chunk's max_length was its character count, which effectively
#     removed the token cap applied to every other chunk;
#   * the pass counter was named `iter`, shadowing the builtin;
#   * nothing stopped the loop if a pass failed to make the text shorter.
CHUNK_CHARS = 1024        # characters of text sent to the summarizer per call
MAX_SUMMARY_TOKENS = 128  # upper bound on generated tokens per chunk summary
MAX_SENTENCE_PIECES = 15  # stop once book.split(".") has at most this many pieces

pass_num = 0
while len(book.split(".")) > MAX_SENTENCE_PIECES:
    chunk_summaries = []
    # Stepping by CHUNK_CHARS visits each chunk exactly once and never yields
    # an empty trailing chunk; only the last chunk can be shorter.
    for start in range(0, len(book), CHUNK_CHARS):
        chunk = book[start:start + CHUNK_CHARS]
        if not chunk.strip():
            continue  # nothing to summarize in a whitespace-only chunk
        outputs = summarizer(
            chunk,
            # A short final chunk must not be asked for more tokens than it has
            # characters; every full chunk gets the regular cap.
            max_length=min(MAX_SUMMARY_TOKENS, len(chunk)),
            clean_up_tokenization_spaces=True,
        )
        chunk_summaries.append(outputs[0]['summary_text'])

    condensed = " ".join(chunk_summaries)
    if len(condensed) >= len(book):
        # Without this guard a pass that does not shrink the text would make
        # the while-loop run forever.
        print("Summary stopped shrinking; keeping the result of the previous pass.")
        break

    book = condensed
    print("\n***************************\n\n", pass_num, len(book))
    pass_num += 1

print(book)