In [None]:
# ALL IMPORTS

# -----------------------------------
# SpaCy IMPORTS
!pip install -U spacy

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download en_core_web_lg

# Check how many stopwords
#len(STOP_WORDS)

# Load the large English spaCy pipeline once; every helper below shares it
nlp = spacy.load('en_core_web_lg')

# Check if a word is a stopword:
#nlp.vocab['thy'].is_stop

# Extend spaCy's stop-word set with archaic pronouns and contraction leftovers
new_stop_words = ['thy', 'ye', 'thee', 'thou', 'll', 've']
STOP_WORDS.update(new_stop_words)

# -----------------------------------
# TRANSFORMERS IMPORTS
#!pip install transformers datasets
!pip install --no-cache-dir transformers datasets sentencepiece

from transformers import (pipeline, 
                          AutoModel, 
                          AutoTokenizer, 
                          AutoModelForSeq2SeqLM, 
                          AutoModelForCausalLM, 
                          DataCollatorForLanguageModeling, 
                          TrainingArguments, 
                          Trainer,
)

from datasets import load_dataset

# Topic Extractor + Summarization

## Load data / text preprocessing

In [None]:
# Load list of paragraphs
def txt2paragraph(filepath, encoding='utf-8'):
    """
    Lazily yield the paragraphs of a plain-text file.

    A paragraph is a maximal run of consecutive non-blank lines. Each line is
    stripped and the lines are concatenated with a single space in front of
    every line, so every yielded paragraph starts with one leading space
    (kept for backward compatibility; callers are expected to strip it).

    Parameters:
      filepath - path of the text file to read
      encoding - text encoding used to open the file (default: UTF-8, so the
                 result does not depend on the machine's locale)

    Yields: one non-empty string per paragraph. Nothing is yielded for an
    empty file or for trailing blank lines.
    """
    paragraph = ''
    # Iterate over the file object directly instead of loading all lines first
    with open(filepath, encoding=encoding) as f:
        for line in f:
            if line.isspace():  # blank line: closes the current paragraph, if any
                if paragraph:
                    yield paragraph
                    paragraph = ''
            else:
                paragraph += ' ' + line.strip()
    # Flush the last paragraph only if there is one (avoids yielding '')
    if paragraph:
        yield paragraph

# Collect every paragraph of the corpus, trimmed of surrounding whitespace
raw_paragraphs = [par.strip() for par in txt2paragraph('nietzsche.txt')]

# Basic preprocessing
import re
def paragraphPreprocess(raw_paragraphs: list):
  """
  Filter and clean a list of raw paragraph strings.

  Steps applied to each paragraph:
    1. Skip it if it is short (< 200 characters) and no text is being carried
       over, or if it contains "Footnote", "NOTE" or "Nietzsche".
    2. Drop every character before the first alphabetic one.
    3. Replace dashes with spaces, remove closing curly quotes/apostrophes,
       bracketed/parenthesised content and a few stray symbols.
    4. If the result ends with ':', ';', ',' or a letter, it is treated as
       unfinished and prepended to the next kept paragraph; otherwise it is
       appended to the output.

  Input: raw_paragraphs - list of strings
  Returns: list of cleaned paragraph strings. Text still being carried over
  when the input is exhausted is not emitted.
  """
  paragraphs = []
  prev = ''  # unfinished text carried over to the next paragraph
  for par in raw_paragraphs:
    # First we exclude short paragraphs and Footnotes
    if (len(par) < 200 and not prev) or "Footnote" in par or 'NOTE' in par or 'Nietzsche' in par:
      continue

    # Next remove non-alpha characters at the beginning of the paragraph
    # (left untouched when it has no alphabetic character at all)
    start = next((i for i, c in enumerate(par) if c.isalpha()), 0)
    par = par[start:]

    par = par.replace('—', ' ').replace('-', ' ')
    par = par.replace('”', '').replace("’", '')
    # Remove text between brackets/parentheses. Raw string: the original
    # non-raw literal relied on invalid escape sequences such as "\(".
    par = re.sub(r"[\(\[].*?[\)\]]", "", par)
    par = re.sub('[_\'{}()…="]', '', par)

    par = (prev + ' ' + par).strip()
    if not par:
      # Cleaning removed everything: nothing to keep, and par[-1] would fail
      continue

    if par[-1] in (':', ';', ',') or par[-1].isalpha():
      prev = par
    else:
      paragraphs.append(par)
      prev = ''
  return paragraphs

# Filter and clean the raw paragraphs
paragraphs = paragraphPreprocess(raw_paragraphs)

# Create a Pandas DataFrame out of our list of paragraphs
import pandas as pd
# One row per cleaned paragraph, stored in the single column 'paragraph'
df = pd.DataFrame(paragraphs, columns =['paragraph'])


# ----------------------------------------------------
### Tokenization and further preprocessing with SpaCy

# Split text into a list of SENTENCES with SpaCy
def split_in_sentences(text):
    """
    Split `text` into sentences using the shared spaCy pipeline.

    Input: text - a string
    Returns: list of sentence strings, each stripped of surrounding whitespace
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(str(span).strip())
    return sentences

# Further preprocessing with SpaCy
import string
def clean_text(text):
    '''
    Normalise a string for the bag-of-words steps.

    The text is lowercased, then, in this order: square-bracketed spans are
    removed, ASCII punctuation is removed, and every word containing a digit
    is removed. Whitespace is left as is.
    '''
    removal_patterns = (
        r'\[.*?\]',
        r'[%s]' % re.escape(string.punctuation),
        r'\w*\d\w*',
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Apply clean_text to every paragraph; the result is a one-column DataFrame
# whose column is still named 'paragraph'
df_clean = pd.DataFrame(df.paragraph.apply(lambda x: clean_text(x)))

def lemmatizer(text):
    """
    Replace every token of `text` by its spaCy lemma.

    Input: text - a string
    Returns: the lemmas of all tokens joined by single spaces
    """
    return " ".join(token.lemma_ for token in nlp(text))
    
# Lemmatize the cleaned text row by row and store it next to the original
# paragraph; this column is the input of the TF-IDF vectorizer below
df["paragraph_lemmatize"] =  df_clean.apply(lambda x: lemmatizer(x['paragraph']), axis=1)
#df.head()

## Topic extraction: Non-negative Matrix Factorization

In [13]:
# Create a DOCUMENT TERM MATRIX
from sklearn.feature_extraction.text import TfidfVectorizer
# scikit-learn only accepts 'english', a list or None for `stop_words`
# (a set is rejected by parameter validation in recent releases), so the
# spaCy stop-word set is passed as a sorted list.
tfidf = TfidfVectorizer(max_df=0.95, min_df=3, stop_words=sorted(STOP_WORDS))

dtm = tfidf.fit_transform(df['paragraph_lemmatize'])

# Create an instance of NMF with n_comp components
from sklearn.decomposition import NMF
n_comp = 8
# `init` is pinned to 'nndsvd' so the factorization does not change when the
# library default changes (see the recorded warning below this cell).
# NOTE(review): the recorded run also hit the default iteration limit;
# consider raising `max_iter` if better convergence is wanted.
nmf_model = NMF(n_components=n_comp, init='nndsvd', random_state=42)
nmf_model.fit(dtm)

# Print the most common words for each topic
feature_names = tfidf.get_feature_names_out()  # computed once, not per topic
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP WORDS FOR TOPIC #{index}')
    # argsort is ascending: the last 20 indices are the highest-weighted terms
    print([feature_names[i] for i in topic.argsort()[-20:]])
    print('\n')

# ASSIGN Topic to paragraphs and COUNT paragraphs/topic

# Each paragraph is assigned the topic with the largest NMF weight
topic_results = nmf_model.transform(dtm)
df['topic'] = topic_results.argmax(axis=1)

df.groupby(['topic']).size()

#df.head(10)
#df[df['topic'] == topic].head() # Filter df by topic


The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).


Maximum number of iterations 200 reached. Increase it to improve convergence.



THE TOP WORDS FOR TOPIC #0
['pain', 'high', 'animal', 'feel', 'cause', 'woman', 'time', 'long', 'power', 'nature', 'self', 'soul', 'strong', 'bad', 'thing', 'know', 'virtue', 'great', 'good', 'man']


THE TOP WORDS FOR TOPIC #1
['word', 'silent', 'stand', 'night', 'cry', 'look', 'long', 'laugh', 'day', 'speak', 'mountain', 'cave', 'unto', 'hath', 'like', 'hear', 'heart', 'come', 'spake', 'zarathustra']


THE TOP WORDS FOR TOPIC #2
['time', 'day', 'artist', 'let', 'richard', 'europe', 'culture', 'goethe', 'spirit', 'like', 'great', 'book', 'art', 'taste', 'people', 'germany', 'musician', 'music', 'wagner', 'german']


THE TOP WORDS FOR TOPIC #3
['poet', 'appearance', 'hero', 'drama', 'æsthetic', 'artistic', 'picture', 'greek', 'dream', 'nature', 'phenomenon', 'chorus', 'myth', 'tragic', 'world', 'apollonian', 'tragedy', 'music', 'art', 'dionysian']


THE TOP WORDS FOR TOPIC #4
['faith', 'save', 'jewish', 'hate', 'punishment', 'act', 'child', 'concept', 'evil', 'thing', 'holy', 'shall', 

topic
0    648
1    313
2    199
3    230
4    195
5    353
6    110
7    360
dtype: int64

## Text to Paragraph

In [15]:
def textToParagraph(text):
  """
  Input: text - a string of text
  Output: Full Paragraph from the DataFrame that best matches with input text

  Description:
  text -> predict Topic -> compute similarity ONLY with paragraphs in this Topic
       -> return paragraph with highest similarity score

  The input is cleaned and lemmatized like the corpus, and is compared with
  the precomputed 'paragraph_lemmatize' column of `df`, so paragraphs are not
  cleaned and lemmatized again on every call.

  Raises: ValueError if no paragraph is assigned to the predicted topic.
  """

  # PREDICT Topic
  text = lemmatizer(clean_text(text))
  X = tfidf.transform([text]) # transform the TF-IDF
  nmf_features = nmf_model.transform(X) # get the nmf_features (score) vector
  topic = nmf_features.argmax()

  # Lemmatized paragraphs assigned to the predicted topic
  candidates = df.loc[df['topic'] == topic, 'paragraph_lemmatize']
  if candidates.empty:
    raise ValueError(f"No paragraph is assigned to topic {topic}")

  # Compute SIMILARITY with paragraphs in this Topic.
  # The query is parsed once, outside of the per-paragraph loop.
  query_doc = nlp(text)
  scores = candidates.apply(lambda lemmas: query_doc.similarity(nlp(lemmas)))

  # Return most similar paragraph (original, uncleaned text)
  best_id = scores.idxmax()
  return df.loc[best_id, 'paragraph']

# Summarization

In [None]:
# Load summarization pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_1_name = "facebook/bart-large-cnn"
tokenizer_1 = AutoTokenizer.from_pretrained(model_1_name)
model_1 = AutoModelForSeq2SeqLM.from_pretrained(model_1_name)

# Pass the already-loaded model object (not its name) so the checkpoint is
# not loaded a second time by the pipeline
summarizer = pipeline("summarization", model=model_1, tokenizer=tokenizer_1)

# TRUNCATE string to TWO SENTENCES
def firstTwoSentences(s: str):
  """
  Input: string s
  Returns: first two sentences in s

  Any unfinished text after the last '.', '?' or '!' is dropped first; if s
  contains none of these characters it is used unchanged. The remaining text
  is split into sentences and the first two are joined with a space.
  """
  # Position of the LAST stop symbol, whichever of the three it is
  # (rfind returns -1 for a symbol that does not occur)
  last_stop = max(s.rfind(mark) for mark in '.?!')
  if last_stop >= 0:
    s = s[:last_stop + 1]

  # Return the first two sentences
  return ' '.join(split_in_sentences(s)[:2])

# SUMMARY snippet
def summary(paragraph: str):
  """
  Input: paragraph - a string
  Output: summary - a string. The summary of the paragraph, cut to at most
          its first two sentences.
  Parameters: min_length is 10% of the paragraph's token count (at least 1)
              and max_length is 5 * min_length, i.e. about 50%.
  """
  # Token count of the input as the model will see it (truncated to the
  # tokenizer's maximum length)
  n_tokens = len(tokenizer_1(paragraph, truncation=True)['input_ids'])
  # Never ask for a zero-length summary, which happens below 10 tokens
  min_length = max(1, n_tokens // 10)
  max_length = 5 * min_length
  # truncation=True keeps over-long paragraphs within the model's input limit
  result = summarizer(paragraph, min_length=min_length, max_length=max_length, truncation=True)
  return firstTwoSentences(result[0]['summary_text'])

In [18]:
# Print text by sentences
def printBySentence(text: str):
  """Print `text` one sentence per line; prints nothing if there is no sentence."""
  sentences = split_in_sentences(text)
  if sentences:
    print('\n'.join(sentences))
  return None

# TEST SUMMARIZER with a few paragraphs
import random
for step in range(3):
  # randrange excludes the upper bound; randint(0, len(...)) could return
  # len(...) itself, which is not a valid row
  t = random.randrange(len(df))
  paragraph = df['paragraph'].iloc[t]
  print("PARAGRAPH:")
  printBySentence(paragraph)

  print("\nSUMMARY:")
  printBySentence(summary(paragraph))
  print(100*'-')

PARAGRAPH:
Our deepest insights must  and should  appear as follies, and under certain circumstances as crimes, when they come unauthorizedly to the ears of those who are not disposed and predestined for them.
The exoteric and the esoteric, as they were formerly distinguished by philosophers  among the Indians, as among the Greeks, Persians, and Mussulmans, in short, wherever people believed in gradations of rank and NOT in equality and equal rights  are not so much in contradistinction to one another in respect to the exoteric class, standing without, and viewing, estimating, measuring, and judging from the outside, and not from the inside; the more essential distinction is that the class in question views things from below upwards  while the esoteric class views things FROM ABOVE DOWNWARDS.
There are heights of the soul from which tragedy itself no longer appears to operate tragically; and if all the woe in the world were taken together, who would dare to decide whether the sight of 

# Text to (relevant) paragraph

In [19]:
# TEST: retrieve and summarize the best-matching paragraph for a few questions

testQuestion = [
  'How much power should people have in european politics?',
  'Can religion save the evil in the world?',
  'What is the meaning of Love, to live in a more peaceful world?',
  'Is Nihilism an alternative to hope?',
]

for text in testQuestion:
  print(100*'-')
  print("Input: ", text, '\n')
  paragraph = textToParagraph(text)
  printBySentence(summary(paragraph))

----------------------------------------------------------------------------------------------------
Input:  How much power should people have in european politics? 

Can a crasser, more indolent, and more lounging form of Christian belief be imagined, than that of the average German Protestant?
----------------------------------------------------------------------------------------------------
Input:  Can religion save the evil in the world? 

The Christian concept of a god is one of the most corrupt concepts that has ever been set up in the world.
God degenerated into the contradiction of life.
----------------------------------------------------------------------------------------------------
Input:  What is the meaning of Love, to live in a more peaceful world? 

What concerns me is the psychological type of the Saviour.
This type might be depicted in the Gospels, in however mutilated a form and however much overladen with extraneous characters.
------------------------------------

# Question Generator

In [None]:
# Load the TOKENIZER
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token of this same tokenizer for padding
# (the original referenced an undefined name, `tokenizer_2`)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# GENERATE QUESTION WITH FINE-TUNED MODEL

# Load the fine-tuned model from local (a file pytorch_model.bin must be in the current directory)
# NOTE(review): "./" is relative to the kernel's working directory; presumably
# the model's config.json must sit next to the weights as well - confirm.
model = AutoModelForCausalLM.from_pretrained("./")

# Snippet to TRUNCATE strings to QUESTION mark
def questionTruncate(s: str):
  """
  Trim generated text down to a clean question or statement.
  Input: string s
  Returns: s cut right after its FIRST "?" if it has one; otherwise cut right
           after its LAST "."; otherwise s unchanged.
  """
  first_question = s.find("?")
  if first_question != -1:
    return s[:first_question + 1]

  last_period = s.rfind(".")
  if last_period != -1:
    return s[:last_period + 1]

  return s

# Test
#for s in ['Multiple? more than one?', 'First? Then no question.', 'No punctuation', 'No question.']:
#  print(questionTruncate(s))


# QUESTION GENERATOR

def questionGenerator(text: str, max_new_tokens: int = 80):
  """
  Generate a continuation of `text` with the fine-tuned causal language model.

  Input: text - prompt string; the end-of-sequence token is appended to it
         max_new_tokens - upper bound on the number of generated tokens
  Returns: the generated continuation only (prompt removed), decoded to a
           string without special tokens. Sampling is used, so the output
           differs between calls.
  """
  # ENCODE input and add 'end-of-string' token
  input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
  # Prompt length in tokens: the LAST dimension. len(input_ids) would be the
  # batch size (1), which capped the total length at 81 tokens.
  prompt_length = input_ids.shape[-1]
  # GENERATE (early_stopping is omitted: it only concerns beam search)
  chat_history_ids = model.generate(
      input_ids,
      max_length=prompt_length + max_new_tokens,
      do_sample=True,
      top_p=0.91,
      top_k=10,
      temperature=0.75,
      pad_token_id=tokenizer.eos_token_id
  )

  # DECODE to string, skipping the prompt tokens
  output = tokenizer.decode(chat_history_ids[:, prompt_length:][0], skip_special_tokens=True)
  return output

In [15]:
# Test
# The generator samples its output, so the printed answers differ between runs
for text in ['What does Nihilism mean?', 'What is Pessimism', 'What is the purpose?', 'Prejudice against science?']:
  print(f"Text: {text} \n\nBot: {questionGenerator(text)} \n", 100*'-')

Text: What does Nihilism mean? 

Bot: The world of the modern man is not always a good thing. What is the meaning of the word?--The term is a term which means the word itself. To put it simply, there is no better word than a German word for a German word for a German word. What is the meaning of a German word for the German word for the German word for 
 ----------------------------------------------------------------------------------------------------
Text: What is Pessimism 

Bot: What is the meaning of the word pessimism?In the present day, it is the concept of the word pessimism.The term pessimism is the expression of a desire to attain to an ideal.In order to understand why, why?Because pessimism is an expression of a desire to attain to an ideal, it is a necessary means of overcoming the prejudices of the 
 ----------------------------------------------------------------------------------------------------
Text: What is the purpose? 

Bot: This is what is called the meaning of t

# PhilosopherBot

In [16]:
"""
MAIN LOGIC IS AS FOLLOWS:
input -> topic -> relevant paragraph 
-> use last sentence of paragraph to generate question 
-> print(question, paragraph, summarization of previous two)
"""

def bot(text):
  """
  Answer `text` by printing a summarized paragraph followed by a generated
  follow-up question.

  Input: text - the user's input string
  Returns: None; everything is printed.

  The paragraph best matching the input is retrieved and summarized, the
  summary is printed, and its last sentence is used as the prompt of the
  question generator. The generated text is truncated and printed one
  sentence per line. If the summary has no sentence, only the (empty)
  summary is printed.
  """
  paragraph = textToParagraph(text)
  summarized_paragraph = summary(paragraph)

  print(summarized_paragraph)

  sentences = split_in_sentences(summarized_paragraph)
  if not sentences:
    # Nothing to seed the question generator with
    return

  question = questionTruncate(questionGenerator(sentences[-1]))
  for sent in split_in_sentences(question):
    print(sent)
  return

In [None]:
# Build a chain of inputs: each input is the text generated from the previous one
testQuestion = []
text = 'Tell me about God and the meaning of Life'
for step in range(5):
  testQuestion.append(text)
  # The text generated in the last iteration is never appended to the list
  text = questionGenerator(text)

# Run the bot on every input of the chain
for text in testQuestion:
  print("Input text:\n" + text)
  print("Bot: \n")
  bot(text)
  print(100 * '-')

In [18]:
# Run the full bot on a fixed set of questions
testQuestion = [
  'How much power should people have in european politics?',
  'Can religion save the evil in the world?',
  'What is the meaning of Love, to live in a more peaceful world?',
  'Is Nihilism an alternative to hope?',
  'If children grow up in this World, how can we teach them how to live more sustainably?',
]

for text in testQuestion:
  print(100 * '-')
  print(text + '\n')
  bot(text)


----------------------------------------------------------------------------------------------------
How much power should people have in european politics?

Can a crasser, more indolent, and more lounging form of Christian belief be imagined, than that of the average German Protestant?
When you think of it as a matter of fact, this is what it is called, and is a word for it.
----------------------------------------------------------------------------------------------------
Can religion save the evil in the world?

The Christian concept of a god is one of the most corrupt concepts that has ever been set up in the world. God degenerated into the contradiction of life.
A man is dying for the world to know, but what?
----------------------------------------------------------------------------------------------------
What is the meaning of Love, to live in a more peaceful world?

What concerns me is the psychological type of the Saviour. This type might be depicted in the Gospels, in howe