In [3]:
import requests
import pandas as pd
import spacy

In [4]:
# Load the 'en_core_web_sm' spaCy pipeline used for lemmatisation below.
nlp = spacy.load("en_core_web_sm")
# Switch off the 'ner' and 'parser' components; process_text only reads
# is_stop, is_alpha and lemma_ from each token. The value returned by this
# call is what the cell displays.
nlp.disable_pipes('ner', 'parser')

['ner', 'parser']

In [5]:
# text is Walter Pater's "The Renaissance: Studies in Art and Poetry"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be processed as if it were the book.
response = requests.get('https://www.gutenberg.org/cache/epub/2398/pg2398.txt', timeout=30)
response.raise_for_status()
# The decoded text is kept exactly as returned (including the leading '\ufeff'
# character visible in the preview) so that character offsets are unchanged.
text = response.text

In [6]:
text[:300]

'\ufeffThe Project Gutenberg eBook of The Renaissance: Studies in Art and Poetry\r\n    \r\nThis ebook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\no'

In [7]:
# Character offset of the sentence that opens the first extracted paragraph.
text.find('Many attempts have been made by writers on art and poetry to define')

1127

In [8]:
# Character offset of the Project Gutenberg end-of-ebook marker line.
text.find('*** END OF THE PROJECT GUTENBERG EBOOK THE RENAISSANCE: STUDIES IN ART AND POETRY ***')

349683

In [9]:
# Derive the slice boundaries from the downloaded text instead of pasting in
# offsets from an earlier run, so they stay correct if the file's header or
# footer changes.
BODY_OPENING = 'Many attempts have been made by writers on art and poetry to define'
END_MARKER = '*** END OF THE PROJECT GUTENBERG EBOOK THE RENAISSANCE: STUDIES IN ART AND POETRY ***'

start = text.find(BODY_OPENING)
marker_pos = text.find(END_MARKER)
if start == -1 or marker_pos == -1:
    # str.find reports "not found" as -1, which would otherwise be used as a
    # valid (but wrong) slice index.
    raise ValueError('Could not locate the start or end of the book body in the downloaded text')
# As before, stop one character short of the end marker.
end = marker_pos - 1

In [10]:
# Keep only the characters between the `start` and `end` offsets.
renaissance = text[start:end]

In [11]:
# Paragraphs are separated by a blank line (CRLF line endings). Runs of several
# blank lines would otherwise produce empty or whitespace-only "paragraphs",
# which carry no words and end up as empty lemma strings, so they are dropped.
renaissance_paras = [para for para in renaissance.split('\r\n\r\n') if para.strip()]

In [12]:
# Per-paragraph metadata columns, filled in by the next cell.
author, title = [], []

In [13]:
# Add one author entry and one title entry for every paragraph.
n_paras = len(renaissance_paras)
author.extend(['Walter Pater'] * n_paras)
title.extend(['The Renaissance: Studies in Art and Poetry'] * n_paras)

In [14]:
# One row per paragraph: (author, title, paragraph text).
paragraph_rows = list(zip(author, title, renaissance_paras))
column_names = ['author', 'title', 'text']
renaissance_df = pd.DataFrame(paragraph_rows, columns=column_names)

In [15]:
renaissance_df.head()

Unnamed: 0,author,title,text
0,Walter Pater,The Renaissance: Studies in Art and Poetry,Many attempts have been made by writers on art...
1,Walter Pater,The Renaissance: Studies in Art and Poetry,"""To see the object as in itself it really is,""..."
2,Walter Pater,The Renaissance: Studies in Art and Poetry,"The aesthetic critic, then, regards all the ob..."
3,Walter Pater,The Renaissance: Studies in Art and Poetry,"What is important, then, is not that the criti..."
4,Walter Pater,The Renaissance: Studies in Art and Poetry,Often it will require great nicety to disengag...


In [16]:
def process_text(text):
    """Turn a passage into a space-separated string of lower-cased lemmas.

    Each '\\n' in ``text`` is replaced with a space before the passage is run
    through the module-level ``nlp`` pipeline. Stop words and tokens that are
    not purely alphabetic are skipped; the lemmas of the remaining tokens are
    lower-cased and joined with single spaces.
    """
    flattened = text.replace('\n', ' ')
    kept_lemmas = []
    for tok in nlp(flattened):
        # Same two filters as before: drop stop words, then anything non-alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return ' '.join(kept_lemmas)

In [17]:
# Lemmatise every paragraph and store the result alongside the raw text.
lemma_column = renaissance_df['text'].map(process_text)
renaissance_df['lemmas'] = lemma_column

In [18]:
renaissance_df.head()

Unnamed: 0,author,title,text,lemmas
0,Walter Pater,The Renaissance: Studies in Art and Poetry,Many attempts have been made by writers on art...,attempt writer art poetry define beauty abstra...
1,Walter Pater,The Renaissance: Studies in Art and Poetry,"""To see the object as in itself it really is,""...",object justly say aim true criticism aesthetic...
2,Walter Pater,The Renaissance: Studies in Art and Poetry,"The aesthetic critic, then, regards all the ob...",aesthetic critic regard object work art fair f...
3,Walter Pater,The Renaissance: Studies in Art and Poetry,"What is important, then, is not that the criti...",important critic possess correct abstract defi...
4,Walter Pater,The Renaissance: Studies in Art and Poetry,Often it will require great nicety to disengag...,require great nicety disengage virtue commoner...


In [19]:
# Persist the paragraph/lemma table; index=False keeps the row index out of the CSV.
renaissance_df.to_csv('pater_dataframe.csv', index=False)

In [20]:
from collections import defaultdict
import wget
from gensim import corpora, models
import pandas as pd
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [26]:
# NOTE(review): this reads 'pater_dataframe_clean.csv', but the only export in
# this notebook writes 'pater_dataframe.csv'; no visible cell creates the
# '_clean' file, so confirm how it is produced before relying on Run All.
# The displayed head also shows two 'Unnamed' index-like columns, which
# suggests the file was saved with its index included — TODO verify.
df = pd.read_csv('pater_dataframe_clean.csv')

In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,author,title,text,lemmas
0,0,Walter Pater,The Renaissance: Studies in Art and Poetry,Many attempts have been made by writers on art...,attempt writer art poetry define beauty abstra...
1,1,Walter Pater,The Renaissance: Studies in Art and Poetry,"""To see the object as in itself it really is,""...",object justly say aim true criticism aesthetic...
2,2,Walter Pater,The Renaissance: Studies in Art and Poetry,"The aesthetic critic, then, regards all the ob...",aesthetic critic regard object work art fair f...
3,3,Walter Pater,The Renaissance: Studies in Art and Poetry,"What is important, then, is not that the criti...",important critic possess correct abstract defi...
4,4,Walter Pater,The Renaissance: Studies in Art and Poetry,Often it will require great nicety to disengag...,require great nicety disengage virtue commoner...


In [28]:
# An empty lemma string written to CSV is read back by pandas as NaN (a float),
# and the tokenising step calls .lower() on every entry, so rows with a
# missing 'lemmas' value are dropped before building the document list.
documents = df['lemmas'].dropna().to_list()

In [29]:
type(documents)

list

In [30]:
# Lower-case each document and split it on whitespace into a list of tokens.
texts = [document.lower().split() for document in documents]

In [31]:
# Count how many times each token occurs across all documents.
# The outer loop variable is deliberately not called `text`: a top-level for
# loop rebinds its variable in the notebook namespace, and `text` already
# holds the downloaded book that earlier cells slice.
frequency = defaultdict(int)
for doc_tokens in texts:
    for token in doc_tokens:
        frequency[token] += 1

In [32]:
# Keep only tokens that occur more than once across the whole corpus.
recurring = {tok for tok, count in frequency.items() if count > 1}
texts = [
    [tok for tok in doc_tokens if tok in recurring]
    for doc_tokens in texts
]

In [33]:
# Build a gensim Dictionary from the filtered token lists.
dictionary = corpora.Dictionary(texts)

In [34]:
# Convert every token list to its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, texts))

In [35]:
# LDA training is randomised; fixing random_state makes the fitted topics (and
# anything built from them) reproducible across kernel restarts.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    passes=50,
    random_state=42,
)

In [36]:
# Turn on pyLDAvis' notebook rendering, then build the visualisation from the
# fitted model, the bag-of-words corpus and the dictionary.
# NOTE(review): newer pyLDAvis releases are reported to expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
# Leaving `vis` as the last expression makes the cell display it.
vis