In [1]:
import requests
import pandas as pd
import spacy

In [17]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=d806445cbef0104eea4b477595d54bf6fbed8c5bc1ff888ceae74a8d3e068311
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes('ner', 'parser')

['ner', 'parser']

In [3]:
# text is Walter Pater's "The Renaissance: Studies in Art and Poetry"
response = requests.get('https://www.gutenberg.org/cache/epub/2398/pg2398.txt')
text = response.text

In [4]:
text[:300]

'\ufeffThe Project Gutenberg eBook of The Renaissance: Studies in Art and Poetry\r\n    \r\nThis ebook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\no'

In [5]:
text.find('Many attempts have been made by writers on art and poetry to define')

1127

In [6]:
text.find('*** END OF THE PROJECT GUTENBERG EBOOK THE RENAISSANCE: STUDIES IN ART AND POETRY ***')

349683

In [7]:
start = 1127
end = 349683 -1

In [8]:
renaissance = text[start:end]

In [9]:
renaissance_paras = renaissance.split('\r\n\r\n')

In [10]:
author = []
title = []

In [11]:
for para in renaissance_paras:
    author.append('Walter Pater')
    title.append('The Renaissance: Studies in Art and Poetry')

In [12]:
renaissance_df = pd.DataFrame(list(zip(author, title, renaissance_paras)), columns=['author', 'title', 'text'])

In [13]:
renaissance_df.head()

Unnamed: 0,author,title,text
0,Walter Pater,The Renaissance: Studies in Art and Poetry,Many attempts have been made by writers on art...
1,Walter Pater,The Renaissance: Studies in Art and Poetry,"""To see the object as in itself it really is,""..."
2,Walter Pater,The Renaissance: Studies in Art and Poetry,"The aesthetic critic, then, regards all the ob..."
3,Walter Pater,The Renaissance: Studies in Art and Poetry,"What is important, then, is not that the criti..."
4,Walter Pater,The Renaissance: Studies in Art and Poetry,Often it will require great nicety to disengag...


In [14]:
def process_text(text):
    """Remove new line characters and lemmatize text. Returns string of lemmas"""
    text = text.replace('\n', ' ')
    doc = nlp(text)
    tokens = [token for token in doc]
    no_stops = [token for token in tokens if not token.is_stop]
    no_punct = [token for token in no_stops if token.is_alpha]
    lemmas = [token.lemma_ for token in no_punct]
    lemmas_lower = [lemma.lower() for lemma in lemmas]
    lemmas_string = ' '.join(lemmas_lower)
    return lemmas_string

In [15]:
renaissance_df['lemmas'] = renaissance_df['text'].apply(process_text)

In [None]:
renaissance_df.head()

Unnamed: 0,author,title,text,lemmas
0,Walter Pater,The Renaissance: Studies in Art and Poetry,Many attempts have been made by writers on art...,attempt writer art poetry define beauty abstra...
1,Walter Pater,The Renaissance: Studies in Art and Poetry,"""To see the object as in itself it really is,""...",object justly say aim true criticism aesthetic...
2,Walter Pater,The Renaissance: Studies in Art and Poetry,"The aesthetic critic, then, regards all the ob...",aesthetic critic regard object work art fair f...
3,Walter Pater,The Renaissance: Studies in Art and Poetry,"What is important, then, is not that the criti...",important critic possess correct abstract defi...
4,Walter Pater,The Renaissance: Studies in Art and Poetry,Often it will require great nicety to disengag...,require great nicety disengage virtue commoner...


In [None]:
renaissance_df.to_csv('pater_dataframe.csv', index=False)

In [None]:
!pip install pyLDAvis

In [20]:
from collections import defaultdict
import wget
from gensim import corpora, models
import pandas as pd
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [21]:
df = pd.read_csv('/content/pater_dataframe.csv')

In [33]:
df

Unnamed: 0,author,title,text,lemmas
0,Walter Pater,The Renaissance: Studies in Art and Poetry,Many attempts have been made by writers on art...,attempt writer art poetry define beauty abstra...
1,Walter Pater,The Renaissance: Studies in Art and Poetry,"""To see the object as in itself it really is,""...",object justly say aim true criticism aesthetic...
2,Walter Pater,The Renaissance: Studies in Art and Poetry,"The aesthetic critic, then, regards all the ob...",aesthetic critic regard object work art fair f...
3,Walter Pater,The Renaissance: Studies in Art and Poetry,"What is important, then, is not that the criti...",important critic possess correct abstract defi...
4,Walter Pater,The Renaissance: Studies in Art and Poetry,Often it will require great nicety to disengag...,require great nicety disengage virtue commoner...
...,...,...,...,...
278,Walter Pater,The Renaissance: Studies in Art and Poetry,,
279,Walter Pater,The Renaissance: Studies in Art and Poetry,,
280,Walter Pater,The Renaissance: Studies in Art and Poetry,,
281,Walter Pater,The Renaissance: Studies in Art and Poetry,,


In [36]:
documents = df['lemmas'].to_list()

In [31]:
#jd
#some of these are 'float' ...not sure why
floats= []
for d in documents:
  if type(d)== float:
    floats.append(d)

In [32]:
floats #it looks like you just have some null values in your dataframe
#This is just because some of the requests you made might not have any data associated with them
#so pandas added a Null (aka NaN) value
#when you try to then split the NaN value like you would a string, it throws an error

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan]

In [34]:
#thankfully pandas has super easy built in function to remove null values from your data frame!
#this is why data cleaning is so important but also annoyingly tedious at times- messy data can absolutely cause errors like this!

clean = df.dropna()
clean

Unnamed: 0,author,title,text,lemmas
0,Walter Pater,The Renaissance: Studies in Art and Poetry,Many attempts have been made by writers on art...,attempt writer art poetry define beauty abstra...
1,Walter Pater,The Renaissance: Studies in Art and Poetry,"""To see the object as in itself it really is,""...",object justly say aim true criticism aesthetic...
2,Walter Pater,The Renaissance: Studies in Art and Poetry,"The aesthetic critic, then, regards all the ob...",aesthetic critic regard object work art fair f...
3,Walter Pater,The Renaissance: Studies in Art and Poetry,"What is important, then, is not that the criti...",important critic possess correct abstract defi...
4,Walter Pater,The Renaissance: Studies in Art and Poetry,Often it will require great nicety to disengag...,require great nicety disengage virtue commoner...
...,...,...,...,...
273,Walter Pater,The Renaissance: Studies in Art and Poetry,\r\nTo regard all things and principles of thi...,regard thing principle thing inconstant mode f...
274,Walter Pater,The Renaissance: Studies in Art and Poetry,Or if we begin with the inward world of though...,begin inward world thought feeling whirlpool r...
275,Walter Pater,The Renaissance: Studies in Art and Poetry,"Philosophiren, says Novalis, ist dephlegmatisi...",philosophiren say novalis ist dephlegmatisiren...
276,Walter Pater,The Renaissance: Studies in Art and Poetry,"To burn always with this hard, gemlike flame, ...",burn hard gemlike flame maintain ecstasy succe...


In [35]:
clean.to_csv('pater_dataframe_clean.csv')

In [37]:
texts =[
    [word for word in document.lower().split()]
    for document in documents
]