Copyright 2020 Almintas Povilaitis

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

<table class="tfo-notebook-buttons" align="left">
<td>
<a target="_blank"  href="https://colab.research.google.com/github/mlai-demo/NLP_Russell/blob/master/RussellPub.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
</td><td>
<a target="_blank"  href="https://github.com/mlai-demo/NLP_Russell/blob/master/RussellPub.ipynb"><img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a></td></table>

In [None]:
#adjusts the notebook look on the screen
#no need to run in Colab which does a good job adjusting the notebook window

from IPython.display import display, HTML

# CSS overrides for the notebook, menubar and toolbar containers; rendering
# the <style> element as HTML output applies them to the page.
notebook_css = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 95%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""
display(HTML(data=notebook_css))

In [None]:
#set up Gutenberg package
!sudo apt-get install libdb++-dev
!export BERKELEYDB_DIR=/usr
!pip install gutenberg

In [None]:
#add own stop words to those of Scikit-learn
import sklearn.feature_extraction.text as text

# Project-specific stop words. Every entry is followed by a comma: a missing
# comma makes Python silently concatenate two adjacent string literals into
# one bogus word. Each word is listed once.
new_stop_words = [
    'one', 'came', 'come', 'upon', 'made', 'though', 'indeed', 'yet', 'without',
    'thus', 'therefore', 'another', 'much', 'many', 'either', 'would',
    'around', 'when', 'also', 'could', 'say', 'sent', 'notwithstanding', 'hence',
    'bertrand', 'russell',
]
# Combine scikit-learn's ENGLISH_STOP_WORDS with the custom words in new_stop_words.
my_stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

In [None]:
#import texts from Gutenberg
import os
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

path = os.getcwd()
# Project Gutenberg e-text numbers of the works to analyse
text_list = [5827, 690, 2529, 25447, 4776, 44932, 37090, 17350, 55610, 52091]

# makedirs(exist_ok=True) lets the cell be re-run without failing on an existing directory
texts_dir = path + '/TextsPub'
os.makedirs(texts_dir, exist_ok=True)

# Fetch and clean every e-text once; the result feeds both the combined file
# and the per-text files. The loop variable is not called `text`, so the
# scikit-learn module imported under that name is not overwritten.
cleaned_texts = {
    etext_id: strip_headers(load_etext(etext_id)).strip()
    for etext_id in text_list
}

# write all into one file in the TextsPub directory; a blank line between
# texts keeps the last word of one book from fusing with the first of the next
with open(texts_dir + '/Russell.txt', 'w') as f:
    f.write('\n\n'.join(cleaned_texts.values()))

# write texts into separate files in the TextsPub directory, named by e-text number
for etext_id, cleaned in cleaned_texts.items():
    with open(f"{texts_dir}/{etext_id}", "w") as f:
        f.write(cleaned)

In [None]:
#tokenize the text and plot by word frequency

#import os #use only if cell above is not run first
import string
import re
import nltk
nltk.download('punkt') #if using nltk for the first time or using Colab
nltk.download('stopwords') #if using nltk for the first time or using Colab
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
%matplotlib inline

path = os.getcwd()

# Matches a word of one or two characters plus any non-word characters in front of it
no_short = re.compile(r'\W*\b\w{1,2}\b')

with open(path + '/TextsPub/Russell.txt') as f:
    text = f.read()

# No separate blank-line removal is needed: the tokens are re-joined with
# single spaces below, so empty lines never reach the output file.
text = no_short.sub('', text)
tokens = [w.lower() for w in word_tokenize(text)]

# Strip punctuation characters from each token, then keep only purely
# alphabetic tokens that are not stop words.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [w for w in stripped if w.isalpha() and w not in my_stop_words]
new_text = ' '.join(words)

# Save the cleaned tokens before plotting so a plotting problem cannot cost the file.
with open(path + '/TextsPub/Russell_tokens.txt', 'w') as out_f:
    out_f.write(new_text)

plt.figure(figsize=(18, 9))
fd = nltk.FreqDist(words)
fd.plot(40, title="40 Most Frequent Words", cumulative=False)

In [None]:
#report the total number of words and how many of them are distinct

unique = set(words)
total_count, unique_count = len(words), len(unique)
print("The text is {} words long and {} unique words".format(total_count, unique_count))

In [None]:
#lemmatize text

nltk.download('wordnet') #if using nltk for the first time or using Colab
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

# Read the token file, lemmatize token by token, and write the result as one space-separated line.
with open(path + '/TextsPub/Russell_tokens.txt') as f, open(path + '/TextsPub/Russell_lemma.txt', 'w') as out_f:
    text = f.read()
    tokens = word_tokenize(text)
    lemmed = list(map(lemma.lemmatize, tokens))
    new_lem_text = ' '.join(lemmed)
    out_f.write(new_lem_text)

unique_lem = set(lemmed)
lemma_summary = "The lemmatized text is {} words long and {} unique words"
print(lemma_summary.format(len(lemmed), len(unique_lem)))

In [None]:
#draw word cloud

import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt  

# Read the lemmatized corpus; the context manager closes the file handle.
# (The variable keeps its historical name although the text is lemmatized, not stemmed.)
with open('TextsPub/Russell_lemma.txt') as lemma_file:
    stemmed_text = lemma_file.read()

wordcloud = WordCloud(stopwords=my_stop_words,
                      max_font_size=400,
                      width=2500,
                      height=2000,
                      random_state=64,
                     ).generate(stemmed_text)

# Size the figure at creation so `fig` refers to the Figure itself
# (set_size_inches() returns None, which is what `fig` used to hold).
fig = plt.figure(figsize=(16, 16))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title('Bertrand Russell Wordcloud', fontsize=20)
plt.xlabel("size", fontsize=12)
plt.show()

In [None]:
#count lemmatized words and sort in decreasing order

from collections import Counter

# Read the lemmatized corpus; the context manager closes the file handle.
with open('TextsPub/Russell_lemma.txt') as lemma_file:
    lemmed_text = lemma_file.read()

# Counter is a dict subclass, so BagLems still maps each word to its count.
BagLems = Counter(lemmed_text.split(" "))
# most_common() returns (word, count) pairs sorted by decreasing count.
lemmed_data = BagLems.most_common()
print(lemmed_data[:40])

In [None]:
#rename files and move into a new directory

import shutil

old_txt_dir = path + "/TextsPub"
new_txt_dir = path + "/RenamedPub"

# Create the target through the same absolute path that is used for copying;
# exist_ok=True lets the cell be re-run without failing on an existing directory.
os.makedirs(new_txt_dir, exist_ok=True)

# Source file name (Gutenberg e-text number) -> descriptive title for the copy
renamed_titles = {
    '5827': 'Problems_Philosophy',
    '690': 'Roads_Freedom',
    '2529': 'Analysis_Mind',
    '25447': 'Mysticism_Logic',
    '4776': 'Political_Ideals',
    '44932': 'Free_Thought',
    '37090': 'Knowledge',
    '17350': 'Bolshevism',
    '55610': 'Why_Fight',
    '52091': 'Foundations_Geometry',
}

# shutil.copy returns the destination path of each copy
copied_files = [
    shutil.copy(f"{old_txt_dir}/{etext_id}", f"{new_txt_dir}/{title}.txt")
    for etext_id, title in renamed_titles.items()
]

# Individual names kept so anything still referring to file01..file10 keeps working
(file01, file02, file03, file04, file05,
 file06, file07, file08, file09, file10) = copied_files

In [None]:
#count number of texts

# Only .txt entries are treated as documents, so stray directory entries
# (for example a notebook checkpoint folder) are not handed to the vectorizer
# or counted in text_number.
texts = sorted(
    os.path.join(new_txt_dir, fn)
    for fn in os.listdir(new_txt_dir)
    if fn.endswith('.txt')
)
text_number = len(texts)
text_number

In [None]:
#list text titles in the same order as the files in `texts`

# File name without directory and extension, one title per entry of `texts`
text_titles = [os.path.splitext(os.path.basename(fn))[0] for fn in texts]

# `titles` is used positionally to label one row per file, so it must keep the
# order and length of `texts`. Re-sorting the bare titles or removing duplicates
# could shift labels (e.g. 'A-B.txt' sorts before 'A.txt', but 'A' before 'A-B').
titles = list(text_titles)
titles

In [None]:
#vectorize the text, create sparse and dense (numpy array) text matrices (tm), and the vocabulary

import numpy as np  
import sklearn.feature_extraction.text as text

# stop_words is passed as a list: newer scikit-learn releases validate the
# parameter type and do not accept a frozenset. min_df=text_number keeps only
# terms that occur in every one of the texts.
vectorizer = text.CountVectorizer(input='filename', stop_words=sorted(my_stop_words), min_df=text_number)

# Fit once and derive the dense array from the sparse result instead of
# running the whole vectorization a second time.
tm_sparse = vectorizer.fit_transform(texts)
tm_array = tm_sparse.toarray()

# get_feature_names() is absent from newer scikit-learn releases; use its
# replacement when available and fall back to the old name otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = np.asarray(vectorizer.get_feature_names_out())
else:
    vocab = np.array(vectorizer.get_feature_names())

In [None]:
# Show the dense array form of the vectorized texts.
tm_array

In [None]:
# Show the dimensions of the dense array.
tm_array.shape

In [None]:
#run non-negative matrix factorization

from sklearn import decomposition

# Number of components (topics) requested from every decomposition in this notebook
num_topics = 10

nmf_params = dict(n_components=num_topics, random_state=4, max_iter=1000)
try:
    # Keyword accepted by the scikit-learn releases this notebook was written for
    deco_nmf = decomposition.NMF(alpha=0.5, **nmf_params)
except TypeError:
    # Releases without `alpha` raise TypeError for the unknown keyword; they
    # take `alpha_W` instead.
    # NOTE(review): the newer regularisation terms appear to be scaled
    # differently, so topics may not match the old output exactly - confirm
    # against the installed scikit-learn's NMF documentation.
    deco_nmf = decomposition.NMF(alpha_W=0.5, **nmf_params)

In [None]:
# Echo the NMF estimator so the notebook shows its parameters.
deco_nmf

In [None]:
from time import time

# Fit NMF on the sparse matrix produced by the vectorizer cell and time it.
# The matrix is named tm_sparse; no `dtm_sparse` is defined anywhere in the notebook.
t0 = time()
text_topic_nmf = deco_nmf.fit_transform(tm_sparse)
print("Done in %0.3fs." % (time() - t0))

In [None]:
num_top_words = 10

# For each NMF component, look up the vocabulary entries at the indices with
# the largest values, largest first, keeping num_top_words of them.
topic_words_nmf = [
    [vocab[idx] for idx in np.argsort(weights)[::-1][:num_top_words]]
    for weights in deco_nmf.components_
]

In [None]:
#print the leading words of every NMF topic, one topic per line

for topic_no, top_words in enumerate(topic_words_nmf):
    joined_words = ' '.join(top_words[:10])
    print("NMF Topic {}: {}".format(topic_no, joined_words))

In [None]:
#for every text, print the indices of its four highest-valued NMF topics

for row_no, weights in enumerate(text_topic_nmf):
    ranked = np.argsort(weights)[::-1]  # topic indices, largest value first
    print("{}: {}".format(titles[row_no], ' '.join(map(str, ranked[:4]))))

In [None]:
#for every text, print all NMF topic indices ordered from highest to lowest value

for row_no, weights in enumerate(text_topic_nmf):
    ranked = np.argsort(weights)[::-1]  # topic indices, largest value first
    print("{}: {}".format(titles[row_no], ' '.join(map(str, ranked))))

In [None]:
# Sparse PCA with the same number of components as the other decompositions.
# The `normalize_components` keyword is no longer passed.
# NOTE(review): 'deprecated' appears to have been only the placeholder default
# in the releases that still accepted the keyword, while newer scikit-learn
# releases reject it - confirm against the installed version.
deco_spca = decomposition.SparsePCA(n_components=num_topics, random_state=4)
deco_spca

In [None]:
#run sparse principal component analysis

from time import time

# Fit sparse PCA on the dense array and report the elapsed wall-clock time.
fit_started = time()
text_topic_spca = deco_spca.fit_transform(tm_array)
fit_seconds = time() - fit_started
print("Done in %0.3fs." % fit_seconds)

In [None]:
# For each sparse-PCA component, look up the vocabulary entries at the indices
# with the largest values, largest first, keeping num_top_words of them.
topic_words = [
    [vocab[idx] for idx in np.argsort(weights)[::-1][:num_top_words]]
    for weights in deco_spca.components_
]

for topic_no, top_words in enumerate(topic_words):
    joined_words = ' '.join(top_words[:15])
    print("SPCA Topic {}: {}".format(topic_no, joined_words))

In [None]:
# For every text, print the indices of its four highest-valued sparse-PCA topics.
for row_no, weights in enumerate(text_topic_spca):
    ranked = np.argsort(weights)[::-1]  # topic indices, largest value first
    print("{}: {}".format(titles[row_no], ' '.join(map(str, ranked[:4]))))

In [None]:
# Keyword arguments for the LDA estimator, collected in one place
lda_options = {
    'n_components': num_topics,
    'batch_size': 64,
    'max_iter': 25,
    'learning_method': 'online',
    'learning_offset': 1.,
    'random_state': 4,
}
deco_lda = decomposition.LatentDirichletAllocation(**lda_options)

# Echo the estimator so the notebook shows its parameters.
deco_lda

In [None]:
#run latent dirichlet allocation

# Fit LDA on the dense array and report the elapsed wall-clock time.
fit_started = time()
text_topic_lda = deco_lda.fit_transform(tm_array)
fit_seconds = time() - fit_started
print("Done in %0.3fs." % fit_seconds)

In [None]:
# For each LDA component, look up the vocabulary entries at the indices with
# the largest values, largest first, keeping num_top_words of them.
topic_words_lda = [
    [vocab[idx] for idx in np.argsort(weights)[::-1][:num_top_words]]
    for weights in deco_lda.components_
]

for topic_no, top_words in enumerate(topic_words_lda):
    joined_words = ' '.join(top_words[:10])
    print("LDA Topic {}: {}".format(topic_no, joined_words))

In [None]:
# For every text, print the indices of its four highest-valued LDA topics.
for row_no, weights in enumerate(text_topic_lda):
    ranked = np.argsort(weights)[::-1]  # topic indices, largest value first
    print("{}: {}".format(titles[row_no], ' '.join(map(str, ranked[:4]))))

In [None]:
# Keyword arguments for the truncated SVD estimator, collected in one place
tsvd_options = {
    'n_components': num_topics,
    'n_iter': 50,
    'random_state': 4,
}
deco_tsvd = decomposition.TruncatedSVD(**tsvd_options)

# Echo the estimator so the notebook shows its parameters.
deco_tsvd

In [None]:
#run truncated singular value decomposition (latent semantic analysis)

# Fit truncated SVD on the sparse matrix and report the elapsed wall-clock time.
fit_started = time()
text_topic_tsvd = deco_tsvd.fit_transform(tm_sparse)
fit_seconds = time() - fit_started
print("Done in %0.3fs." % fit_seconds)

In [None]:
# For each truncated-SVD component, look up the vocabulary entries at the
# indices with the largest values, largest first, keeping num_top_words of them.
topic_words_tsvd = [
    [vocab[idx] for idx in np.argsort(weights)[::-1][:num_top_words]]
    for weights in deco_tsvd.components_
]

for topic_no, top_words in enumerate(topic_words_tsvd):
    joined_words = ' '.join(top_words[:10])
    print("TSVD Topic {}: {}".format(topic_no, joined_words))

In [None]:
# For every text, print the indices of its four highest-valued truncated-SVD topics.
for row_no, weights in enumerate(text_topic_tsvd):
    ranked = np.argsort(weights)[::-1]  # topic indices, largest value first
    print("{}: {}".format(titles[row_no], ' '.join(map(str, ranked[:4]))))

In [None]:
#run term frequency - inverse document frequency and create pairwise similarity
#matrix among the ten texts

from sklearn.feature_extraction.text import TfidfVectorizer

# Read every text; the context manager closes each file handle after reading.
documents = []
for doc_path in texts:
    with open(doc_path) as doc_file:
        documents.append(doc_file.read())

# stop_words is passed as a list: newer scikit-learn releases validate the
# parameter type and do not accept a frozenset.
tfidf = TfidfVectorizer(stop_words=sorted(my_stop_words)).fit_transform(documents)

# Matrix product of the tf-idf matrix with its transpose: entry (i, j) is the
# dot product of documents i and j. `@` is always a matrix product, whereas `*`
# depends on which sparse container type is returned.
# NOTE(review): with TfidfVectorizer's documented default norm ('l2') this dot
# product should equal cosine similarity - confirm if the norm is ever changed.
pairwise_similarity = tfidf @ tfidf.T
pairwise_similarity_matrix = pairwise_similarity.todense()
pairwise_similarity_matrix

In [None]:
#convert pairwise matrix into a dataframe for easier viewing

import pandas as pd

# Label rows and columns with the text titles, then round to three decimals.
psm_df = pd.DataFrame(pairwise_similarity_matrix, index=titles, columns=titles)
psm_df = psm_df.round(3)
psm_df

In [None]:
#uncomment the line below to save the dataframe to a csv file
#psm_df.to_csv('pairwise_df.csv')