### Import packages


In [59]:
import json
import os
import string
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import PlaintextCorpusReader, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


### Create documents corpus

In [60]:
corpus_dir = "./Literature-original"
corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt')
files_names = corpus.fileids()
files_names

  corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt')


['Chronicles of Narnia. Prince Caspian.txt',
 'Chronicles of Narnia. The Horse and His Boy.txt',
 'Chronicles of Narnia. The Last Battle.txt',
 'Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt',
 'Chronicles of Narnia. The Magicians Nephew.txt',
 'Chronicles of Narnia. The Silver Chair.txt',
 'Chronicles of Narnia. The Voyage of the Dawn Treader.txt',
 'Fantastic Beasts and Where to Find Them.txt',
 'Fantastic Beasts. The Crimes of Grindelwald.txt',
 'Fantastic Beasts. The Secrets of Dumbledore.txt',
 'Harry Potter and the Chamber of Secrets.txt',
 'Harry Potter and the Deathly Hallows Part 1.txt',
 'Harry Potter and the Deathly Hallows Part 2.txt',
 'Harry Potter and the Goblet of Fire.txt',
 'Harry Potter and the Half-Blood Prince.txt',
 'Harry Potter and the Order of the Phoenix.txt',
 'Harry Potter and the Philosophers Stone.txt',
 'Harry Potter and the Prisoner of Azkaban.txt',
 'Twilight Saga. Breaking Dawn Part 1.txt',
 'Twilight Saga. Breaking Dawn Part 2.txt',
 

### Corpus documents preprocessing

In [61]:
documents = {}
for file_name in files_names:
    documents[file_name] = corpus.raw(file_name)
#print(json.dumps(documents, indent=4, ensure_ascii=False))

In [62]:
length = {}
for file_name in documents:
    length[file_name] = {
        "pre": len(word_tokenize(documents[file_name])),
    }
#print(json.dumps(length, indent=4, ensure_ascii=False))

In [63]:
ps = PorterStemmer()

In [64]:
length_post = {}
for file_name in documents:
    documents[file_name] = documents[file_name].lower()
    documents[file_name] = "".join([char for char in documents[file_name] if char not in string.punctuation])
    documents[file_name] = "".join([char for char in documents[file_name] if char not in string.digits])
    documents[file_name] = " ".join([ps.stem(word) for word in word_tokenize(documents[file_name])])
    documents[file_name] = " ".join(word for word in word_tokenize(documents[file_name]) if word not in stopwords.words('english'))
#print(json.dumps(documents, indent=4, ensure_ascii=False))


In [65]:
for file_name in documents:
    length[file_name]["post"] = len(word_tokenize(documents[file_name]))
#print(json.dumps(length, indent=4, ensure_ascii=False))

In [66]:
lengths = pd.DataFrame.from_dict(length, orient='index')


In [67]:
lengths['diff'] = lengths['pre'] - lengths['post']
lengths['pct'] = lengths['diff'] / lengths['pre']
lengths


Unnamed: 0,pre,post,diff,pct
Chronicles of Narnia. Prince Caspian.txt,657,339,318,0.484018
Chronicles of Narnia. The Horse and His Boy.txt,850,448,402,0.472941
Chronicles of Narnia. The Last Battle.txt,1101,562,539,0.489555
"Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt",793,389,404,0.509458
Chronicles of Narnia. The Magicians Nephew.txt,1250,622,628,0.5024
Chronicles of Narnia. The Silver Chair.txt,1275,620,655,0.513725
Chronicles of Narnia. The Voyage of the Dawn Treader.txt,1203,595,608,0.505403
Fantastic Beasts and Where to Find Them.txt,765,416,349,0.456209
Fantastic Beasts. The Crimes of Grindelwald.txt,761,440,321,0.421813
Fantastic Beasts. The Secrets of Dumbledore.txt,635,360,275,0.433071


### Create frequency matrix

In [68]:
docs = pd.DataFrame.from_dict(documents, orient='index')
docs.columns = ['content']
docs


Unnamed: 0,content
Chronicles of Narnia. Prince Caspian.txt,peter susan edmund luci pevensi magic whisk aw...
Chronicles of Narnia. The Horse and His Boy.txt,boy name shasta ha live life rememb southern p...
Chronicles of Narnia. The Last Battle.txt,western region narnia clever greedi ape shift ...
"Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt",peter susan edmund luci pevensi evacu london e...
Chronicles of Narnia. The Magicians Nephew.txt,stori begin london dure summer two children di...
Chronicles of Narnia. The Silver Chair.txt,eustac scrubb reform charact follow event voya...
Chronicles of Narnia. The Voyage of the Dawn Treader.txt,two youngest pevensi children luci edmund stay...
Fantastic Beasts and Where to Find Them.txt,british wizard magizoologist newton newt scama...
Fantastic Beasts. The Crimes of Grindelwald.txt,magic congress unit state america macusa trans...
Fantastic Beasts. The Secrets of Dumbledore.txt,albu dumbledor gellert grindelwald briefli mee...


In [69]:
tv = TfidfVectorizer()
cv = CountVectorizer()
matrix_tfidf = tv.fit_transform(docs['content'])
matrix_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6019 stored elements and shape (23, 2503)>

In [70]:
sparcity_tfidf = 1-(matrix_tfidf.getnnz()/(matrix_tfidf.shape[0]*matrix_tfidf.shape[1]))
sparcity_tfidf

0.8954472024874498

### Directories for results

In [71]:
if not os.path.exists('./worldclouds'):
    os.mkdir('./wordclouds')
if not os.path.exists('./topic_modeling'):
    os.mkdir('./topic_modeling')
if not os.path.exists('./topic_modeling/topics'):
    os.mkdir('./topic_modeling/topics')
if not os.path.exists('./topic_modeling/documents'):
    os.mkdir('./topic_modeling/documents')
if not os.path.exists('./clustering'):
    os.mkdir('./clustering')
if not os.path.exists('./ngrams'):
    os.mkdir('./ngrams')
    

### Worldclouds

In [72]:
wordcloud = WordCloud(
    background_color ='white',
    max_words = 5000,
    contour_width =3,
    contour_color ='steelblue'
)

In [73]:
for index, row in docs.iterrows():
    wordcloud.generate(row['content'])
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(index.replace(".txt",""))
    plt.savefig('./wordclouds/{}'.format(index.replace("txt","png")))
    plt.close()

### Topic Modeling

In [74]:
def plot_top_words(model, feature_names, n_top_words, title, size):
    colors = ["red", "blue", "green", "orange", "purple", "brown", "pink", "gray", "olive", "cyan"]
    fig, axes = plt.subplots(*size, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7, color=colors[topic_idx])
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.savefig(f'./topic_modeling/topics/{title}.png')
    plt.close()

In [75]:
def plot_documents(model, matrix, n_topics, title):
    colors = ["red", "blue", "green", "orange", "purple", "brown", "pink", "gray", "olive", "cyan"]
    docs_topics = pd.DataFrame(model.transform(matrix), columns=[f'Topic {x}' for x in range(n_topics)])
    docs_topics.index = [file_name.replace(".txt","") for file_name in files_names]
    plt.figure(figsize=(7, 4))
    left = [0] * len(docs_topics)
    for i, col in enumerate(docs_topics.columns):
        plt.barh(docs_topics.index, docs_topics[col], left=left, label=col, color=colors[i])
        left = [left[j] + docs_topics[col].iloc[j] for j in range(len(docs_topics))]
    plt.savefig(f'./topic_modeling/documents/{title}.png')
    plt.close()

In [76]:
n_topics = 10
features_names = cv.get_feature_names_out()
n_top_words = 20
size = (2, 5)

NotFittedError: Vocabulary not fitted or provided

### Clustering

### N-grams