### Import packages


In [190]:
import json
import os
import string
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import nltk
import numpy as np
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import PlaintextCorpusReader, stopwords
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


### Create documents corpus

In [191]:
corpus_dir = "./Literature-original"
corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt')
files_names = corpus.fileids()
files_names

  corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt')


['Chronicles of Narnia. Prince Caspian.txt',
 'Chronicles of Narnia. The Horse and His Boy.txt',
 'Chronicles of Narnia. The Last Battle.txt',
 'Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt',
 'Chronicles of Narnia. The Magicians Nephew.txt',
 'Chronicles of Narnia. The Silver Chair.txt',
 'Chronicles of Narnia. The Voyage of the Dawn Treader.txt',
 'Fantastic Beasts and Where to Find Them.txt',
 'Fantastic Beasts. The Crimes of Grindelwald.txt',
 'Fantastic Beasts. The Secrets of Dumbledore.txt',
 'Harry Potter and the Chamber of Secrets.txt',
 'Harry Potter and the Deathly Hallows Part 1.txt',
 'Harry Potter and the Deathly Hallows Part 2.txt',
 'Harry Potter and the Goblet of Fire.txt',
 'Harry Potter and the Half-Blood Prince.txt',
 'Harry Potter and the Order of the Phoenix.txt',
 'Harry Potter and the Philosophers Stone.txt',
 'Harry Potter and the Prisoner of Azkaban.txt',
 'Twilight Saga. Breaking Dawn Part 1.txt',
 'Twilight Saga. Breaking Dawn Part 2.txt',
 

### Corpus documents preprocessing

In [192]:
documents = {}
for file_name in files_names:
    documents[file_name] = corpus.raw(file_name)
#print(json.dumps(documents, indent=4, ensure_ascii=False))

In [193]:
length = {}
for file_name in documents:
    length[file_name] = {
        "pre": len(word_tokenize(documents[file_name])),
    }
#print(json.dumps(length, indent=4, ensure_ascii=False))

In [194]:
ps = PorterStemmer()

In [195]:
length_post = {}
for file_name in documents:
    documents[file_name] = documents[file_name].lower()
    documents[file_name] = "".join([char for char in documents[file_name] if char not in string.punctuation])
    documents[file_name] = "".join([char for char in documents[file_name] if char not in string.digits])
    documents[file_name] = " ".join([ps.stem(word) for word in word_tokenize(documents[file_name])])
    documents[file_name] = " ".join(word for word in word_tokenize(documents[file_name]) if word not in stopwords.words('english'))
#print(json.dumps(documents, indent=4, ensure_ascii=False))


In [196]:
for file_name in documents:
    length[file_name]["post"] = len(word_tokenize(documents[file_name]))
#print(json.dumps(length, indent=4, ensure_ascii=False))

In [197]:
lengths = pd.DataFrame.from_dict(length, orient='index')


In [198]:
lengths['diff'] = lengths['pre'] - lengths['post']
lengths['pct'] = lengths['diff'] / lengths['pre']
#lengths


### Create frequency matrix

In [199]:
docs = pd.DataFrame.from_dict(documents, orient='index')
docs.columns = ['content']
#docs


In [200]:
tv = TfidfVectorizer()
cv = CountVectorizer()
matrix_tfidf = tv.fit_transform(docs['content'])
matrix_tfidf
matrix_tf = cv.fit_transform(docs['content'])
matrix_tf

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 6019 stored elements and shape (23, 2503)>

In [201]:
sparcity_tfidf = 1-(matrix_tfidf.getnnz()/(matrix_tfidf.shape[0]*matrix_tfidf.shape[1]))
sparcity_tfidf

0.8954472024874498

### Directories for results

In [202]:
try: 
    if not os.path.exists('./worldclouds'):
        os.mkdir('./wordclouds')
    if not os.path.exists('./topic_modeling'):
        os.mkdir('./topic_modeling')
    if not os.path.exists('./topic_modeling/topics'):
        os.mkdir('./topic_modeling/topics')
    if not os.path.exists('./topic_modeling/documents'):
        os.mkdir('./topic_modeling/documents')
    if not os.path.exists('./clustering'):
        os.mkdir('./clustering')
    if not os.path.exists('./ngrams'):
        os.mkdir('./ngrams')
except Exception as e:
    print(f"Error creating directories: {e}")

Error creating directories: [Errno 17] File exists: './wordclouds'


### Worldclouds

In [203]:
wordcloud = WordCloud(
    background_color ='white',
    max_words = 5000,
    contour_width =3,
    contour_color ='steelblue'
)

In [204]:
for index, row in docs.iterrows():
    wordcloud.generate(row['content'])
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(index.replace(".txt",""))
    plt.savefig('./wordclouds/{}'.format(index.replace("txt","png")))
    plt.close()

### Topic Modeling

In [205]:
def plot_top_words(model, feature_names, n_top_words, title, size):
    colors = ["red", "blue", "green", "orange", "purple", "brown", "pink", "gray", "olive", "cyan"]
    fig, axes = plt.subplots(*size, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7, color=colors[topic_idx])
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.savefig(f'./topic_modeling/topics/{title}.png')
    plt.close()

In [206]:
def plot_documents(model, matrix, n_topics, title):
    colors = ["red", "blue", "green", "orange", "purple", "brown", "pink", "gray", "olive", "cyan"]
    docs_topics = pd.DataFrame(model.transform(matrix), columns=[f'Topic {x}' for x in range(n_topics)])
    docs_topics.index = [file_name.replace(".txt","") for file_name in files_names]
    plt.figure(figsize=(7, 4))
    left = [0] * len(docs_topics)
    for i, col in enumerate(docs_topics.columns):
        plt.barh(docs_topics.index, docs_topics[col], left=left, label=col, color=colors[i])
        left = [left[j] + docs_topics[col].iloc[j] for j in range(len(docs_topics))]
    plt.savefig(f'./topic_modeling/documents/{title}.png')
    plt.close()

In [207]:
n_topics = 10
features_names = cv.get_feature_names_out()
n_top_words = 20
size = (2, 5)

In [208]:
lda = LatentDirichletAllocation(n_components=n_topics, max_iter = 5, learning_method='online', learning_offset=50, random_state=0)
lda.fit(matrix_tf)
plot_top_words(lda, features_names, n_top_words, 'LDA Topics', size)
plot_documents(lda, matrix_tf, n_topics, 'LDA Document Topics')

### Clustering

In [209]:
nmf_fn = NMF(
    n_components=n_topics, 
    random_state=1, 
    alpha_H=0.00005, 
    alpha_W=0.00005, 
    l1_ratio=0.5
    )
nmf_fn.fit(matrix_tfidf)
plot_top_words(nmf_fn, features_names, n_top_words, 'NMF Topics (FN)', size)
plot_documents(nmf_fn, matrix_tfidf, n_topics, 'NMF Document Topics (FN)')

In [210]:
nmf_kl = NMF(
    n_components=n_topics, 
    random_state=1, 
    alpha_H=0.00005, 
    alpha_W=0.00005, 
    l1_ratio=0.5,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000
)
nmf_kl.fit(matrix_tfidf)
plot_top_words(nmf_kl, features_names, n_top_words, 'NMF Topics (KL)', size)
plot_documents(nmf_kl, matrix_tfidf, n_topics, 'NMF Document Topics (KL)')

In [211]:
def plot_dendrogram(model, **kwargs):
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [212]:
cs = cosine_similarity(matrix_tfidf, matrix_tfidf).flatten().reshape(matrix_tfidf.shape[0], matrix_tfidf.shape[0])

In [213]:
clustering_cs_ward = AgglomerativeClustering(
    n_clusters=6,
    metric='euclidean',
    linkage='ward',
    compute_distances=True
)

clustering_cs_ward.fit(cs)
print(clustering_cs_ward.labels_)
plot_dendrogram(clustering_cs_ward, labels=files_names, truncate_mode='level', orientation='right')
plt.savefig('./clustering/cs_ward.png')
plt.close()

[0 5 0 0 4 0 0 3 3 3 1 1 1 1 1 1 1 1 2 2 2 2 2]


In [214]:
ed = euclidean_distances(matrix_tf, matrix_tf).flatten().reshape(matrix_tf.shape[0], matrix_tf.shape[0])

In [215]:
clustering_ed_complete = AgglomerativeClustering(
    n_clusters=6,
    metric='precomputed',
    linkage='complete',
    compute_distances=True
)   
clustering_ed_complete.fit(ed)
print(clustering_ed_complete.labels_)
plot_dendrogram(clustering_ed_complete, labels=files_names, truncate_mode='level', orientation='right')
plt.savefig('./clustering/ed_complete.png')
plt.close()

[0 0 0 0 3 4 0 1 1 1 2 2 2 2 2 2 2 2 5 5 5 5 5]


### N-grams

In [216]:
documents_tokenized = {}
for key in documents:
    documents_tokenized[key] = word_tokenize(documents[key], language='english')
print(json.dumps(documents_tokenized, indent=4, ensure_ascii=False))

{
    "Chronicles of Narnia. Prince Caspian.txt": [
        "peter",
        "susan",
        "edmund",
        "luci",
        "pevensi",
        "magic",
        "whisk",
        "away",
        "british",
        "railway",
        "station",
        "beach",
        "near",
        "old",
        "ruin",
        "castl",
        "determin",
        "ruin",
        "cair",
        "paravel",
        "onc",
        "rule",
        "king",
        "queen",
        "narnia",
        "discov",
        "treasur",
        "vault",
        "peter",
        "sword",
        "shield",
        "susan",
        "bow",
        "arrow",
        "luci",
        "dagger",
        "bottl",
        "magic",
        "cordial",
        "store",
        "susan",
        "horn",
        "summon",
        "help",
        "miss",
        "left",
        "wood",
        "day",
        "return",
        "england",
        "prior",
        "visit",
        "narnia",
        "although",
        "onli",
      

In [217]:
n = 3
tokens = 5
for title in documents_tokenized:
    n_gram = pd.Series(ngrams(documents_tokenized[title], n)).value_counts()
    n_gram[:tokens].plot.barh()
plt.savefig(f'./ngrams/{n}_{title}.png')
plt.close()