In [148]:
import re
from collections import Counter
from functools import reduce
from math import log10
from operator import or_
from typing import List, Dict, Iterable

import nltk
import pandas as pd
from IPython.display import display
from nltk.tokenize import word_tokenize

Download additional nltk components.

In [149]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/max-
[nltk_data]     omelchenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/max-
[nltk_data]     omelchenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Construct stopwords list.

In [150]:
language = 'english'
stopwords = nltk.corpus.stopwords.words(language)
stopwords.append('the')
stopwords = set(stopwords)

Preprocess articles.

In [151]:
file_paths = (
    'data/articles/20th.txt',        # Ken kesey one flew over the cuckoo's nest
    'data/articles/19th.txt',        # Oliver Twist Charles Dickens
    'data/articles/article1.txt',    # Types of Ecology    #article
    'data/articles/scientific.txt',  # Euler characteristic
    'data/articles/article2.txt',    # A Comprehensive Guide To Jenkins Pipeline  #otherArticle
)

articles = []
word = re.compile(r'[^A-Za-z]')

for file_path in file_paths:
    with open(file_path, 'r') as file:
        article = file.read()

        # Left only words in lower case.
        article = word.sub(' ', article).lower()
        articles.append(article)

Split articles texts into tokens (words).

In [152]:
tokenized_articles = list(map(lambda text: word_tokenize(text, language=language, preserve_line=True), articles))

Remove stopwords from tokenized texts.

In [153]:
def remove_stop_words(_tokenized_text: List[str]) -> List[str]:
    _text = _tokenized_text.copy()
    for token in _tokenized_text:
        if token in stopwords:
            _text.remove(token)

    return _text

In [154]:
tokenized_articles = list(map(remove_stop_words, tokenized_articles))

Create common word set of all words in all articles.

In [155]:
words = reduce(or_, map(set, tokenized_articles))

Create a mapping *word* : *how many times is used* for all articles.

In [156]:
words_count_zero = dict.fromkeys(words, 0)
words_count_articles = [words_count_zero.copy() for i in range(len(tokenized_articles))]
for _words_count_article, _tokenized_article in zip(words_count_articles, tokenized_articles):
    _words_count_article.update(Counter(_tokenized_article))

In [157]:
pd.DataFrame(words_count_articles)

Unnamed: 0,anybody,clean,behind,stuck,hours,depend,anxious,cedar,imprinted,cork,...,many,pausing,broom,attending,e,took,said,involved,yet,waiting
0,1,2,5,0,1,0,0,3,0,0,...,0,0,4,0,0,1,0,0,0,1
1,0,0,0,0,0,0,1,0,1,1,...,1,1,0,1,0,0,5,0,1,0
2,0,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,4,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Calculate term frequency (TF).

In [158]:
def compute_tf(_words_count_article: Dict[str, int], _bow: List[str]) -> Dict[str, int]:
    _tf = {}
    bow_len = len(_bow)
    for _word, _count in _words_count_article.items():
        _tf[_word] = _count / bow_len
    return _tf

In [159]:
# for more precise results use TF-IDF Model

tf_bows = list(map(lambda zipped: compute_tf(*zipped), zip(words_count_articles, tokenized_articles)))


Calculate an Inverse Document Frequency (IDF).

IDF is a measure of how important a term is. We need the IDF value because computing just the TF alone is not sufficient to understand the importance of words.

In [160]:
def compute_idf(_all_words: Iterable[str], _words_counts: List[Dict[str, int]]) -> Dict[str, int]:
    _n = len(_words_counts)
    
    _idfs = dict.fromkeys(_all_words, 0)
    for _words_count in _words_counts:
        for _word, _count in _words_count.items():
            if _count > 0:
                _idfs[_word] += 1
    
    for _word, _val in _idfs.items():
        if _val != 0:
            _idfs[_word] = log10(_n / _val)
        
    return _idfs

In [161]:
idfs = compute_idf(words, words_count_articles)

Calculate TF-IDF score for each word.

Words with a higher score are more important, and those with a lower score are less important.

In [162]:
def compute_tfidf(_tf_bow: Dict[str, int], _idfs: Dict[str, int]) -> Dict[str, int]:
    _tf_idf = {}
    for _word, _val in _tf_bow.items():
        _tf_idf[_word] = _val * _idfs[_word]
    return _tf_idf

In [163]:
tf_idf_bows = list(map(lambda _tf_bow: compute_tfidf(_tf_bow, idfs), tf_bows))

Display the most valuable words for every article.

In [164]:
article_names = ('19th', '20th', 'article1', 'scientific', 'article2')
df = pd.DataFrame(tf_idf_bows, index=article_names)

for article_name in article_names:
    article_df = df.sort_values(by=article_name, axis=1, ascending=False)
    display(article_name, article_df)

'19th'

Unnamed: 0,like,black,got,around,boys,mop,hear,hold,room,still,...,main,held,purchasing,areas,diversity,drudge,solids,show,polyhedron,contents
19th,0.009446,0.007556,0.005667,0.005667,0.005038,0.005038,0.005038,0.004408,0.004408,0.004408,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.001311,0.0,0.0,0.0,0.001311
article1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002796,0.002796,0.002796,0.002796,0.002796,0.0,0.0,0.0,0.0,0.0
scientific,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003738,0.003738,0.003738,0.0
article2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


'20th'

Unnamed: 0,oliver,surgeon,child,dear,said,workhouse,young,parish,twist,name,...,monday,equipment,connected,carried,finally,blend,furious,mouth,ground,waiting
19th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001889,0.001259,0.0,0.0,0.00063,0.00063,0.001259,0.00063,0.00063,0.00063
20th,0.011802,0.00918,0.00918,0.007868,0.006557,0.006557,0.005246,0.005246,0.003934,0.003934,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
article1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
scientific,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.003738,0.0,0.0,0.0,0.0,0.0,0.0,0.0
article2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.002262,0.0,0.0,0.0,0.0,0.0,0.0


'article1'

Unnamed: 0,wildlife,animals,wild,conservation,everyone,trash,ways,organization,zoos,trafficking,...,orange,popularity,figures,hand,survive,real,noise,extant,standing,waiting
19th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00063,0.0,0.0,0.000717,0.0,0.00063,0.00063,0.0,0.001259,0.00063
20th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001493,0.001311,0.0,0.0,0.001311,0.0,0.0
article1,0.027959,0.016775,0.011184,0.011184,0.008388,0.008388,0.008388,0.008388,0.008388,0.008388,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
scientific,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011213,0.0,0.0,0.0,0.0,0.0,0.0,0.0
article2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.004524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


'scientific'

Unnamed: 0,euler,characteristic,figure,e,figures,holes,number,f,also,removing,...,responsibly,white,bays,rush,concepts,etc,risk,suppose,polished,waiting
19th,0.0,0.0,0.000359,0.0,0.0,0.0,0.000359,0.0,0.0,0.0,...,0.0,0.002151,0.00063,0.00063,0.0,0.0,0.0,0.0,0.001259,0.00063
20th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
article1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
scientific,0.044854,0.041116,0.019152,0.014951,0.011213,0.011213,0.01064,0.007476,0.007476,0.007476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
article2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002262,0.004524,0.002262,0.002262,0.0,0.0


'article2'

Unnamed: 0,jenkins,pipeline,continuous,delivery,software,jobs,build,workflow,understand,feature,...,extant,standing,native,sound,dear,doll,cleaning,passed,fingernails,waiting
19th,0.0,0.0,0.0,0.0,0.0,0.0,0.000359,0.0,0.0,0.0,...,0.0,0.001259,0.0,0.002519,0.0,0.00063,0.0,0.0,0.00063,0.00063
20th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001311,0.0,0.0,0.0,0.007868,0.0,0.0,0.001311,0.0,0.0
article1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002796,0.0,0.0,0.0,0.002796,0.0,0.0,0.0
scientific,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
article2,0.056551,0.052027,0.031669,0.024882,0.02262,0.015834,0.010303,0.009048,0.006786,0.006786,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
