In [1]:
# Imports
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary
import os
import itertools
from collections import defaultdict
from gensim.models.tfidfmodel import TfidfModel


#########################  


# Loading data

# article: raw text of the single "debugging" article.
# An explicit encoding keeps the read independent of the platform default.
with open("assets/Wikipedia articles/wiki_text_debugging.txt", encoding="utf-8") as file:
    article = file.read()

# english_stops: list of stop words, one per line of the file
with open("assets/english_stopwords.txt", encoding="utf-8") as file:
    english_stops = file.read().split('\n')

# articles_str: raw text of every file in the articles folder.
# os.listdir returns names in arbitrary order, so the names are sorted: later
# cells pick a document by position (corpus[4]) and must get the same one on
# every run and every machine.
articles_str = []
for a in sorted(os.listdir("assets/Wikipedia articles")):
    article_path = f"assets/Wikipedia articles/{a}"
    # Skip sub-directories; open() would fail on them.
    if not os.path.isfile(article_path):
        continue
    with open(article_path, encoding="utf-8") as file:
        articles_str.append(file.read())

# Membership tests against a set are O(1); english_stops itself stays a list.
stop_word_set = set(english_stops)

# articles: one token list per document -- lowercased, tokenized, then
# restricted to purely alphabetic tokens that are not stop words.
articles = [
    [
        token
        for token in word_tokenize(doc.lower())
        if token.isalpha() and token not in stop_word_set
    ]
    for doc in articles_str
]

In [2]:
# Split the raw article text into tokens: tokens
tokens = word_tokenize(article)

# Lowercase every token so counts are case-insensitive: lower_tokens
lower_tokens = list(map(str.lower, tokens))

# Tally how often each lowercase token occurs: bow_simple
bow_simple = Counter()
bow_simple.update(lower_tokens)

# Show the ten most frequent tokens with their counts
print(bow_simple.most_common(10))

[(',', 151), ('the', 150), ('.', 89), ('of', 81), ("''", 66), ('to', 63), ('a', 60), ('``', 47), ('in', 44), ('and', 41)]


In [3]:
# Retain alphabetic words: alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]

# Remove all stop words: no_stops
# english_stops is a list, so `in` would scan it once per token; a set makes
# each lookup constant-time without changing which tokens are kept.
stop_word_set = set(english_stops)
no_stops = [t for t in alpha_only if t not in stop_word_set]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# Create the bag-of-words (lemma -> occurrence count): bow
bow = Counter(lemmatized)

# Print the 10 most common tokens
print(bow.most_common(10))

[('debugging', 40), ('system', 25), ('bug', 17), ('software', 16), ('problem', 15), ('tool', 15), ('computer', 14), ('process', 13), ('term', 13), ('debugger', 13)]


In [4]:
# Build the token <-> integer id mapping over all tokenized articles: dictionary
dictionary = Dictionary(articles)

# Look up the integer id assigned to the token "computer": computer_id
token_to_id = dictionary.token2id
computer_id = token_to_id.get("computer")

# Map the id back through the dictionary to confirm it yields the same word
print(dictionary.get(computer_id))

# Convert every article into (token id, count) pairs: corpus
# (this is a plain Python list of bag-of-words documents, held in memory)
corpus = list(map(dictionary.doc2bow, articles))

# Print the first 10 (id, count) pairs of the fifth document
print(corpus[4][:10])

computer
[(9, 2), (17, 1), (18, 4), (19, 1), (20, 1), (26, 5), (31, 1), (34, 5), (35, 4), (39, 1)]


In [5]:
# Keep the fifth document's bag-of-words for this and later cells: doc
doc = corpus[4]

# Order its (id, count) pairs from most to least frequent: bow_doc
bow_doc = list(doc)
bow_doc.sort(key=lambda pair: pair[1], reverse=True)

# Show the five most frequent words of that document with their counts
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)

# Accumulate each word id's count over every document: total_word_count
total_word_count = defaultdict(int)
for doc_bow in corpus:
    for word_id, word_count in doc_bow:
        total_word_count[word_id] += word_count

# Order the (id, total) pairs from most to least frequent: sorted_word_count
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda pair: pair[1], reverse=True)

# Show the five most frequent words across the whole corpus with their counts
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)

computer 77
ref 40
programming 39
program 35
software 33
computer 597
software 450
cite 322
ref 259
code 235


In [6]:
# Fit a TF-IDF model on the full bag-of-words corpus: tfidf
tfidf = TfidfModel(corpus)

# Apply the model to the previously selected document: tfidf_weights
tfidf_weights = tfidf[doc]

# Show the first five (term id, weight) pairs
print(tfidf_weights[:5])

# Order the pairs by weight, largest first: sorted_tfidf_weights
sorted_tfidf_weights = list(tfidf_weights)
sorted_tfidf_weights.sort(key=lambda pair: pair[1], reverse=True)

# Show the five highest-weighted words with their weights
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

[(9, 0.01698915216676487), (17, 0.01698915216676487), (18, 0.02642178695988933), (19, 0.008494576083382435), (20, 0.0022343657729024537)]
edition 0.24234452193031106
abraham 0.1827164203106929
silberschatz 0.1827164203106929
last 0.1453198282793913
year 0.1453198282793913
