In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [94]:
from glob import glob
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from collections import Counter, defaultdict
import itertools

In [28]:
# datasets used
with open('./wiki_articles/wiki_text_debugging.txt', 'r') as f:
    article = f.read()
    article_title = word_tokenize(article)[2]
with open('./english_stopwords.txt') as f:
    english_stops = f.read()
    
article_files = glob('./wiki_articles/*.txt')

# Word counts with bag-of-words

## Building a counter with bag-of-words

In [14]:
# tokenize article
tokens = word_tokenize(article)

# convert tokens into lower case
lower_tokens = [t.lower() for t in tokens]

# create a counter with the lowercase tokens
bow_simple = Counter(lower_tokens)

print(f'10 most common tokens: {bow_simple.most_common()[:10]}')

10 most common tokens: [(',', 151), ('the', 150), ('.', 89), ('of', 81), ("''", 66), ('to', 63), ('a', 60), ('``', 47), ('in', 44), ('and', 41)]


# Simple text preprocessing

## Text preprocessing practice

In [18]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\loujo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [22]:
# retain alphabetic words from lower_tokens
alpha_only = [t for t in lower_tokens if t.isalpha()]

# remove all stop words
no_stops = [t for t in alpha_only if t not in english_stops]

# instantiate WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# lemmatize all tokens into a new list
lemmatized = [lemmatizer.lemmatize(t) for t in no_stops]

# create bag-of-words
bow = Counter(lemmatized)
print(f'10 most common tokens: {bow.most_common(10)}')

10 most common tokens: [('debugging', 40), ('system', 25), ('bug', 17), ('software', 16), ('problem', 15), ('tool', 15), ('computer', 14), ('process', 13), ('term', 13), ('debugger', 13)]


# Introduction to `gensim`

## Creating and querying a corpus with `gensim`

In [101]:
# let's do this preprocessing to a couple more messy articles from wikipedia
articles = []

for a in article_files:
    #load file
    with open(a, 'r', encoding='utf-8') as file:
        article = file.read()
    # tokenize words
    tokens = word_tokenize(article)
    # convert all to lower case
    lower_tokens = [t.lower() for t in tokens]
    # take away numeric characters
    alpha_only = [t for t in lower_tokens if t.isalpha()]
    # and stop words
    no_stops = [t for t in alpha_only if t not in english_stops]
    
    articles.append(no_stops)

In [102]:
# create gensim Dictionary from articles
dictionary = Dictionary(articles)

# select the id for "computer"
computer_id = dictionary.token2id.get('computer')

# use computer_id with the dictionary to print the word
print(f'ID {computer_id} is {dictionary.get(computer_id)}\n')

# create an MmCorpus
corpus = [dictionary.doc2bow(article) for article in articles]

# first 10 word ids with their freq counts from the fifth document
print(f'first 10 word ids from article 5: {corpus[4][:10]}')

ID 242 is computer

first 10 word ids from article 5: [(1, 1), (13, 1), (15, 1), (18, 1), (26, 1), (29, 1), (37, 1), (38, 4), (47, 2), (48, 7)]


## `gensim` bag-of-words

In [131]:
# save the fifth article
doc = corpus[4]

# sort the doc for frequency
bow_doc = sorted(doc, 
                 key=lambda w: w[1], 
                 reverse=True)

# print the top 5 words of the doc
print('top 5 words of article 5:')
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)

top 5 words of article 5:
debugging 40
system 19
software 16
tools 14
computer 12


In [132]:
# create a defaultdict
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

# create a sorted list
sorted_word_count = sorted(total_word_count.items(),
                           key=lambda w: w[1],
                           reverse=True)

# top 5 words
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)

computer 598
software 450
cite 322
ref 259
code 235


# Tf-idf with `gensim`

## Tf-idf with Wikipedia

In [133]:
# create a new TfidfModel using gensim's
tfidf = TfidfModel(corpus)

# calculate the tfidf weights of doc
tfidf_weights = tfidf[doc]

# first five weights
print(f'first 5 weights:\n{tfidf_weights[:5]}\n')

# sort
sorted_weights = sorted(tfidf_weights,
                        key=lambda w: w[1],
                        reverse=True)

print('top 5 weights:')
for term_id, weight in sorted_weights[:5]:
    print(dictionary.get(term_id), weight)

first 5 weights:
[(1, 0.012414154511302825), (13, 0.015679504267112274), (15, 0.019675969378573348), (18, 0.012414154511302825), (26, 0.019675969378573348)]

top 5 weights:
wolf 0.222521392005895
debugging 0.20609358576129203
fence 0.178017113604716
debugger 0.13655569962433106
squeeze 0.13351283520353702
