In [26]:
import os
import pandas as pd

#Gensim and nltk libraries
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from pprint import pprint
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/omostrander/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/omostrander/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
#List every file found under ./data (sub-directories included)
data_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('./data')
    for name in names
]
for path in data_file_paths:
    print(path)

./data/.DS_Store
./data/abcnews-date-text.csv


In [12]:
#Load the dataset from the csv; 'data' keeps the full table
data = pd.read_csv('./data/abcnews-date-text.csv')

# We only need the headlines from the data, limited to the first 300,000 rows.
# .copy() gives data_text its own storage, so adding the 'index' column below
# cannot raise pandas' SettingWithCopyWarning or write back into 'data'.
data_text = data[:300000][['headline_text']].copy()
# Keep each row's position as an explicit column so a document can be looked up by number
data_text['index'] = data_text.index
# 'documents' is the name the following cells use (an alias of data_text, not a copy)
documents = data_text

In [13]:
#Report how many documents were loaded, then display the first rows
n_documents = len(documents)
print(n_documents, "\n")
documents.head()

300000 



Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


### Data Formatting
- Tokenization
  - Sentences -> words
  - Lowercase all words
  - Remove punctuation 
- Remove stopwords
- Lemmatization
  - Words in 3rd person -> 1st person
  - Verbs: past and future tenses converted to present tense
- Stem words - reducing them to their root forms
    

In [14]:
#Functions to perform the pre-processing steps on the entire dataset

# Both helpers are created once and shared by every call; building a new
# WordNetLemmatizer for each token is repeated work with no benefit.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    text : str
        One token (word) to normalize.

    Returns
    -------
    str
        The output of the English Snowball stemmer applied to the
        WordNet verb lemma (pos='v') of `text`.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

# Tokenize and Lemmatize
def preprocess(text):
    """Turn one raw text into a list of normalized tokens.

    The text is tokenized with gensim's simple_preprocess. Tokens that are
    gensim stopwords, or that have three characters or fewer, are dropped;
    every remaining token is passed through lemmatize_stemming.

    Parameters
    ----------
    text : str
        The raw document (here: one headline).

    Returns
    -------
    list
        The normalized tokens, in their original order.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]

In [17]:
#Preview one document before and after pre-processing

document_num = 4310
# Select the headline whose 'index' column equals document_num
is_sample = documents['index'] == document_num
doc_sample = documents.loc[is_sample, 'headline_text'].iloc[0]

print("Original document: ")
# Splitting on single spaces already yields the word list
words = doc_sample.split(' ')
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))
print("\n", documents.head())

Original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


Tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

                                        headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


Let's now preprocess all the news headlines we have. To do that, let's use the map function from pandas to apply preprocess() to the headline_text column.

In [18]:
#Run preprocess over every headline with map; 'processed_docs' holds one token list per headline
headlines = documents['headline_text']
processed_docs = headlines.map(preprocess)

#Show the first few processed headlines
processed_docs.head()

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

### Bag of words on the dataset
Now let's create a dictionary from 'processed_docs' containing the number of times a word appears in the training set. To do that, let's pass processed_docs to gensim.corpora.Dictionary() and call it 'dictionary'.

In [19]:
# Create a dictionary from 'processed_docs' containing the number of times a word
# appears in the training set, using gensim's corpora.Dictionary, and call it 'dictionary'.
dictionary = corpora.Dictionary(processed_docs)

In [20]:
#Verify the dictionary creation by printing its first 11 (id, token) pairs
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    # position is zero-based, so 10 marks the eleventh printed entry
    if position >= 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


### Gensim filter_extremes 
- filter_extremes(no_below=5, no_above=0.5, keep_n=100000) 
- Filter out tokens that appear in:
  1. fewer than no_below documents (absolute number), or
  2. more than no_above documents (fraction of total corpus size, not absolute number).
- After (1) and (2), keep only the first keep_n most frequent tokens (or keep all if None).

In the example below, we will remove very rare and very common words:

- words appearing in fewer than 15 documents
- words appearing in more than 10% of all documents


In [21]:
#Remove very rare and very common words:
#  no_below=15  -> drop tokens found in fewer than 15 documents
#  no_above=0.1 -> drop tokens found in more than 10% of all documents
#  keep_n       -> afterwards keep at most the 100000 most frequent tokens
dictionary.filter_extremes(
    no_below=15,
    no_above=0.1,
    keep_n=100000,
)

#### Gensim doc2bow

doc2bow(document)

- Convert document (a list of words) into the bag-of-words format: a list of (token_id, token_count) 2-tuples. Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). No further preprocessing is done on the words in document; apply tokenization, stemming etc. before calling this method.

In [22]:
#Convert every processed document into the bag-of-words format, one entry per document
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [23]:
#Show the bag-of-words entry of our sample document (document_num): a list of (token_id, token_count) pairs
bow_corpus[document_num]

[(154, 1), (228, 1), (276, 1), (563, 1), (806, 1), (3175, 1), (3176, 1)]

### Preview BOW for our sample preprocessed document

Here document_num is document number 4310 which we have checked in Step 2

In [24]:
#Preview BOW for our sample pre-processed document: one line per token id with its word and count
bow_doc_4310 = bow_corpus[document_num]

for token_id, token_count in bow_doc_4310:
    # dictionary[token_id] maps the id back to its word
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], token_count))

Word 154 ("govt") appears 1 time.
Word 228 ("group") appears 1 time.
Word 276 ("vote") appears 1 time.
Word 563 ("local") appears 1 time.
Word 806 ("want") appears 1 time.
Word 3175 ("compulsori") appears 1 time.
Word 3176 ("ratepay") appears 1 time.


#### TF-IDF on our document set
While performing TF-IDF on the corpus is not necessary for an LDA implementation using the gensim model, it is recommended. TF-IDF expects a bag-of-words (integer values) training corpus during initialization. During transformation, it will take a vector and return another vector of the same dimensionality.

- TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)
- IDF(w) = log_e(Total number of documents / Number of documents with term w in it).

In [28]:
#Fit a tf-idf model on 'bow_corpus' with models.TfidfModel and keep it as 'tfidf'
tfidf = models.TfidfModel(bow_corpus)

#Apply the transformation to the entire corpus; the result is 'corpus_tfidf'
corpus_tfidf = tfidf[bow_corpus]

#Preview TF-IDF scores for our first document --> (token_id, tfidf score)
first_tfidf_doc = next(iter(corpus_tfidf), None)
# Nothing is printed when the corpus has no documents
if first_tfidf_doc is not None:
    pprint(first_tfidf_doc)

[(0, 0.5959919082495837),
 (1, 0.3920069955308767),
 (2, 0.48532280284497653),
 (3, 0.5055550788930631)]
