In [3]:
import pandas as pd

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [4]:
print(len(documents))
print(documents[:5])

1103663
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


We have to perform the following steps:
1. Tokenization- Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
2. Words that have fewer than 3 characters are removed
3. All stopwords are removed ("a", "the", "an")
4. Words are lemmatized- words in third person are changed to first person and verbs in past and future tenses are changed into present tense
5. Words are stemmed - words are reduced to their root form

In [7]:
# Import libraries
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)


In [8]:
import nltk

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/nirmalb/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [10]:
# Function to lemmatize and stem preprocess steps on the dataset. 
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [18]:
# Example of preprocessing

doc_sample = documents[documents['index'] == 4310].values[0][0]
print('documents example', documents)
print('documents parsed to the index',documents[documents['index'] == 4310])
print('documents parsed to values', documents[documents['index'] == 4310].values)
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

documents example                                              headline_text    index
0        aba decides against community broadcasting lic...        0
1           act fire witnesses must be aware of defamation        1
2           a g calls for infrastructure protection summit        2
3                 air nz staff in aust strike for pay rise        3
4            air nz strike to affect australian travellers        4
...                                                    ...      ...
1103658  the ashes smiths warners near miss liven up bo...  1103658
1103659            timelapse: brisbanes new year fireworks  1103659
1103660           what 2017 meant to the kids of australia  1103660
1103661   what the papodopoulos meeting may mean for ausus  1103661
1103662  who is george papadopoulos the former trump ca...  1103662

[1103663 rows x 2 columns]
documents parsed to the index                     headline_text  index
4310  rain helps dampen bushfires   4310
original document: 
['rain

NameError: name 'stemmer' is not defined