In [27]:
# for organizing/storing data
import pandas as pd
# for language analysis magic
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer


In [28]:
# Load the article table from the CSV file under ./data.
# The file is comma-separated and the column names are inferred from it.
article_contents = pd.read_csv(
    "./data/article.csv",
    header='infer',
    sep=',',
)

In [32]:
# useful functions from:
# https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in ``words``.

    Each token is NFKD-normalized, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``. The result is a new
    list with one entry per input token; a token made up solely of
    non-ASCII characters becomes an empty string rather than being dropped.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lower-cased form of each token in ``words``."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    Every character that is neither a word character nor whitespace is
    removed from each token. Tokens that end up as the empty string are
    omitted from the returned list, so the result may be shorter than the
    input.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Replace all-digit tokens in ``words`` with their textual representation.

    A token counts as a number when ``str.isdigit()`` is true for it; such
    tokens are passed to the inflect engine's ``number_to_words``. All other
    tokens are copied through unchanged. Returns a new list of the same
    length and order as the input.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: iterable of string tokens. Matching is exact and
            case-sensitive, so tokens are compared as given.

    Returns:
        A new list containing, in their original order, the tokens that
        are not in the English stop-word list.
    """
    # Fetch the stop-word list once and keep it in a set: the previous
    # version called stopwords.words('english') for every single token and
    # then scanned the returned list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of each token in ``words``, in input order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize each token in ``words`` as a verb.

    Every token is passed to a WordNet lemmatizer with ``pos='v'``.
    Returns a new list with one lemma per input token, in input order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the token-cleaning pipeline over ``words`` and return the result.

    The steps are applied in this order, each receiving the previous step's
    output: strip non-ASCII characters, lower-case, remove punctuation,
    spell out numbers, remove stop words.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [40]:
# preprocessing data
# nltk.pos_tag needs the perceptron tagger resource. Fetch it here when it is
# missing so this cell also works on a fresh kernel, where the download cell
# further down the notebook has not been run yet.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

# Skip rows without content: pandas reads empty CSV fields as NaN (a float),
# which word_tokenize cannot process.
for article_content in article_contents["content"].dropna():
    # tokenize -> clean tokens -> part-of-speech tag the cleaned tokens
    article_words = nltk.word_tokenize(article_content)
    normalized_article_words = normalize(article_words)
    normalized_article_words_postags = nltk.pos_tag(normalized_article_words)
    print(normalized_article_words_postags)

[('president', 'NN'), ('would', 'MD'), ('nt', 'VB'), ('say', 'VB'), ('would', 'MD'), ('definitely', 'RB'), ('declare', 'VB'), ('emergency', 'NN'), ('told', 'VBD'), ('reporters', 'NNS'), ('declaration', 'NN'), ('help', 'NN'), ('process', 'NN')]


In [38]:
# Download the 'averaged_perceptron_tagger' package into the local nltk_data directory.
# NOTE(review): nltk.pos_tag in the preprocessing cell presumably relies on this
# resource, yet this cell sits below it -- run this cell first on a fresh kernel.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mavram/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True