In [34]:
# for organizing/storing data
import pandas as pd
# for language analysis magic
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [3]:
# Load the article table: comma-separated file, header row inferred by pandas
article_contents = pd.read_csv(
    "./data/article.csv",
    sep=',',
    header='infer',
)

In [45]:
# useful functions from:
# https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html

def remove_non_ascii(words):
    """Strip every non-ASCII character from each token.

    Each token is NFKD-normalised first, then encoded to ASCII with
    unencodable characters ignored, so whatever cannot be expressed in
    ASCII after decomposition is dropped.

    Returns a new list with exactly one entry per input token (an entry may
    be the empty string if nothing of the token survives).
    """
    return [
        unicodedata.normalize('NFKD', token)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Return a new list holding the lowercase form of every token."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token while keeping the list length.

    Every character that is neither a word character nor whitespace is
    removed. A token left empty by that is replaced with the placeholder
    string "No normalized version" instead of being dropped, so positions
    in the result line up with positions in the input.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [
        stripped if stripped != '' else "No normalized version"
        for stripped in stripped_tokens
    ]

def replace_numbers(words):
    """Replace all integer occurrences in a list of tokens with their textual representation.

    A token is converted only when it consists entirely of digits; every
    other token, including the placeholder "No normalized version", is
    passed through unchanged. The result has one entry per input token.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token)
        if token != "No normalized version" and token.isdigit()
        else token
        for token in words
    ]

def remove_stopwords(words):
    """Mask English stop words in a list of tokens.

    A stop word is not dropped: it is replaced with the placeholder string
    "No normalized version", so the result has the same length as ``words``
    and positions still line up with the input.

    Args:
        words: iterable of string tokens. Tokens are compared against the
            NLTK English stop-word list exactly as given; no case folding
            happens here.

    Returns:
        A new list with one entry per input token.
    """
    # Build the lookup once per call instead of once per token; a set also
    # makes each membership test constant-time rather than a list scan.
    english_stopwords = set(stopwords.words('english'))
    return [
        "No normalized version" if token in english_stopwords else token
        for token in words
    ]

def stem_words(words):
    """Return the Lancaster stem of every token, in input order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Return the WordNet lemma of every token, lemmatizing each one as a verb."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run a token list through the normalization steps, in order.

    Steps applied: non-ASCII removal, lowercasing, punctuation removal,
    number replacement and stop-word handling. Stemming and lemmatization
    are not part of this pipeline.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [50]:
# Preparing data structure to hold all sentence/word features/classifications.
# `all_sent_feats` keeps one feature dict per article, in input order.
# `sent_feats` always refers to the most recently processed article (it stays
# an empty dict when there are no articles).
all_sent_feats = []
sent_feats = {}

# preprocessing data
for article_content in article_contents["content"]:
    # A fresh dict per article: reusing a single dict would leave only the
    # last article's features once the loop finishes.
    sent_feats = {
        "canonical_sentence": article_content,
        "word_level_features": {},
    }
    all_sent_feats.append(sent_feats)
    word_level_feats = sent_feats["word_level_features"]

    # tokenization
    article_words = nltk.word_tokenize(article_content)
    for word_ix, word in enumerate(article_words):
        word_level_feats[word_ix] = {"canonical": word}

    # normalization: normalize() produces exactly one output per input token
    # (removed tokens become a placeholder string), so word_ix stays aligned
    # with the canonical tokens. No stemming is applied by normalize().
    normalized_article_words = normalize(article_words)
    for word_ix, word in enumerate(normalized_article_words):
        word_level_feats[word_ix]["normalized"] = word

    # part of speech tagging -- note: despite the variable name, the tags are
    # computed from the raw (un-normalized) tokens
    normalized_article_words_postags = nltk.pos_tag(article_words)
    for word_ix, (_token, pos_tag) in enumerate(normalized_article_words_postags):
        word_level_feats[word_ix]["pos_tag"] = pos_tag

In [53]:
# Sentiment Analysis Using Vader: https://github.com/cjhutto/vaderSentiment
sentiment_analyzer = SentimentIntensityAnalyzer()

# Sentence-level sentiment. `sent_feats` is a single dict, so each pass
# overwrites the previous score and the last article's score is the one kept.
for article_content in article_contents["content"]:
    sentence_level_sentiment = \
        sentiment_analyzer.polarity_scores(article_content)
    sent_feats["sentence_sentiment"] = sentence_level_sentiment

# Word-level sentiment. The tokens stored in `sent_feats` do not depend on
# which article the loop above is visiting, so they are scored once here
# instead of being re-scored for every article. `.get` keeps this a no-op
# when `sent_feats` is empty (no articles were processed).
for word_entry in sent_feats.get("word_level_features", {}).values():
    word_entry["sentiment"] = \
        sentiment_analyzer.polarity_scores(word_entry["canonical"])
print(sent_feats)

{'word_level_features': {0: {'canonical': 'The', 'normalized': 'No normalized version', 'pos_tag': 'DT', 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 1: {'canonical': 'president', 'normalized': 'president', 'pos_tag': 'NN', 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 2: {'canonical': 'would', 'normalized': 'would', 'pos_tag': 'MD', 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 3: {'canonical': "n't", 'normalized': 'nt', 'pos_tag': 'RB', 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 4: {'canonical': 'say', 'normalized': 'say', 'pos_tag': 'VB', 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 5: {'canonical': 'he', 'normalized': 'No normalized version', 'pos_tag': 'PRP', 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 6: {'canonical': 'would', 'normalized': 'would', 'pos_tag': 'MD', 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 7: 