# Sentiment Analysis of News Articles Using NLTK

### 1. Importing Libraries

In [252]:
import pandas as pd
import random
from nltk.stem import WordNetLemmatizer
import nltk
from nltk import pos_tag
import numpy as np
import random
import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

In [188]:
# Creating a wordnet lemmatizer instance
lemmatizer=WordNetLemmatizer()
#Getting all stopwords from corpus and including punctuations 
STOP_WORDS= set(stopwords.words("english")+list(string.punctuation) + ["’","—",'“','”','``',"''",'©',"'s" ])
STOP_WORDS

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 "'s",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '``',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'm

### 2. Reading prepared Dataset

In [189]:
# Reading dataset with sentiment classification and 
master=pd.read_csv("../Datasets/master_sentiment.csv")
# Dropping 17 and 36 as they have NaN articles
master_rem_na=master.drop([17,36])
master_rem_na.reset_index(inplace=True, drop=True)
master_rem_na.head(5)

Unnamed: 0,link,published,title,text,summary,keywords,sentiment
0,https://www.washingtonpost.com/news/worldviews...,"Sun, 1 Jul 2018 10:03:49 GMT",Here’s what you need to know about Mexico’s pr...,\n\nMexican presidential candidate Andrés Manu...,Mexican presidential candidate Andrés Manuel L...,"obrador, president, need, heres, know, trump, ...",0
1,https://www.cnn.com/2018/07/01/asia/china-aust...,"Sun, 1 Jul 2018 12:28:00 GMT",Thailand cave search: Divers close in on missi...,Chiang Rai (CNN) China and Australia have join...,Chiang Rai (CNN) China and Australia have join...,"team, coach, close, rescue, missing, boys, cav...",1
2,https://www.yahoo.com/news/n-korea-aiming-hide...,"Sun, 1 Jul 2018 10:15:23 GMT",N. Korea aiming to hide ongoing nuclear produc...,The assessment comes on the heels of a landmar...,Over the weekend NBC News first reported that ...,"ongoing, weapons, n, nuclear, hide, washington...",0
3,https://www.washingtonpost.com/news/worldviews...,"Sat, 30 Jun 2018 22:41:15 GMT",Read U.S. ambassador to Estonia's resignation ...,\n\nJames D. Melville Jr. addresses dignitarie...,James D. Melville Jr. addresses dignitaries in...,"president, resignation, melville, read, estoni...",0
4,https://www.yahoo.com/news/rebels-resume-peace...,"Sun, 1 Jul 2018 11:35:00 GMT",Jordan seeks truce for southwest Syria after a...,Trucks loaded with humanitarian supplies to be...,Trucks loaded with humanitarian supplies to be...,"seeks, rebel, towns, states, syria, army, unit...",0


### 3. Cleaning the Data

In [190]:
# This function is to convert from wordet POS_Tag to WordNetLemmatizer format
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return wn.NOUN

# This function counts the articles with the same title and removes the repeated articles.   
def remove_repeat_articles(articles): 
    count_and_index=dict()
    for index, row in articles.iterrows():
        try:
            count_and_index[ row["title"] ]["count"]+=1
            count_and_index[ row["title"] ]["indices"].append(index)
        except:
            count_and_index[ row["title"] ]=dict()
            count_and_index[ row["title"] ]["count"]=1
            count_and_index[ row["title"] ]["indices"]=[index]
    droplist=list()
    for key, element in count_and_index.items():
        if element["count"]>1:
            droplist.extend( element["indices"][1:] )
  
    master_clean = articles.drop(master.index[droplist])
    master_clean.reset_index(inplace=True, drop=True)
    
    return master_clean
  

master_remove_repeat = remove_repeat_articles(master_rem_na)



### 4. Preliminary Processing (Tokenization and Lemmatization)

In [191]:
# This function tokenizes title and article into words and removes stop words and punctutations
def tokenization_remove_stopwords(text):
    words_tokenized= word_tokenize(text)
    words_sans_stopwords = [w.lower() for w in words_tokenized if w not in STOP_WORDS]
    return words_sans_stopwords
#Finding pos_tag for each word and lemmatization using pos_tag as an attribute
def pos_lemmatization(text):
    text_pos=pos_tag(text)
    text_wn_pos=[(x,get_wordnet_pos(y)) for x,y  in text_pos]
    text_lemmatized = [ lemmatizer.lemmatize(x,pos=y) for x,y in text_wn_pos]
    return text_lemmatized


In [216]:
# In this cell, we get the processed text and title. We then club them together in a tuple using the zip function.
print(title_tokenized[0:10])
# First we tokenize and remove stopwords and punctuations
text_tokenized =master_remove_repeat["text"].apply(tokenization_remove_stopwords) 
title_tokenized =  master_remove_repeat["title"].apply(tokenization_remove_stopwords) 
labels=master_remove_repeat["sentiment"]
# Then we lemmatize the processed sentences
title_lemmatized = [pos_lemmatization(title) for title in title_tokenized]
text_lemmatized = [pos_lemmatization(article) for article in text_tokenized]

# We zip the title and text into tuples
title_text_labels_lemmatized=list( zip(title_lemmatized, text_lemmatized, labels) )

0    [here, need, know, mexico, presidential, elect...
1    [thailand, cave, search, divers, close, missin...
2    [n., korea, aiming, hide, ongoing, nuclear, pr...
3    [read, u.s., ambassador, estonia, resignation,...
4    [jordan, seeks, truce, southwest, syria, army,...
5    [redoine, faid, paris, helicopter, prison, bre...
6    [thousands, protest, hong, kong, anniversary, ...
7    [11, family, members, are, found, dead, near, ...
8    [american, tourist, dead, bahamas, boat, explo...
9    [iran, eyes, private, oil, exports, help, beat...
Name: title, dtype: object


### 5. Feature Engineering

In [222]:
# A function that outputs a list of all words in the dataset
def all_words(lemmatized_dataset):
    return [word for text in lemmatized_dataset for word in text]


text_lemmatized_all = all_words(text_lemmatized)
title_lemmatized_all= all_words(title_lemmatized)
text_lemmatized_all_freq = nltk.FreqDist(text_lemmatized_all)
word_features= [x for x,y in text_lemmatized_all_freq.most_common(3000)]

In [223]:
# Function that finds feature values in a document
def find_features(document):
    words= set(document)
    features={}
    for w in word_features:
        features[w] = (w in words)
    return features
print( find_features(text_lemmatized[0]) )

{'say': True, 'the': True, 'i': False, '2018': False, 'a': True, 'year': True, 'like': True, 'u': False, 'people': True, 'one': False, 'go': True, 'day': False, 'mr': False, 'make': True, 'take': True, 'it': True, 'get': False, 'time': True, 'new': False, 'we': False, 'tell': False, 'nine': True, 'he': True, 'help': True, 'digital': False, 'good': True, 'ltd': False, 'pty': False, 'news': False, 'share': False, 'also': True, 'child': False, 'post': False, 'come': True, 'first': True, 'would': True, 'two': False, "n't": False, 'may': True, 'june': False, 'work': True, 'woman': False, 'm': False, 'family': True, 'see': True, 'life': False, '–': False, 'know': True, 'president': True, 'give': True, 'story': False, 'use': False, 'show': False, 'in': True, 'world': True, 'last': True, 'could': True, 'police': True, 'back': False, 'since': True, 'want': True, 'week': False, 'find': True, 'you': False, 'country': True, 'facebook': False, 'way': False, 'trump': True, 'but': True, 'home': False

In [275]:
# Creating featureset using article words and labels 
# and classifying whether article contains the word in word_features (True or False)
featuresets=[(find_features(text), label) for (title, text, label) in title_text_labels_lemmatized]

In [278]:
# Shuffling featuresets
np.random.seed(111)
np.random.shuffle(featuresets)

In [277]:
# Preparing traing
training_set=featuresets[:1200]
testing_set=featuresets[1200:]

In [281]:
classifier= nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier,training_set) ) *100 )

Naive Bayes Algo accuracy percent: 93.75


In [283]:
classifier.show_most_informative_features(100)

Most Informative Features
               agreement = True                0 : 1      =    168.4 : 1.0
                      eu = True                0 : 1      =    116.6 : 1.0
                 spiegel = True                0 : 1      =     80.3 : 1.0
               tolerance = True                0 : 1      =     75.1 : 1.0
                    u.s. = True                0 : 1      =     74.2 : 1.0
                  import = True                0 : 1      =     70.0 : 1.0
                  merkel = True                0 : 1      =     70.0 : 1.0
                  export = True                0 : 1      =     64.8 : 1.0
                  senate = True                0 : 1      =     64.8 : 1.0
                     der = True                0 : 1      =     59.6 : 1.0
              right-wing = True                0 : 1      =     59.6 : 1.0
               afp/getty = True                0 : 1      =     54.4 : 1.0
             policewoman = True                0 : 1      =     54.4 : 1.0

In [288]:
160/1371

0.11670313639679067