# Sentiment Analysis of News Articles Using NLTK

### 1. Importing Libraries

In [113]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
import nltk
import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

In [135]:
STOP_WORDS= set(stopwords.words("english")+list(string.punctuation))

### 2. Reading prepared Dataset

In [116]:
master=pd.read_csv("../Datasets/master_sentiment.csv")

### 3. Cleaning the Data

#### 3.1. Checking repeat articles

In [134]:
def remove_repeat_articles(articles): 
    count_and_index=dict()
    for index, row in master.iterrows():
        try:
            count_and_index[ row["title"] ]["count"]+=1
            count_and_index[ row["title"] ]["indices"].append(index)
        except:
            count_and_index[ row["title"] ]=dict()
            count_and_index[ row["title"] ]["count"]=1
            count_and_index[ row["title"] ]["indices"]=[index]

    for key, element in count_and_index.items():
        if element["count"]>1:
            droplist.extend( element["indices"][1:] )
  
    master_clean = master.drop(master.index[droplist])
    master_clean.reset_index(inplace=True, drop=True)
    
    return master_clean
  

master_remove_repeat = remove_repeat_articles(master)

master_remove_repeat.head()

Unnamed: 0,link,published,title,text,summary,keywords,sentiment
0,https://www.washingtonpost.com/news/worldviews...,"Sun, 1 Jul 2018 10:03:49 GMT",Here’s what you need to know about Mexico’s pr...,\n\nMexican presidential candidate Andrés Manu...,Mexican presidential candidate Andrés Manuel L...,"obrador, president, need, heres, know, trump, ...",0
1,https://www.cnn.com/2018/07/01/asia/china-aust...,"Sun, 1 Jul 2018 12:28:00 GMT",Thailand cave search: Divers close in on missi...,Chiang Rai (CNN) China and Australia have join...,Chiang Rai (CNN) China and Australia have join...,"team, coach, close, rescue, missing, boys, cav...",1
2,https://www.yahoo.com/news/n-korea-aiming-hide...,"Sun, 1 Jul 2018 10:15:23 GMT",N. Korea aiming to hide ongoing nuclear produc...,The assessment comes on the heels of a landmar...,Over the weekend NBC News first reported that ...,"ongoing, weapons, n, nuclear, hide, washington...",0
3,https://www.washingtonpost.com/news/worldviews...,"Sat, 30 Jun 2018 22:41:15 GMT",Read U.S. ambassador to Estonia's resignation ...,\n\nJames D. Melville Jr. addresses dignitarie...,James D. Melville Jr. addresses dignitaries in...,"president, resignation, melville, read, estoni...",0
4,https://www.yahoo.com/news/rebels-resume-peace...,"Sun, 1 Jul 2018 11:35:00 GMT",Jordan seeks truce for southwest Syria after a...,Trucks loaded with humanitarian supplies to be...,Trucks loaded with humanitarian supplies to be...,"seeks, rebel, towns, states, syria, army, unit...",0


### 4. Preliminary Processing 

In [139]:
def tokenization_remove_stopwords(text):
    words_tokenized= word_tokenize(text)
    words_sans_stopwords = [w for w in words_tokenized if w not in STOP_WORDS]
    return words_sans_stopwords

In [140]:
sample=master.loc[1,"text"]
sample_refine= tokenization_remove_stopwords(sample)
sample_refine

['Chiang',
 'Rai',
 'CNN',
 'China',
 'Australia',
 'joined',
 'search',
 '12',
 'boys',
 'missing',
 'Thai',
 'cave',
 'divers',
 'Sunday',
 'closed',
 'spot',
 'believe',
 'teens',
 'sheltering',
 'The',
 'international',
 'rescue',
 'operation',
 '--',
 'includes',
 '1,000',
 'people',
 'Thai',
 'emergency',
 'services',
 'US',
 'Military',
 'British',
 'cave',
 'experts',
 '--',
 'ramping',
 'efforts',
 'since',
 'boys',
 '25-year-old',
 'soccer',
 'coach',
 'disappeared',
 'outing',
 'caves',
 'northern',
 'Thailand',
 'eight',
 'days',
 'ago',
 'The',
 'missing',
 'boys',
 'coach',
 'seen',
 'photo',
 'taken',
 'coach',
 "'s",
 'Facebook',
 'page',
 'Divers',
 'closing',
 'spot',
 'believe',
 'missing',
 'boys',
 'sheltering',
 'Tham',
 'Luang',
 'Nang',
 'Non',
 'cave',
 'system',
 'The',
 'elevated',
 'dry',
 'area',
 'called',
 'Pattaya',
 'Beach',
 'several',
 'kilometers',
 'entrance',
 'cave',
 'Water',
 'flooded',
 'cave',
 'receded',
 'recent',
 'days',
 'allowing',
 'res

In [70]:
sample

"Chiang Rai (CNN) China and Australia have joined the search for 12 boys missing in a Thai cave, as divers on Sunday closed in on the spot they believe the teens are sheltering.\n\nThe international rescue operation -- which includes over 1,000 people from Thai emergency services, the US Military and British cave experts -- has been ramping up its efforts since the boys and their 25-year-old soccer coach disappeared during an outing in the caves in northern Thailand eight days ago.\n\nThe missing boys, with their coach, are seen here in a photo taken from the coach's Facebook page.\n\nDivers are now closing in on the spot where they believe the missing boys are sheltering in the Tham Luang Nang Non cave system.\n\nThe elevated dry area, called Pattaya Beach, is several kilometers from the entrance of the cave.\n\nWater from the flooded cave has receded in recent days, allowing rescue teams to gain ground, according to Chiang Rai province governor Narongsak Osatanakorn.\n\nRead More"

In [87]:
stop_words= set(stopwords.words("english")+list(string.punctuation))
words_without_stopwords= [w for w in words_sample if w not in stop_words]
len(words_without_stopwords)

106