# Problem Statement:
### Please go through https://somegreymatters.wordpress.com/2016/12/08/sentiment-and-emotion-analysis-of-tweets-regarding-demonetization/
You can see tags associated with the post (Tags: analysis, blackmoney, demonetization, emotion, noteban, sentiment, twitter). Your task is to re-generate these tags and maybe some more relevant ones for the same blog post using NLP/ML techniques in the programming language you are comfortable working with. 

In [7]:

from nltk.corpus import wordnet as wn
#for pulling data out of HTML and XML files.
from bs4 import BeautifulSoup

#HTTP library for pulling pushing and authenticating
import requests

#for Regular expression operations
import re

#comparison, addition, greater than less then
import operator

#system calls, deal with user arguments
import sys

#list of common stop words various languages like the
from stop_words import get_stop_words
import nltk
import string

# 1. Web scraping to fetch the data

In [61]:

#get the words
def getWordList(url):
    word_list = []
    #raw data
    source_code = requests.get(url)
    #convert to text
    plain_text = source_code.text
    #lxml format
    soup = BeautifulSoup(plain_text,'lxml')

    #find the words in paragraph tag
    for text in soup.findAll('p'):
        if text.text is None:
            continue
        #content
        content = text.text
        #lowercase and split into an array
        words = content.lower().split()

        #for each word
        for word in words:
            #remove non-chars
            cleaned_word = clean_word(word)
            #if there is still something there
            if len(cleaned_word) > 0:
                #add it to our word list
                word_list.append(cleaned_word)

    return word_list


#clean word with regex
def clean_word(word):
    cleaned_word = re.sub('[0-9#/+/%/-]+', '', word)
    return cleaned_word

url = "https://somegreymatters.wordpress.com/2016/12/08/sentiment-and-emotion-analysis-of-tweets-regarding-demonetization/"

#try-except block. simple way to deal with exceptions 
#great for HTTP requests
try:
    response = requests.get(url)
    page_word_list = getWordList(url)
    stop_words = get_stop_words('en')

    temp_list = [] #to get word list present on website
    for word in page_word_list:
            if word not in stop_words:
                temp_list.append(str(word))
    print("TEMP_LIST::", str(temp_list))
    print(len(temp_list))
    text = " ".join(str(x) for x in temp_list)
    
#throw an exception in case it breaks
except requests.exceptions.Timeout:
    print("The server didn't respond. Please, try again later.")

('TEMP_LIST::', "['nirmal', 'kumar', 's,', 'sakthi', 'balan', 'm,', 'pushkal', 'agarwal,', 'lokesh', 'todwal', 'department', 'computer', 'science', 'engineering,', 'lnmiit,', 'jaipur,', 'india', 'collected', 'tweets', 'regarding', 'three', 'hashtags', 'blackmoney,', 'indiafightscorruption,', 'blackmoneycleanup', 'th', 'november', 'th', 'november', '.', 'got', 'around', '.', 'lakh', 'tweets', 'blackmoney', 'posted', 'around', 'users,', 'around', 'lakh', 'tweets', 'indiafightscorruption', 'posted', 'around', 'users,', 'around', 'tweets', 'blackmoneycleanup', 'posted', 'around', 'users.', 'makes', 'total', 'around', '.', 'lakh', 'tweets', 'posted', 'around', '.', 'lakh', 'users.', 'entire', 'tweets', 'blackmoneycleanup', 'representative', 'random', 'sample', 'total', 'tweets', 'two', 'hashtags', 'taken', 'analysis.', 'outcomes', 'analysis.', 'number', 'tweets', 'positive,', 'negative', 'neutral', 'polarities', 'given', 'table', 'plotted', 'figure', '.', 'table', 'polaritywise', 'distribut

In [62]:
text

'nirmal kumar s, sakthi balan m, pushkal agarwal, lokesh todwal department computer science engineering, lnmiit, jaipur, india collected tweets regarding three hashtags blackmoney, indiafightscorruption, blackmoneycleanup th november th november . got around . lakh tweets blackmoney posted around users, around lakh tweets indiafightscorruption posted around users, around tweets blackmoneycleanup posted around users. makes total around . lakh tweets posted around . lakh users. entire tweets blackmoneycleanup representative random sample total tweets two hashtags taken analysis. outcomes analysis. number tweets positive, negative neutral polarities given table plotted figure . table polaritywise distribution tweets polarity negative figure plot polaritywise distribution tweets positive dominant polarity. means majority people tweeted favor demonetization. second dominant polarity negative, number tweets negative polarity less half tweets positive polarity. interestingly, hashtags show tr

# 2. Some more relevant tags for the same blog post using NLP/ML technique
## (a)Used Rapid Automatic Keyword Extraction (RAKE) algorithm that extracts keywords from text, by identifying runs of non-stopwords and then scoring these phrases across the document.

In [71]:
def isPunct(word):
    return len(word) == 1 and word in string.punctuation

def isNumeric(word):
    try:
        float(word) if '.' in word else int(word)
        return True
    except ValueError:
        return False
    
def _generate_candidate_keywords(sentences):
    phrase_list = []
    stopwords = stop_words = get_stop_words('en') #set(nltk.corpus.stopwords.words())
    for sentence in sentences:
        words = map(lambda x: "|" if x in stopwords else x, nltk.word_tokenize(sentence.lower()))
        phrase = []
        for word in words:
            if not(word == "|" and isPunct(word)):
                phrase_list.append(word)
    return phrase_list


def _calculate_word_scores(phrase_list):
    word_freq = nltk.FreqDist()
    word_degree = nltk.FreqDist()
    for phrase in phrase_list:
        degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
        for word in phrase:
            word_freq[word]+=1
            word_degree[word]+= degree # other words
    for word in word_freq.keys():
        word_degree[word] = word_degree[word] + word_freq[word] # itself
        # word score = deg(w) / freq(w)
    word_scores = {}
    for word in word_freq.keys():
        word_scores[word] = word_degree[word] / word_freq[word]
    return word_scores

def _calculate_phrase_scores(phrase_list, word_scores):
    phrase_scores = {}
    for phrase in phrase_list:
        phrase_score = 0
        for word in phrase:
            phrase_score += word_scores[word]
        phrase_scores["".join(phrase)] = phrase_score
    return phrase_scores

def extract(text, incl_scores=False):
    top_fraction = 1 # consider top third candidate keywords by score
    sentences = nltk.sent_tokenize(text)
    phrase_list = _generate_candidate_keywords(sentences)
    word_scores = _calculate_word_scores(phrase_list)
    phrase_scores = _calculate_phrase_scores(phrase_list, word_scores)
    sorted_phrase_scores = sorted(phrase_scores.iteritems(), key=operator.itemgetter(1), reverse=True)
    n_phrases = len(sorted_phrase_scores)
    print(n_phrases)
    if incl_scores:
        return sorted_phrase_scores[0:int(n_phrases/top_fraction)]
    else:
        return map(lambda x: x[0], sorted_phrase_scores[0:int(n_phrases/top_fraction)])

In [72]:
Textracted = extract(text, incl_scores=True)
Fextracted = extract(text)
print(Fextracted)

127
127
['indiafighytscorruption', 'indiafightscorruption', 'blackmoneycleanup', 'demonetization', 'characteristic', 'representative', 'interestingly', 'distribution', 'polaritywise', 'wordpress.com', 'respectively', 'combination', 'considering', 'engineering', 'emotionwise', 'connecting', 'blackmoney', 'commenting', 'polarities', 'department', 'regarding', 'different', 'collected', 'emotional', 'polarity', 'proposed', 'dominant', 'facebook', 'computer', 'accepted', 'positive', 'emotions', 'outcomes', 'analysis', 'comments', 'majority', 'november', 'negative', 'account', 'science', 'emotion', 'hashtags', 'figures', 'plotted', 'pushkal', 'disgust', 'neutral', 'details', 'twitter', 'notify', 'agarwal', 'around', 'lnmiit', 'random', 'tweeted', 'express', 'however', 'nirmal', 'figure', 'people', 'second', 'number', 'jaipur', 'entire', 'posted', 'widely', 'google', 'trends', 'change', 'sakthi', 'todwal', 'sample', 'tables', 'lokesh', 'click', 'india', 'showed', 'basic', 'tweets', 'using', '

In [73]:
top_fraction = 1 # consider top third candidate keywords by score
sentences = nltk.sent_tokenize(text)
phrase_list = _generate_candidate_keywords(sentences)
word_scores = _calculate_word_scores(phrase_list)
phrase_scores = _calculate_phrase_scores(phrase_list, word_scores)

In [75]:
_calculate_word_scores(phrase_list)

{'(': 1,
 ')': 1,
 ',': 1,
 '.': 1,
 ':': 1,
 'a': 8,
 'b': 9,
 'c': 10,
 'd': 9,
 'e': 7,
 'f': 9,
 'g': 8,
 'h': 7,
 'i': 10,
 'j': 4,
 'k': 9,
 'l': 8,
 'm': 8,
 'n': 9,
 'o': 9,
 'p': 10,
 'r': 9,
 's': 7,
 't': 8,
 'u': 9,
 'v': 7,
 'w': 6,
 'x': 5,
 'y': 10,
 'z': 14}

In [76]:
_calculate_phrase_scores(phrase_list, word_scores)

{'(': 1,
 ')': 1,
 ',': 1,
 '.': 1,
 ':': 1,
 'accepted': 69,
 'account': 63,
 'agarwal': 55,
 'also': 32,
 'analysis': 67,
 'anger': 41,
 'around': 53,
 'balan': 42,
 'basic': 44,
 'blackmoney': 87,
 'blackmoneycleanup': 148,
 'can': 27,
 'case': 32,
 'change': 49,
 'characteristic': 121,
 'click': 47,
 'collected': 76,
 'combination': 99,
 'commenting': 86,
 'comments': 66,
 'computer': 70,
 'connecting': 89,
 'considering': 97,
 'demonetization': 125,
 'department': 83,
 'details': 57,
 'different': 77,
 'disgust': 58,
 'distribution': 107,
 'dominant': 70,
 'ekman': 41,
 'email': 41,
 'emotion': 60,
 'emotional': 76,
 'emotions': 67,
 'emotionwise': 90,
 'engineering': 93,
 'entire': 50,
 'express': 52,
 'facebook': 70,
 'far': 26,
 'favor': 42,
 'fear': 33,
 'figure': 52,
 'figures': 59,
 'fill': 35,
 'given': 41,
 'google': 49,
 'got': 25,
 'half': 32,
 'hashtags': 60,
 'however': 52,
 'human': 41,
 'icon': 38,
 'india': 46,
 'indiafightscorruption': 187,
 'indiafighytscorruption

### (b) Using Word Ranking: It ranks the words of a text file according to their frequency. It puts the words in a dictionary; each word is the key, and the frequency is the value. It then prints the dictionary in ascending order of frequency.

In [83]:


text = text
l = {}
for word in text.split():
    # make everything lowercase
    word = word.lower()
    
    # check for words in () or []
    if word.startswith('[') or word.startswith('('):
        word = word[1:]
    if word.endswith(']') or word.endswith(')'):
        word = word[:-1]
        
    # check for words that end with punctuation
    if word.endswith('.') or word.endswith(',') or word.endswith(';') or word.endswith(':'):
        word = word[:-1]
    # if word is in dictionary, increment the value
    # otherwise add the word to dictionary with value 1
    if word in l:
        l[word] += 1
    else:
        l[word] = 1
        
# this prints the dict out sorted by value
for key, value in sorted(l.iteritems(), key=lambda (k,v): (v,k), reverse=True):
    print '%s: %s' % (key, value)

tweets: 27
: 16
distribution: 10
negative: 9
polaritywise: 8
around: 8
table: 7
figure: 7
dominant: 7
blackmoneycleanup: 7
number: 6
hashtags: 6
emotional: 6
blackmoney: 6
show: 5
positive: 5
polarity: 5
log: 5
joy: 5
indiafightscorruption: 5
using: 4
users: 4
second: 4
posted: 4
neutral: 4
lakh: 4
emotions: 4
emotion: 4
commenting: 4
change: 4
anger: 4
account: 4
three: 3
plotted: 3
less: 3
combination: 3
trend: 2
total: 2
th: 2
see: 2
s: 2
respectively: 2
november: 2
here: 2
half: 2
given: 2
express: 2
emotionwise: 2
different: 2
can: 2
analysis: 2
wordpress.com: 1
widely: 1
via: 1
two: 1
twitter: 1
tweeted: 1
trends: 1
todwal: 1
taken: 1
tables: 1
six: 1
shown: 1
showed: 1
science: 1
sample: 1
sakthi: 1
representative: 1
regarding: 1
random: 1
pushkal: 1
proposed: 1
polarities: 1
plot: 1
people: 1
paul: 1
outcomes: 1
notify: 1
nirmal: 1
new: 1
much: 1
means: 1
makes: 1
majority: 1
m: 1
lokesh: 1
lnmiit: 1
kumar: 1
jaipur: 1
interestingly: 1
indiafighytscorruption: 1
india: 1
in: 1
i

# Extra effort
## 3. Function that will be used to find similarity/ relevancy

In [78]:


##LCH_SIMILARITY
def sim1(word1, word2, lch_threshold=2.00):
    """To find words are similar/relevant or not.
       lch_threshold: can change it to have different kind of result. More LCH, more is the similarity. 
                      Value is very application dependent.
    """
    results = []
    for net1 in wn.synsets(word1):
        for net2 in wn.synsets(word2):
            try:
                lch = net1.lch_similarity(net2)
            except:
                continue
                
            if lch >= lch_threshold:
                results.append([net1, net2])
    if not results:
        return False
    #print(results)
    return True


##PATH_SIMILARITY
def sim2(word1, word2, path_threshold=0.5):
    """To find words are similar/relevant or not.
       lch_threshold: can change it to have different kind of result. More LCH, more is the similarity. 
                      Value is very application dependent.
    """
    results = []
    for net1 in wn.synsets(word1):
        for net2 in wn.synsets(word2):
            try:
                lch = net1.path_similarity(net2)
            except:
                continue
                
            if lch >= path_threshold:
                results.append([net1, net2])
    if not results:
        return False
    #print(results)
    return True


##WUP_SIMILARITY
def sim3(word1, word2, wup_threshold=0.5):
    """To find words are similar/relevant or not.
       lch_threshold: can change it to have different kind of result. More LCH, more is the similarity. 
                      Value is very application dependent.
    """
    results = []
    for net1 in wn.synsets(word1):
        for net2 in wn.synsets(word2):
            try:
                lch = net1.wup_similarity(net2)
            except:
                continue
            
            if lch >= wup_threshold:
                results.append([net1, net2])
    if not results:
        return False
    #print(results)
    return True

In [79]:
tags=Fextracted[0:6]

'indiafightscorruption'

In [80]:
print("::::::::::::Relevant words from the given document is:::::::::")
for tag in tags:
    l=[]
    for word in set(temp_list):
        if(sim1(tag, word) and tag != word):
                l.append(word)
    print(tag+":"+str(l))

::::::::::::Relevant words from the given document is:::::::::
indiafighytscorruption:[]
indiafightscorruption:[]
blackmoneycleanup:[]
demonetization:['change']
characteristic:['lakh', 'half', 'favor', 'change', 's', 'number', 'three', 'total', 'figure', 'two', 'six', 'department', 'figures']
representative:['details', 'science', 'computer', 'joy', 'neutral', 'case', 'sample']
