<a href="https://colab.research.google.com/github/ocampos16/AdvancedProject1/blob/master/Environmental%20Policies%20Hashtag%20Team/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## download libraries

In [2]:
!pip install emoji --upgrade
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/40/8d/521be7f0091fe0f2ae690cc044faf43e3445e0ff33c574eae752dd7e39fa/emoji-0.5.4.tar.gz (43kB)
[K     |███████▌                        | 10kB 17.3MB/s eta 0:00:01[K     |███████████████                 | 20kB 1.8MB/s eta 0:00:01[K     |██████████████████████▋         | 30kB 2.2MB/s eta 0:00:01[K     |██████████████████████████████▏ | 40kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 1.7MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.5.4-cp36-none-any.whl size=42176 sha256=e4f1f04666d924876fa4a2c6e2584c6d2d6ae69718f40e7c2988fda941f38eb5
  Stored in directory: /root/.cache/pip/wheels/2a/a9/0a/4f8e8cce8074232aba240caca3fade315bb49fac68808d1a9c
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.5.4
[nltk_data] Downloading package sto

True

## Import necessary libraries for data cleaning, sentiment analysis and word embedding

In [0]:
import re
import json
import emoji

import pickle
from datetime import datetime

import numpy as np
import pandas as pd

from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet as wn, sentiwordnet as swn
from nltk.stem import WordNetLemmatizer

## Define dictionary for contractions of words

In [0]:
# Dictionary of contarctions. source for contractions: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
contractions_dict = {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have"
        }

##Read json formatted raw data as a DataFrame

In [5]:
# Input data files are available in the "dataset/" directory.
# Read the input raw data and parse only the necessary columns from the json file.

def populate_tweet_df(tweets):
    """
    This function takes tweets list as argument and returns a dataframe of the tweets.
    """
    df = pd.DataFrame()
    df['date'] = [datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y') for tweet in tweets if (tweet['lang'] == 'en')]
    df['text'] = [tweet['text'] for tweet in tweets if (tweet['lang'] == 'en')]
    # df['text'] = list(map(lambda tweet:tweet['text'], filter(lambda tweet: tweet['lang']=='en', tweets)))
    return df

if __name__ == '__main__':
    tweet_file = ("/content/drive/My Drive/dataScience/projects/twitterSentimentAnalysis/dataset/tweets_parisagreement_09-03-2020.txt")
    tweets = []
    with open(tweet_file, 'r') as file:
        for line in file.readlines():
            tweets.append(json.loads(line))
    tweets_df = populate_tweet_df(tweets)
    print("Executed Successfully")


Executed Successfully


In [7]:
tweets_df.head()

Unnamed: 0,date,text
0,2020-03-09 13:49:46+00:00,One need World to make #MAGA .\nWe need #Paris...
1,2020-03-09 13:26:18+00:00,SLOW DOWN\nGlobal #CO2 emissions from the #pow...
2,2020-03-09 13:21:51+00:00,RT @UNDPClimate: This Wednesday 11 March Switz...
3,2020-03-09 13:21:42+00:00,RT @UNDPClimate: This Wednesday 11 March Switz...
4,2020-03-09 13:20:13+00:00,This Wednesday 11 March Switzerland &amp; Ghan...


##Data Cleaning

In [6]:
# Setting stopwords
all_stopwords = set(stopwords.words('english'))
exclude_stopwords = ['no', 'not']
for sw in exclude_stopwords:
    all_stopwords.remove(sw)

# Initializing Lemmatizer
lemmatizer = WordNetLemmatizer()

def toLowerCase(text):
    '''
    Returns the text in lowercase.
    '''
    return text.lower()


def removeUrls(text):
    # return re.sub('((www\.[^\s]+)|(https?://[^\s]+))','', text)
    return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b','', text)

def removeSpecialChar(text):
    '''
    Removes special characters which are generally found in tweets.
    '''
    # Convert @username to empty strings
    text = re.sub('@[^\s]+', '', text)

    # Remove 'RT' from retweets
    text = re.sub(r'\brt\b', '', text)

    # Remove additional white spaces
    text = re.sub('[\s]+', ' ', text)

    # Replace #hastags with word
    text = re.sub(r'#([^\s]+)', r'\1', text)

    # Trims the tweet
    text = text.strip('\'"')

    return text


def removeNonAlpha(text):
    '''
    Remove all characters which are not alphabets, numbers or whitespaces.
    '''
    return re.sub('[^A-Za-z0-9 ]+', '', text)


def removeStopWords(text):
    # Remove stop words using NLTK's list of stop words in the corpus module. 
    tokens_without_sw = [word for word in word_tokenize(text) if not word in all_stopwords]

    return(' '.join(tokens_without_sw))


def handleEmojis(text):
    '''
    This function handles sentiments expressed through emoticons by using Python's 'emoji package'.
    Replace all emoticons with the expression they represent in plain English.
    '''
    text = emoji.demojize(text)
    text = text.replace(":"," ")
    text = text.replace("_"," ")
    text = ' '.join(text.split())
    return text


def handleContractions(text):
    '''
    expand shortened words, e.g. convert "don't" to "do not"
    '''
    contracted_tokens = [contractions_dict[word] if word in contractions_dict else word for word in word_tokenize(text)]
    return " ".join(contracted_tokens)


def get_wordnet_pos(word):
    '''
    Map POS tag to the format wordnet lemmatizer would accept. Returning 'NOUN' as default tag.
    ADJ: adjective,    ADP: adposition,    ADV: adverb,    CONJ: conjunction	
    DET: determiner,article,    NOUN: noun,    NUM: numeral,    PRT: particle
    PRON: pronoun,    VERB: verb,    .: punctuation marks	. , ; !,    X: other	
    '''
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wn.ADJ,
                "N": wn.NOUN,
                "V": wn.VERB,
                "R": wn.ADV}

    return tag_dict.get(tag, wn.NOUN)


def wordLemmatization(text):
    '''
    Lemmatizing the text with the appropriate POS tag
    result: ['he', 'kept', 'eat', 'while', 'we', 'be', 'talk']
    '''
    lemma_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in word_tokenize(text)]
    return " ".join(lemma_tokens)

def processTweets(tweet):
    '''
    Process the raw tweets and convert them into usable data for sentiment analysis. 
    '''
    # Convert to lower case
    tweet = toLowerCase(tweet)

    # Remove urls
    tweet = removeUrls(tweet)
    
    # Replacing contractions with full words
    tweet = handleContractions(tweet)
    
    # Replace emoticons with word expressions
    tweet = handleEmojis(tweet)
    
    # Remove all Special characters
    tweet = removeSpecialChar(tweet)
    
    # Remove non alpha characters
    tweet = removeNonAlpha(tweet)
    
    # converting a word to its base form
    tweet = wordLemmatization(tweet)
    
    # Remove stop words
    tweet = removeStopWords(tweet)

    return tweet

if __name__ == '__main__':
    tweets = tweets_df['text']
    processed_tweets = []
    processed_df = pd.DataFrame(columns=['date','text','clean_text'])
    for tweet in tweets:
        processed_tweets.append(processTweets(tweet))
    processed_df['date'] = tweets_df['date']
    processed_df['text'] = tweets_df['text']
    processed_df['clean_text'] = processed_tweets
    print("Executed Successfully")

Executed Successfully


###Save processed dataframe for easy reusability

In [0]:
processed_df_out = open("/content/drive/My Drive/dataScience/projects/twitterSentimentAnalysis/dataset/processed_df.pickle","wb")
pickle.dump(processed_df, processed_df_out)

In [7]:
processed_df.head()

Unnamed: 0,date,text,clean_text
0,2020-03-09 13:49:46+00:00,One need World to make #MAGA .\nWe need #Paris...,one need world make maga need parisagreement m...
1,2020-03-09 13:26:18+00:00,SLOW DOWN\nGlobal #CO2 emissions from the #pow...,slow global co2 emission power sector fell 2 l...
2,2020-03-09 13:21:51+00:00,RT @UNDPClimate: This Wednesday 11 March Switz...,undpclimate wednesday 11 march switzerland amp...
3,2020-03-09 13:21:42+00:00,RT @UNDPClimate: This Wednesday 11 March Switz...,undpclimate wednesday 11 march switzerland amp...
4,2020-03-09 13:20:13+00:00,This Wednesday 11 March Switzerland &amp; Ghan...,wednesday 11 march switzerland amp ghana share...


## Word Embedding and Reaplcing words with centroid of the clusters of word vectors

In [0]:
# Import word2vec model
from gensim.models import Word2Vec, KeyedVectors

In [9]:
model = KeyedVectors.load_word2vec_format('/content/drive/My Drive/dataScience/projects/twitterSentimentAnalysis/dataset/GoogleNews-vectors-negative300.bin.gz', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
train_data = processed_df['clean_text']
sentences = [word_tokenize(tweet) for tweet in train_data]

words_filtered = []
word_vectors = []

for words in sentences:
    for word in words:
        if(word in model.vocab):
            word_vectors.append(model[word])
            words_filtered.append(word)

In [0]:
# word_centroid_map = pd.DataFrame(np.array(vector_list), words_filtered)
# word_centroid_map.head()

In [12]:
word_vec_dict = dict(zip(words_filtered, word_vectors))
len(word_vec_dict)

2089

In [0]:
pickle_out = open("/content/drive/My Drive/dataScience/projects/twitterSentimentAnalysis/dataset/word_vec_dict.pickle","wb")
pickle.dump(word_vec_dict, pickle_out)

In [0]:
pickle_in = open("/content/drive/My Drive/dataScience/projects/twitterSentimentAnalysis/dataset/word_vec_dict.pickle", "rb")
word_vec_dict = pickle.load(pickle_in)

In [0]:
word_vectors = list(word_vec_dict.values())

In [0]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [0]:
num_of_clusters = len(word_vectors)//3

# create kmeans object
kmeans = KMeans(n_clusters=num_of_clusters, random_state=42)

# fit kmeans object to data
km = kmeans.fit(word_vectors)

In [0]:
# # save new clusters
# y_km = kmeans.fit_predict(word_vectors)

In [0]:
word_centroid_dict = {}

In [0]:
word_centroid_dict = dict(zip(list(word_vec_dict.keys()), list(kmeans.fit_predict(word_vectors))))

In [0]:
centroid_words_dict = {}

In [22]:
closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, word_vectors)
for i in range(num_of_clusters-1):
  your_word_vector = word_vectors[closest[i]]
  centroid_words_dict[i] = model.most_similar(positive=[your_word_vector], topn=1)[0][0]

  if np.issubdtype(vec.dtype, np.int):


In [23]:
num_of_clusters, len(word_vectors), len(closest)

(696, 2089, 696)

In [24]:
centroid_words_dict.get(word_centroid_dict.get('light'))

'coalfired'

In [25]:
def replace_words(text):
    repalced_words = []
    sent_token = sent_tokenize(text)
    word_token = [word_tokenize(sent) for sent in sent_token]
    for token in word_token[0]:
        replaced_word = centroid_words_dict.get(word_centroid_dict.get(token))
        if(replaced_word is None):
            replaced_word = token
        repalced_words.append(replaced_word)
    return repalced_words

replaced_words_with_embeddings = []
tweets = processed_df['clean_text']
# for i in range(1):
#     replaced_words_with_embeddings.append(replace_words(tweets[i]))
for tweet in tweets:
    replaced_words_with_embeddings.append(" ".join(replace_words(tweet)))
processed_df['replaced_words_by_word_embedding'] = replaced_words_with_embeddings
processed_df.head()

Unnamed: 0,date,text,clean_text,replaced_words_by_word_embedding
0,2020-03-09 13:49:46+00:00,One need World to make #MAGA .\nWe need #Paris...,one need world make maga need parisagreement m...,one want world bring jamal want parisagreement...
1,2020-03-09 13:26:18+00:00,SLOW DOWN\nGlobal #CO2 emissions from the #pow...,slow global co2 emission power sector fell 2 l...,fast global co2 emission electricity market fe...
2,2020-03-09 13:21:51+00:00,RT @UNDPClimate: This Wednesday 11 March Switz...,undpclimate wednesday 11 march switzerland amp...,undpclimate wednesday 11 march poland amp ugan...
3,2020-03-09 13:21:42+00:00,RT @UNDPClimate: This Wednesday 11 March Switz...,undpclimate wednesday 11 march switzerland amp...,undpclimate wednesday 11 march poland amp ugan...
4,2020-03-09 13:20:13+00:00,This Wednesday 11 March Switzerland &amp; Ghan...,wednesday 11 march switzerland amp ghana share...,wednesday 11 march poland amp uganda itc progr...


#Score Generation

In [35]:
def get_wordnet_pos(word):
    '''
    Convert the PennTreebank tags to simple Wordnet tags
    '''
    tag = pos_tag([word])[0][1][0].upper()

    tag_dict = {"J": wn.ADJ,
                "N": wn.NOUN,
                "V": wn.VERB,
                "R": wn.ADV}

    return tag_dict.get(tag, wn.NOUN)


def sentimentAnalyser(text):
    '''
    This Function generates normalized sentiment score and a sentiment category to the tweet.
        Score is normalized for fairer comparison.
    '''
    # to count no. of words which bear scores
    num_of_words = 0
    sentiment_score = 0
    final_score = 0

    sent_token = sent_tokenize(text)
    word_token = [word_tokenize(sent) for sent in sent_token]
    word_pos_tag = [pos_tag(word) for word in word_token]

    for i in range(len(word_pos_tag)):
        for word, tag in (word_pos_tag[i]):
            wn_tag = get_wordnet_pos(tag)

            synsets = wn.synsets(word, pos=wn_tag)

            if not synsets:
                continue

            word_sent_score = 0
            synset_count = 0
            # Calculate the aveage sentiment score of first 5 synsets of given word
            for synset in synsets:
                synset_count += 1
                swn_synset = swn.senti_synset(synset.name())
                word_sent_score += swn_synset.pos_score() - swn_synset.neg_score()
                num_of_words += 1
                if(synset_count==5):
                    continue

            sentiment_score += word_sent_score
    if(num_of_words==0):
      normalized_score = 0
    else:
      normalized_score = round(sentiment_score/num_of_words, 3)

    # Classify reviews according to setiment score assigned
    # 1 : positive, 0 : neutral, -1 : negative
    if normalized_score > 0:
        final_score = 1
    elif normalized_score < 0:
        final_score = -1
    elif normalized_score == 0:
        final_score = 0

    return final_score


if __name__ == "__main__":
    processed_df
    processed_tweets = processed_df['clean_text']
    scores = []
    for tweet in processed_tweets:
        scores.append(sentimentAnalyser(tweet))
    processed_df['score'] = scores
    # tweet = '''next establish question use question mark liberally indicate question think face indicate not understand 
    # say question compose 3040 communication critical emoji discovery
    # '''
    # tweet_with_score = sentimentAnalyser(tweet)
    # print(tweet_with_score)
    processed_word_embedding_tweets = processed_df['replaced_words_by_word_embedding']
    word_embedding_scores = []
    for tweet in processed_word_embedding_tweets:
        word_embedding_scores.append(sentimentAnalyser(tweet))
    processed_df['word_embedding_scores'] = word_embedding_scores
    print("Executed Successfully")

Executed Successfully


In [36]:
processed_df.head()

Unnamed: 0,date,text,clean_text,replaced_words_by_word_embedding,score,word_embedding_scores
0,2020-03-09 13:49:46+00:00,One need World to make #MAGA .\nWe need #Paris...,one need world make maga need parisagreement m...,one want world bring jamal want parisagreement...,-1,1
1,2020-03-09 13:26:18+00:00,SLOW DOWN\nGlobal #CO2 emissions from the #pow...,slow global co2 emission power sector fell 2 l...,fast global co2 emission electricity market fe...,-1,1
2,2020-03-09 13:21:51+00:00,RT @UNDPClimate: This Wednesday 11 March Switz...,undpclimate wednesday 11 march switzerland amp...,undpclimate wednesday 11 march poland amp ugan...,1,1
3,2020-03-09 13:21:42+00:00,RT @UNDPClimate: This Wednesday 11 March Switz...,undpclimate wednesday 11 march switzerland amp...,undpclimate wednesday 11 march poland amp ugan...,1,1
4,2020-03-09 13:20:13+00:00,This Wednesday 11 March Switzerland &amp; Ghan...,wednesday 11 march switzerland amp ghana share...,wednesday 11 march poland amp uganda itc progr...,1,1


In [0]:
processed_df_out = open("/content/drive/My Drive/dataScience/projects/twitterSentimentAnalysis/dataset/processed_df.pickle","wb")
pickle.dump(processed_df, processed_df_out)

In [0]:
processed_df_in = open("/content/drive/My Drive/dataScience/projects/twitterSentimentAnalysis/dataset/processed_df.pickle", "rb")
processed_df = pickle.load(processed_df_in)

In [0]:
processed_df.to_csv("/content/drive/My Drive/dataScience/projects/twitterSentimentAnalysis/dataset/results.csv", index=False)

In [37]:
for n in range(0,100,20):
    print("date:", processed_df["date"][n] )
    print("original_tweet:", processed_df["text"][n])
    print("processed_tweet:", processed_df["clean_text"][n])
    print("replaced_centroids_tweet:", processed_df["replaced_words_by_word_embedding"][n])
    print("sentiment_score:", processed_df["score"][n])
    print("sentiment_score_with_embeddings:", processed_df["word_embedding_scores"][n],'\n')

date: 2020-03-09 13:49:46+00:00
original_tweet: One need World to make #MAGA .
We need #ParisAgreement to make Earth Great.
@UNEP @UN @IPCC_CH @SciNetUCS… https://t.co/BJhy83Oyaf
processed_tweet: one need world make maga need parisagreement make earth great unep un ipcc ch scinetucs
replaced_centroids_tweet: one want world bring jamal want parisagreement bring planet good unep schwarzenegger ipcc h scinetucs
sentiment_score: -1
sentiment_score_with_embeddings: 1 

date: 2020-03-09 11:36:09+00:00
original_tweet: RT @PinkPetro: "Millions of minds and souls insisted upon a single, solitary truth: the #ParisAgreement is historic, essential, and here to…
processed_tweet: pinkpetro million mind soul insist upon single solitary truth parisagreement historic essential
replaced_centroids_tweet: pinkpetro million actually ndc argue one another one indeed parisagreement historic crucial
sentiment_score: 1
sentiment_score_with_embeddings: -1 

date: 2020-03-09 09:15:46+00:00
original_tweet: RT @Ju

In [45]:
length = len(processed_df)
count = 0
for i in range(len(processed_df)):
    if(processed_df['score'][i] != processed_df['word_embedding_scores'][i]):
        count+=1
print("total tweets: {} same score: {} different score: {}".format(length, length - count, count))

total tweets: 3456 same score: 2307 different score: 1149
