In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import re
from pprint import pprint
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

from collections import Counter
from wordcloud import WordCloud


import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline



In [None]:
# Load the pre-processed tweets.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.
df = pd.read_pickle('path/to/pre-processed.pkl')
# Keep only the columns used below ('Authors' is not selected). `.copy()` makes
# the selection an independent frame, so the column assignments that follow do
# not write into a slice of the loaded frame (pandas' SettingWithCopyWarning).
df = df[['Tweet', 'tokens_no_stop']].copy()

In [None]:
# Pre-Processing
# Remove links
def remove_links(tweet):
    """Return ``tweet`` with every ``http``-prefixed run of non-whitespace characters removed."""
    return re.sub(r"http\S+", "", tweet)
# Run the link stripper over every raw tweet and store the result in a new column.
strip_links = np.vectorize(remove_links)
df['tweet_text_p'] = strip_links(df['Tweet'])
def remove_links(tweet):
    """Return ``tweet`` with bare ``twitter.com...`` references removed.

    NOTE: this definition reuses the name of the ``http`` link remover defined
    earlier in the same cell and replaces it from this point on.

    The dot is escaped so that only a literal ``twitter.com`` is matched.
    Unescaped, ``.`` matches any character and would also strip text such as
    ``twitterXcom...``.
    """
    return re.sub(r"twitter\.com\S+", "", tweet)
# Second pass: strip bare twitter.com references from the already cleaned text.
strip_twitter_refs = np.vectorize(remove_links)
df['tweet_text_p'] = strip_twitter_refs(df['tweet_text_p'])

In [None]:
# Remove Twitter Handlers (@Users)
def remove_users(tweet, pattern1, pattern2):
    """Remove Twitter handles from ``tweet``.

    Every match of ``pattern1`` is deleted first, then every match of
    ``pattern2`` is deleted from the result.

    The patterns are substituted directly instead of re-using each matched
    text as a new regular expression. That fixes two problems: a short handle
    such as ``@a`` no longer strips the prefix of a longer handle like ``@ab``
    (leaving ``b`` behind), and matched text containing regex metacharacters
    is no longer reinterpreted as a pattern.

    Parameters
    ----------
    tweet : str
        Text to clean.
    pattern1, pattern2 : str
        Regular expressions, applied in this order.

    Returns
    -------
    str
        ``tweet`` without the matched substrings.
    """
    for pattern in (pattern1, pattern2):
        tweet = re.sub(pattern, '', tweet)
    return tweet
# Handles may be written with a space after the "@" ("@ user") or without one
# ("@user"); the spaced form is removed first. Raw strings keep "\w" a regex
# escape instead of an invalid string escape sequence.
df['tweet_text_p'] = np.vectorize(remove_users)(df['tweet_text_p'], r"@ [\w]*", r"@[\w]*")

In [None]:
# Remove Hashtag Symbol
# Only the '#' character is stripped; the hashtag words themselves are kept because they add value to the sentiment.
def remove_hashtags(tweet, pattern1):
    """Delete every match of the regular expression ``pattern1`` from ``tweet``.

    The pattern is substituted directly rather than re-using each matched text
    as a new regular expression. Matched text that happens to contain regex
    metacharacters (for example ``$``) is therefore removed as intended, and a
    short match can no longer strip the prefix of a longer one.

    Parameters
    ----------
    tweet : str
        Text to clean.
    pattern1 : str
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``tweet`` without the matched substrings.
    """
    return re.sub(pattern1, '', tweet)
# Strip the '#' character from every tweet (the tag word itself stays in the text).
strip_hash_symbol = np.vectorize(remove_hashtags)
df['tweet_text_p'] = strip_hash_symbol(df['tweet_text_p'], "#")

In [None]:
# Do not Remove Punctuation

# Removing punctuation has no significant impact in most cases.
# In some cases it reduces the neutrality because of words in parentheses.
# df['pre_processed_1'] = df['pre_processed'].str.replace("[^a-zA-Z#]", " ")

In [None]:
# Remove Duplicates
# Keep the first occurrence of each cleaned tweet text, then discard every row
# whose text contains "Retweeted".
df = df.drop_duplicates(subset=['tweet_text_p'], keep='first')
is_retweet = df['tweet_text_p'].str.contains("Retweeted")
df = df[~is_retweet]
df.shape

In [None]:
# Create the VADER sentiment analyzer (nltk.sentiment.vader); its
# polarity_scores() method is what sentiment_scores() calls for each tweet.
vader_analyzer = SentimentIntensityAnalyzer()

In [None]:
# Empty accumulators, one independent list per VADER score component.
negative, neutral, positive, compound = [], [], [], []

In [None]:
def sentiment_scores(df, negative, neutral, positive, compound):
    for i in df['tweet_text_p']:
        sentiment_dict = vader_analyzer.polarity_scores(i)
        negative.append(sentiment_dict['neg'])
        neutral.append(sentiment_dict['neu'])
        positive.append(sentiment_dict['pos'])
        compound.append(sentiment_dict['compound'])

In [None]:
# Score every tweet. The accumulators are emptied first so that re-running this
# cell does not append a second copy of the scores, which would make the lists
# longer than the DataFrame they are assigned to afterwards.
for scores in (negative, neutral, positive, compound):
    scores.clear()
sentiment_scores(df, negative, neutral, positive, compound)

In [None]:
# Attach the collected VADER scores to the DataFrame, one column per component.
for column, values in (
    ("negative", negative),
    ("neutral", neutral),
    ("positive", positive),
    ("compound", compound),
):
    df[column] = values

In [None]:
# Encode the overall sentiment from the compound score:
# (-1) Negative, (0) Neutral, (1) Positive
# A score of at least 0.05 is positive, a score of at most -0.05 is negative,
# and everything in between is neutral.
sentiment = [
    1 if score >= 0.05 else (-1 if score <= -0.05 else 0)
    for score in df['compound']
]
df['sentiment'] = sentiment
# Count once and reindex over all three labels so that a class without any
# tweets yields 0 instead of a KeyError on the missing label.
sentiment_counts = df['sentiment'].value_counts().reindex([-1, 0, 1], fill_value=0)
neg_tweets = sentiment_counts.loc[-1]
neu_tweets = sentiment_counts.loc[0]
pos_tweets = sentiment_counts.loc[1]

In [None]:
# Persist the DataFrame, including the score and sentiment columns added
# above, to a pickle file.
df.to_pickle('path/to/sentiment.pkl')

In [None]:
# Number of tweets by sentiment
# Pie Chart

# Draw Plot
fig, ax = plt.subplots(figsize=(10, 6), subplot_kw=dict(aspect="equal"), dpi= 80)

# Tweet counts in the order Negative (-1), Neutral (0), Positive (1).
# Reindexing over all three labels gives 0 for a class that has no tweets
# instead of raising a KeyError on the missing label.
sentiment_counts = df['sentiment'].value_counts().reindex([-1, 0, 1], fill_value=0)
data = sentiment_counts.tolist()
categories = ['Negative', 'Neutral', 'Positive']
explode = [0.05,0.05,0.05]

def func(pct, allvals):
    """Format a pie-wedge label as ``"<pct>% (<count> )"``.

    Parameters
    ----------
    pct : float
        Share of the wedge in percent.
    allvals : sequence of numbers
        All wedge values; their sum is the total that ``pct`` refers to.

    Returns
    -------
    str
        The percentage with one decimal, followed by the absolute count.

    The count is rounded to the nearest integer instead of truncated. The
    percentage is a float, so the reconstructed count can come out as e.g.
    2.9999998, which plain ``int()`` would turn into 2 rather than 3.
    """
    absolute = int(np.round(pct / 100. * np.sum(allvals)))
    return "{:.1f}% ({:d} )".format(pct, absolute)

# Draw the pie: one wedge per entry of `data`, each labelled by func() with its
# percentage and absolute count (white text), and pulled out by `explode`.
wedges, texts, autotexts = ax.pie(data, 
                                  autopct=lambda pct: func(pct, data),
                                  textprops=dict(color="w"), 
                                  colors=['#e55039', '#3c6382', '#78e08f'],
                                  startangle=140,
                                  explode=explode)

# Decoration
# The legend maps each wedge to its entry in `categories`; it is anchored via
# bbox_to_anchor relative to the axes.
ax.legend(wedges, categories, title="Sentiment", loc="center left", bbox_to_anchor=(1, 0.2, 0.5, 1))
# Make the in-wedge percentage/count labels bold.
plt.setp(autotexts, size=10, weight=700)
ax.set_title("Number of Tweets by Sentiment", fontsize=12, fontweight="bold")
plt.show()

In [None]:
# Bar chart: number of tweets per sentiment class
labels = ['Negative', 'Neutral', 'Positive']
# Counts in the order Negative (-1), Neutral (0), Positive (1). Reindexing
# over all three labels gives 0 for a class that has no tweets instead of
# raising a KeyError on the missing label.
sentiment_counts = df['sentiment'].value_counts().reindex([-1, 0, 1], fill_value=0)
freq = sentiment_counts.tolist()
index = np.arange(len(freq))

plt.figure(figsize=(8,6))
plt.bar(index, freq, alpha=0.8, color= 'black')
plt.xlabel('Sentiment', fontsize=13)
plt.ylabel('Number of Tweets', fontsize=13)
plt.xticks(index, labels, fontsize=11, fontweight="bold") 
plt.title('Number of Tweets per Sentiment', fontsize=12, fontweight="bold")
# The y axis spans the total number of tweets, so bar heights read as a share of all tweets.
plt.ylim(0, len(df['Tweet']))
plt.show()

In [None]:
# Density Plot of Overall compound score

# Neutral tweets are left out so that only negative and positive tweets are compared
data = df.loc[df["sentiment"].ne(0)]
# Draw Plot
plt.figure(figsize=(8,6), dpi= 80)
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` — confirm against the installed version
sns.kdeplot(data["compound"], shade=True, color="#3c6382", label="Overall Compound Score", alpha=.7)

# Decoration
plt.title('Density Plot of Overall Compound Score', fontsize=11, fontweight='bold')
# Vertical line at 0 separates the negative from the positive scores
plt.axvline(x=0, color='#e55039')
plt.legend()
plt.show()

In [None]:
# Word Cloud of Positive and Negative Tweets
# Lemmatization

def lemmatization(tweets, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized tweets, keeping only tokens with an allowed POS tag.

    Each element of ``tweets`` is an iterable of token strings. The tokens are
    joined with single spaces and passed through the notebook-level ``nlp``
    pipeline; the lemma of every resulting token whose ``pos_`` is in
    ``allowed_postags`` is kept.

    Returns a list with one list of lemmas per input tweet, in input order.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(words)) if token.pos_ in allowed_postags]
        for words in tweets
    ]

    
# Load the small English spaCy pipeline used by `lemmatization`.
# No pipeline component is disabled here, so the full pipeline runs per tweet.

nlp = spacy.load("en_core_web_sm")
# Do lemmatization keeping only noun, adj, vb, adv
df['lemmatized'] = lemmatization(df['tokens_no_stop'], allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
df.drop(['tokens_no_stop'], axis=1, inplace=True)
# `.copy()` makes the two subsets independent frames. They get new columns and
# are sorted in place further down, which on a plain boolean-mask selection of
# `df` triggers pandas' SettingWithCopyWarning.
df_pos = df[df['sentiment']==1].copy()
df_neg = df[df['sentiment']==(-1)].copy()

In [None]:
# Word Frequency

# Join the tweet back together
def rejoin_words(row):
    """Return the row's ``'lemmatized'`` tokens joined into one space-separated string."""
    return " ".join(row['lemmatized'])
    
# Add the space-joined lemmas as a new column. `assign` returns a new frame, so
# nothing is written in place into `df_neg`, which was created by filtering
# `df` (in-place column writes on such a selection raise SettingWithCopyWarning).
df_neg = df_neg.assign(no_stop_joined=df_neg.apply(rejoin_words, axis=1))


In [None]:
# Count how often each word occurs across all negative tweets.
# The words are flattened with a comprehension so that an empty `df_neg`
# produces an empty Series; np.concatenate raises on an empty list.
neg_tokens = [word for joined in df_neg.no_stop_joined for word in joined.split()]
word_freq = pd.Series(neg_tokens, dtype=object).value_counts()
# Turn the counts into a two-column frame: 'freq' (count) and 'word'.
word_df = word_freq.to_frame()
word_df['word'] = list(word_df.index)
word_df.reset_index(drop=True, inplace=True)
word_df.columns = ['freq', 'word']
# Remove rows 0 and 1, i.e. the two most frequent words.
# NOTE(review): why these two are excluded is not visible here — confirm the
# intended words. errors='ignore' avoids a KeyError when fewer rows exist.
word_df.drop([ 0, 1], inplace=True, errors='ignore')

In [None]:
# Take the first 20 rows of the frequency table (value_counts orders it by descending count)
top_negative = word_df.head(20)
label = top_negative['word']
freq = top_negative['freq']
index = np.arange(len(freq))

print("Unique words:", len(word_df))
plt.figure(figsize=(12,9))
plt.bar(index, freq, alpha=0.8, color= 'black')
plt.xlabel('Words', fontsize=13)
plt.ylabel('Frequency', fontsize=13)
# One tick per bar, labelled with the word and rotated so long words stay readable
plt.xticks(index, label, fontsize=11, rotation=90, fontweight="bold") 
plt.title('Top 20 Words of Negative tweets after preprocessing', fontsize=12, fontweight="bold")
plt.show()

In [None]:
# Positive Tweets
# Join the tweet back together
def rejoin_words(row):
    """Join the tokens stored under ``row['lemmatized']`` with single spaces."""
    lemmas = row['lemmatized']
    return " ".join(lemmas)

    
# Add the space-joined lemmas as a new column. `assign` returns a new frame, so
# nothing is written in place into `df_pos`, which was created by filtering
# `df` (in-place column writes on such a selection raise SettingWithCopyWarning).
df_pos = df_pos.assign(no_stop_joined=df_pos.apply(rejoin_words, axis=1))

In [None]:
# Count how often each word occurs across all positive tweets.
# The words are flattened with a comprehension so that an empty `df_pos`
# produces an empty Series; np.concatenate raises on an empty list.
pos_tokens = [word for joined in df_pos.no_stop_joined for word in joined.split()]
word_freq_pos = pd.Series(pos_tokens, dtype=object).value_counts()
# Turn the counts into a two-column frame: 'freq' (count) and 'word'.
word_df_pos = word_freq_pos.to_frame()
word_df_pos['word'] = list(word_df_pos.index)
word_df_pos.reset_index(drop=True, inplace=True)
word_df_pos.columns = ['freq', 'word']
# Remove rows 0 and 2, i.e. the most frequent and third most frequent words.
# NOTE(review): why these two are excluded is not visible here — confirm the
# intended words. errors='ignore' avoids a KeyError when fewer rows exist.
word_df_pos.drop([0,2], inplace=True, errors='ignore')

In [None]:
# Take the first 20 rows of the frequency table (value_counts orders it by descending count)
top_positive = word_df_pos.head(20)
label = top_positive['word']
freq = top_positive['freq']
index = np.arange(len(freq))

print("Unique words:", len(word_df_pos))
plt.figure(figsize=(12,9))
plt.bar(index, freq, alpha=0.8, color= 'black')
plt.xlabel('Words', fontsize=13)
plt.ylabel('Frequency', fontsize=13)
# One tick per bar, labelled with the word and rotated so long words stay readable
plt.xticks(index, label, fontsize=11, rotation=90, fontweight="bold") 
plt.title('Top 20 Words of Positive tweets after preprocessing', fontsize=12, fontweight="bold")
plt.show()

In [None]:
# WordCloud

# Join the tweet back together
def rejoin_words(row):
    """Build a single space-separated string from the ``'lemmatized'`` entry of ``row``."""
    return " ".join(row['lemmatized'])

# One long string per sentiment class: every tweet's joined lemmas, separated by spaces
all_words_pos = ' '.join(df_pos.apply(rejoin_words, axis=1))
all_words_neg = ' '.join(df_neg.apply(rejoin_words, axis=1))


In [None]:
# Settings shared by both clouds; only the colormap differs between the panels
cloud_settings = dict(width=900, height=600, random_state=21, max_font_size=110,
                      background_color='black', max_words=50)

# Left panel: positive tweets
wordcloud = WordCloud(colormap='summer', **cloud_settings).generate(all_words_pos)
plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)

plt.title('WordCloud of Positive Tweets', fontsize=14, fontweight="bold")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')

# Right panel: negative tweets
wordcloud = WordCloud(colormap='autumn', **cloud_settings).generate(all_words_neg)
plt.subplot(1, 2, 2)

plt.title('WordCloud of Negative Tweets', fontsize=14, fontweight="bold")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Stand-alone, larger word cloud of the negative tweets on a grey background
negative_cloud = WordCloud(
    width=900,
    height=600,
    random_state=21,
    max_font_size=110,
    background_color='grey',
    max_words=50,
    colormap='autumn',
)
wordcloud = negative_cloud.generate(all_words_neg)

plt.figure(figsize=(12, 8))
plt.title('WordCloud of Negative Tweets', fontsize=14, fontweight="bold")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Compound Score Distribution

# Compound scores of the positive (x1) and negative (x2) tweets
x1 = df_pos['compound']
x2 = df_neg['compound']
# One figure with two side-by-side panels and a shared super-title
plt.figure(figsize=(15,6))
plt.suptitle('Compound Score Distribution', fontsize=14, fontweight="bold")
# Chart 1: positive tweets (left panel)
plt.subplot(1,2,1)

# NOTE(review): distplot is deprecated in newer seaborn releases — confirm against the installed version
sns.distplot(x1, color="g", bins=12, hist_kws={"alpha": 0.5,"rwidth":0.8})
plt.title('Positive Tweets',fontsize=12, fontweight="bold")
plt.xlabel("Compound Score", fontsize=12)
# x axis limited to [0, 1] for the positive panel
plt.xlim([0,1])

# Chart 2: negative tweets (right panel)
plt.subplot(1,2,2)
sns.distplot(x2, color="r", bins=15, hist_kws={"alpha": 0.5, "rwidth":0.8})
plt.title("Negative Tweets",fontsize=12, fontweight="bold")
plt.xlabel("Compound Score", fontsize=12)
# x axis limited to [-1, 0] for the negative panel
plt.xlim([-1,0])

plt.show()


In [None]:
# 10 Most Positive and Negative Tweets
# 10 Most positive Tweets
# Highest compound score first; the index is renumbered from 0 after sorting
df_pos = df_pos.sort_values('compound', ascending=False).reset_index(drop=True)
df_pos.head(10)

In [None]:
# 10 Most Negative Tweets
# Lowest compound score first; the index is renumbered from 0 after sorting
df_neg = df_neg.sort_values('compound').reset_index(drop=True)

df_neg.head(10)

In [None]:
# Export both subsets as ';'-separated CSV files (negative first, then positive)
for frame, target in ((df_neg, "path/to/df_neg.csv"), (df_pos, "path/to/df_pos.csv")):
    frame.to_csv(target, sep=';')