In [None]:
#import libraries
import os
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
from wordcloud import WordCloud

import spacy
from html import unescape
from emoji import UNICODE_EMOJI
from sklearn.feature_extraction.text import CountVectorizer

import pickle
import scipy.sparse as sparse

In [None]:
# Directory layout used throughout the notebook (paths are relative to the working directory).
SENTIMENT140_DATA_DIR = 'Sentiment140.data' # sentiment 140 data set saved here
DG_DATA_DIR = 'D_G data' # D&G data set saved here
OUTPUT_DIR = 'output' # intermediate output and models saved here
FIGURES_DIR = 'figures' # figures saved here

# The notebook writes into OUTPUT_DIR and FIGURES_DIR later on; create them up front so a
# fresh checkout survives "Restart & Run All" instead of failing on the first save.
for _writable_dir in (OUTPUT_DIR, FIGURES_DIR):
    os.makedirs(_writable_dir, exist_ok=True)

# Read in data with predictions

In [None]:
# Build the CSV paths first, then load each D&G tweet export (with its predictions) into a dataframe.
chopsticks_csv_path = os.path.join(DG_DATA_DIR, "dolcegabbana_chopsticks_mentions_daily_expanded_with_predictions.csv")
all_csv_path = os.path.join(DG_DATA_DIR, "dolcegabbana_mentions_daily_all_with_predictions.csv")

df_chopsticks = pd.read_csv(chopsticks_csv_path)
# the "all" export is parsed with an explicit '\n' line terminator
df_all = pd.read_csv(all_csv_path, lineterminator='\n')

# Create D&G count matrices using Count Vectorizer

In [None]:
# load the spaCy English pipeline ("en_core_web_sm"); the tokenizer helper below calls
# this `nlp` object on every tweet
nlp = spacy.load("en_core_web_sm")

In [None]:
# helper function for pre-processing/cleaning a tweet
def preprocessor(tweet):
    """Clean a raw tweet string before tokenization.

    Steps, in order:
      1. decode HTML entities,
      2. replace @mentions with the placeholder _AT_USER_,
      3. replace #hashtags with the placeholder _HASHTAG_,
      4. strip a leading "RT " retweet marker,
      5. lowercase everything (so the placeholders end up as _at_user_ / _hashtag_).

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased tweet.
    """
    # Decode HTML first: numeric entities such as "&#39;" contain a '#', so running the
    # hashtag substitution before unescaping would turn them into "&_HASHTAG_;" and the
    # original character would be lost.
    tweet = unescape(tweet)
    tweet = re.sub(r'@[A-Za-z0-9_]+', '_AT_USER_', tweet) # replace @X with _AT_USER_
    tweet = re.sub(r'#[A-Za-z0-9_]+', '_HASHTAG_', tweet) # replace #X with _HASHTAG_
    tweet = re.sub(r'^RT[\s]+', '', tweet) # remove RT (retweet) at the start of the tweet
    return tweet.lower() # make everything lowercase

# helper function for tokenization of a tweet
def tokenizer(tweet):
    """Split a (pre-processed) tweet into the lemmas used as vocabulary terms.

    The tweet is run through the module-level spaCy pipeline `nlp`. A token's lemma is kept
    when either
      * its text consists only of letters, digits, "_" or "-", it is not a stop word and it
        is longer than two characters, or
      * its text is found in UNICODE_EMOJI.

    Returns a list of lemma strings.
    """
    # NOTE(review): the layout of emoji.UNICODE_EMOJI differs between emoji releases
    # (and the name is absent from recent ones) - confirm the installed version.
    kept_lemmas = []
    for tok in nlp(tweet):
        is_plain_word = (
            re.match("^[a-zA-Z0-9_-]*$", tok.text)
            and not tok.is_stop
            and len(tok.text) > 2
        )
        if is_plain_word or tok.text in UNICODE_EMOJI:
            kept_lemmas.append(tok.lemma_)
    return kept_lemmas

In [None]:
# Pull the 'text' column of each dataframe out as a plain Python list of tweets.
corpus_chopsticks = df_chopsticks['text'].tolist()
corpus_all = df_all['text'].tolist()

In [None]:
# run count vectorizer on D&G chopsticks data set
# (custom preprocessor/tokenizer, vocabulary capped at the 2000 most frequent terms)
model_chopsticks = CountVectorizer(preprocessor=preprocessor, tokenizer=tokenizer, max_features=2000)
word_counts_chopsticks = model_chopsticks.fit_transform(corpus_chopsticks)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; use its
# replacement when it exists and fall back to the old name on older installations.
# Either way the vocabulary is kept as a plain list of strings.
if hasattr(model_chopsticks, 'get_feature_names_out'):
    tokens_chopsticks = list(model_chopsticks.get_feature_names_out())
else:
    tokens_chopsticks = model_chopsticks.get_feature_names()

In [None]:
# Persist the chopsticks count matrix (compressed .npz) and its vocabulary (.npy) in OUTPUT_DIR.
chopsticks_counts_path = os.path.join(OUTPUT_DIR, 'dg_chopsticks_count_vectorizer_output.npz')
chopsticks_tokens_path = os.path.join(OUTPUT_DIR, 'tokens_chopsticks.npy')

sparse.save_npz(chopsticks_counts_path, word_counts_chopsticks, compressed=True)
np.save(chopsticks_tokens_path, tokens_chopsticks)

In [None]:
# run count vectorizer on D&G all data set
# (custom preprocessor/tokenizer, vocabulary capped at the 2000 most frequent terms)
model_all = CountVectorizer(preprocessor=preprocessor, tokenizer=tokenizer, max_features=2000)
word_counts_all = model_all.fit_transform(corpus_all)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; use its
# replacement when it exists and fall back to the old name on older installations.
# Either way the vocabulary is kept as a plain list of strings.
if hasattr(model_all, 'get_feature_names_out'):
    tokens_all = list(model_all.get_feature_names_out())
else:
    tokens_all = model_all.get_feature_names()

In [None]:
# Persist the "all" count matrix (compressed .npz) and its vocabulary (.npy) in OUTPUT_DIR.
all_counts_path = os.path.join(OUTPUT_DIR, 'dg_all_count_vectorizer_output.npz')
all_tokens_path = os.path.join(OUTPUT_DIR, 'tokens_all.npy')

sparse.save_npz(all_counts_path, word_counts_all, compressed=True)
np.save(all_tokens_path, tokens_all)

# Load D&G count matrices

In [None]:
# Reload the saved count matrices and vocabularies from OUTPUT_DIR.
# NOTE(review): allow_pickle=True can execute code embedded in a crafted .npy file -
# only point these loads at files this notebook wrote itself.
chopsticks_counts_file = os.path.join(OUTPUT_DIR, 'dg_chopsticks_count_vectorizer_output.npz')
chopsticks_tokens_file = os.path.join(OUTPUT_DIR, 'tokens_chopsticks.npy')
word_counts_chopsticks = sparse.load_npz(chopsticks_counts_file)
tokens_chopsticks = np.load(chopsticks_tokens_file, allow_pickle=True)

all_counts_file = os.path.join(OUTPUT_DIR, 'dg_all_count_vectorizer_output.npz')
all_tokens_file = os.path.join(OUTPUT_DIR, 'tokens_all.npy')
word_counts_all = sparse.load_npz(all_counts_file)
tokens_all = np.load(all_tokens_file, allow_pickle=True)

# Separate into Positive and Negative Datasets

In [None]:
# Split each dataframe on its 'pred' column: 1 -> positive, -1 -> negative.
# (per the original note, 'pred' comes from an ensemble of classifiers: svc, rfc, lr, nb)
is_pos_all = df_all['pred'] == 1
is_neg_all = df_all['pred'] == -1
df_all_pos = df_all[is_pos_all]
df_all_neg = df_all[is_neg_all]

is_pos_chopsticks = df_chopsticks['pred'] == 1
is_neg_chopsticks = df_chopsticks['pred'] == -1
df_chopsticks_pos = df_chopsticks[is_pos_chopsticks]
df_chopsticks_neg = df_chopsticks[is_neg_chopsticks]

# Keep the index labels of the positive / negative tweets as numpy arrays; they are used
# further down to select rows of the count matrices.
df_all_pos_ind = df_all_pos.index.to_numpy()
df_all_neg_ind = df_all_neg.index.to_numpy()

df_chopsticks_pos_ind = df_chopsticks_pos.index.to_numpy()
df_chopsticks_neg_ind = df_chopsticks_neg.index.to_numpy()

# Get positive and negative word frequencies for D&G all data set

In [None]:
# Overall count of every vocabulary term, summed over all tweets.
total_freq_all = word_counts_all.sum(axis=0)

# Row-slice the count matrix into positive and negative tweets.
# NOTE(review): the row selectors are dataframe index labels, which only line up with
# matrix row positions for a default 0..n-1 index - confirm.
pos_word_counts_all = word_counts_all[df_all_pos_ind, :]
neg_word_counts_all = word_counts_all[df_all_neg_ind, :]

# Per-term totals within each sentiment class.
pos_totals_all = pos_word_counts_all.sum(axis=0)
neg_totals_all = neg_word_counts_all.sum(axis=0)

# "Positive frequency" of a term = count in positive tweets minus count in negative tweets.
pos_freq_all = (pos_totals_all - neg_totals_all).tolist()[0]
# "Negative frequency" of a term = count in negative tweets minus count in positive tweets.
neg_freq_all = (neg_totals_all - pos_totals_all).tolist()[0]

In [None]:
# create dictionaries where key is the word and value is its frequency (negative or positive)
pos_freq_list = dict(zip(tokens_all, pos_freq_all))
neg_freq_list = dict(zip(tokens_all, neg_freq_all))

# Sort the (word, frequency) pairs by frequency, highest first.
# Sorting the dict itself would iterate over its keys only, so the key function would see
# a word and x[1] would be that word's second character (raising IndexError on
# one-character tokens); sorting .items() makes x[1] the frequency.
pos_sorted = sorted(pos_freq_list.items(), key=lambda pair: pair[1], reverse=True)
neg_sorted = sorted(neg_freq_list.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
num_words_to_filter = 200 # number of words to plot in each word cloud

# Take element 0 of each of the first `num_words_to_filter` entries of the sorted lists.
# NOTE(review): this expects pos_sorted / neg_sorted to hold (word, frequency) pairs.
top_pos_words = [entry[0] for entry in pos_sorted[:num_words_to_filter]]
top_neg_words = [entry[0] for entry in neg_sorted[:num_words_to_filter]]

# Words present in both top lists.
same_words = set(top_pos_words) & set(top_neg_words)
print(len(same_words), 'words overlap between top neg and pos words', num_words_to_filter)

In [None]:
## plot word clouds

# Two panels side by side: A) positive frequencies, B) negative frequencies.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize = (20, 16))

panels = (
    (ax1, pos_freq_list, 'Positive words', 'A)'),
    (ax2, neg_freq_list, 'Negative words', 'B)'),
)
for ax, freqs, title, label in panels:
    # identical cloud settings for both panels; fixed random_state keeps the layout reproducible
    wordcloud = WordCloud(width=400,
                          height=300,
                          random_state=42,
                          max_words=200,
                          background_color='black')
    wordcloud.generate_from_frequencies(freqs) # generate from CountVectorizer frequencies
    ax.imshow(wordcloud)
    ax.set_title(title, fontsize=20)
    ax.axis("off")
    ax.text(0, 0, label, fontsize=30) # panel label

plt.tight_layout(pad=2)

# write the figure to FIGURES_DIR, then display it
plt.savefig(os.path.join(FIGURES_DIR, 'D&G_all_wordcloud_pos_vs_neg.jpg'), dpi=300)

plt.show()

In [None]:
# Print a ranked list of the top_k words with the highest positive frequency.
top_k = 25
pos_freq_list_sorted = sorted(pos_freq_list.items(), key=lambda x: x[1], reverse=True)
for rank, (word, freq) in enumerate(pos_freq_list_sorted[:top_k], start=1):
    print(f'{rank}. {word} ({freq})')

In [None]:
# Print a ranked list of the top_k words with the highest negative frequency.
neg_freq_list_sorted = sorted(neg_freq_list.items(), key=lambda x: x[1], reverse=True)
for rank, (word, freq) in enumerate(neg_freq_list_sorted[:top_k], start=1):
    print(f'{rank}. {word} ({freq})')

# Get positive and negative word frequencies for D&G chopsticks data set

In [None]:
# Overall count of every vocabulary term, summed over all tweets.
total_freq_chopsticks = word_counts_chopsticks.sum(axis=0)

# Row-slice the count matrix into positive and negative tweets.
# NOTE(review): the row selectors are dataframe index labels, which only line up with
# matrix row positions for a default 0..n-1 index - confirm.
pos_word_counts_chopsticks = word_counts_chopsticks[df_chopsticks_pos_ind, :]
neg_word_counts_chopsticks = word_counts_chopsticks[df_chopsticks_neg_ind, :]

# Per-term totals within each sentiment class.
pos_totals_chopsticks = pos_word_counts_chopsticks.sum(axis=0)
neg_totals_chopsticks = neg_word_counts_chopsticks.sum(axis=0)

# "Positive frequency" of a term = count in positive tweets minus count in negative tweets.
pos_freq_chopsticks = (pos_totals_chopsticks - neg_totals_chopsticks).tolist()[0]
# "Negative frequency" of a term = count in negative tweets minus count in positive tweets.
neg_freq_chopsticks = (neg_totals_chopsticks - pos_totals_chopsticks).tolist()[0]

In [None]:
# create dictionaries where key is the word and value is its frequency (negative or positive)
pos_freq_list = dict(zip(tokens_chopsticks, pos_freq_chopsticks))
neg_freq_list = dict(zip(tokens_chopsticks, neg_freq_chopsticks))

# Sort the (word, frequency) pairs by frequency, highest first.
# Sorting the dict itself would iterate over its keys only, so the key function would see
# a word and x[1] would be that word's second character (raising IndexError on
# one-character tokens); sorting .items() makes x[1] the frequency.
pos_sorted = sorted(pos_freq_list.items(), key=lambda pair: pair[1], reverse=True)
neg_sorted = sorted(neg_freq_list.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
num_words_to_filter = 200 # number of words to plot in each word cloud

# Take element 0 of each of the first `num_words_to_filter` entries of the sorted lists.
# NOTE(review): this expects pos_sorted / neg_sorted to hold (word, frequency) pairs.
top_pos_words = [entry[0] for entry in pos_sorted[:num_words_to_filter]]
top_neg_words = [entry[0] for entry in neg_sorted[:num_words_to_filter]]

# Words present in both top lists.
same_words = set(top_pos_words) & set(top_neg_words)
print(len(same_words), 'words overlap between top neg and pos words', num_words_to_filter)

In [None]:
## plot word clouds

# Two panels side by side: A) positive frequencies, B) negative frequencies.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize = (20, 16))

panels = (
    (ax1, pos_freq_list, 'Positive words', 'A)'),
    (ax2, neg_freq_list, 'Negative words', 'B)'),
)
for ax, freqs, title, label in panels:
    # identical cloud settings for both panels; fixed random_state keeps the layout reproducible
    wordcloud = WordCloud(width=400,
                          height=300,
                          random_state=42,
                          max_words=200,
                          background_color='black')
    wordcloud.generate_from_frequencies(freqs) # generate from CountVectorizer frequencies
    ax.imshow(wordcloud)
    ax.set_title(title, fontsize=20)
    ax.axis("off")
    ax.text(0, 0, label, fontsize=30) # panel label

plt.tight_layout(pad=2)

# write the figure to FIGURES_DIR, then display it
plt.savefig(os.path.join(FIGURES_DIR, 'D&G_chopsticks_wordcloud_pos_vs_neg.jpg'), dpi=300)

plt.show()

In [None]:
# Print a ranked list of the top_k words with the highest positive frequency.
top_k = 25
pos_freq_list_sorted = sorted(pos_freq_list.items(), key=lambda x: x[1], reverse=True)
for rank, (word, freq) in enumerate(pos_freq_list_sorted[:top_k], start=1):
    print(f'{rank}. {word} ({freq})')

In [None]:
# Print a ranked list of the top_k words with the highest negative frequency.
neg_freq_list_sorted = sorted(neg_freq_list.items(), key=lambda x: x[1], reverse=True)
for rank, (word, freq) in enumerate(neg_freq_list_sorted[:top_k], start=1):
    print(f'{rank}. {word} ({freq})')