# Building and Visualizing word frequencies

This notebook focuses on the `build_freqs()` helper function and visualizes the dataset that is fed into it.

In [None]:
import nltk
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
import numpy as np
from utils import process_tweet, build_freqs

# To avoid SSL-related error
import ssl

# NOTE(security): when the running Python exposes ssl._create_unverified_context,
# install it as the factory for default HTTPS contexts. Contexts built by that
# factory skip certificate verification, so this is only appropriate for a
# local, throwaway notebook session.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# The tweets are read from the 'twitter_samples' corpus via twitter_samples.strings(),
# so that corpus has to be present locally as well; without it a fresh
# environment raises LookupError on the first read.
nltk.download('twitter_samples')
# Stop-word list (presumably consumed by the tweet-processing helpers in utils).
nltk.download('stopwords')

In [None]:
# Read the raw tweet strings of each sentiment class from the NLTK corpus
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Single list with every positive tweet first, followed by every negative tweet
tweets = [*all_positive_tweets, *all_negative_tweets]

print('Number of tweets: ', len(tweets))

In [None]:
# Label vector aligned with `tweets`: one 1.0 per positive tweet, then one 0.0
# per negative tweet, joined into a single 1-D array
labels = np.concatenate(
    (np.ones(len(all_positive_tweets)), np.zeros(len(all_negative_tweets)))
)
labels.shape

In [None]:
# Build the word-frequency dictionary from the tweets and their labels.
# Later cells index it with (word, label) tuples to read a count.
freqs = build_freqs(tweets, labels)

# Report the container type and how many entries it holds (one line each)
print(
    f'type(freqs) = {type(freqs)}',
    f'len(freqs) = {len(freqs)}',
    sep='\n',
)


In [None]:
# Dump the entire frequency dictionary for inspection; the lookups further down
# index it with (word, label) tuples.
print(freqs)

In [None]:
# Table of word counts for a hand-picked set of words.
# Each word is assumed to appear only once in `keys`.
keys = ['happi', 'merri', 'nice', 'good', 'bad', 'sad', 'mad', 'best', 'pretti',
        '❤', ':)', ':(', '😒', '😬', '😄', '😍', '♛',
        'song', 'idea', 'power', 'play', 'magnific']

# One row per word: [word, positive count, negative count].
# freqs is keyed by (word, label) with label 1 = positive and 0 = negative;
# a pair that is absent from freqs counts as 0.
data = [
    [word, freqs.get((word, 1), 0), freqs.get((word, 0), 0)]
    for word in keys
]

data


In [None]:
# Quick check: np.log accepts a plain Python list and returns the natural
# logarithm of each element as an array.
np.log([1, 2, 3])

In [None]:
# Log of (positive count + 1) for every row of the table; the +1 keeps words
# with a zero count finite (log(1) = 0)
np.log([row[1] + 1 for row in data])

In [None]:
# Scatter plot of the selected words: log positive count on the x axis against
# log negative count on the y axis.

fig, ax = plt.subplots(figsize=(8, 8))

# Each row of `data` is [word, positive count, negative count]. Add 1 before
# taking the log so that a zero count maps to 0 rather than -inf.
x = np.log([row[1] + 1 for row in data])
y = np.log([row[2] + 1 for row in data])

ax.scatter(x, y)

ax.set_xlabel('log positive count')
ax.set_ylabel('log negative count')

# Label every point with the word it represents
for row, x_i, y_i in zip(data, x, y):
    ax.annotate(row[0], (x_i, y_i), fontsize=12)

# Red reference diagonal from (0, 0) to (9, 9): points on it have equal
# positive and negative log counts
ax.plot([0, 9], [0, 9], color='red')
plt.show()