In [10]:
import nltk
import string
import re
import numpy as np
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

In [11]:
# Fetch the two NLTK resources used below: the labelled tweet corpus read via
# `twitter_samples` and the stopword lists read via `stopwords`.
# The value of the last call is what the cell displays.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Read the raw text of every tweet from the positive and the negative sample
# file, in that order.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [14]:
# Report how many tweets each class holds: positives first, negatives second,
# one count per line.
print(len(all_positive_tweets), len(all_negative_tweets), sep='\n')

5000
5000


In [30]:
def process_tweets(tweet):
  """Clean, tokenize and stem a single tweet.

  Steps, in order:
    1. strip a leading "RT" retweet marker, hyperlinks and '#' characters
       (the hashtag word itself is kept);
    2. tokenize with ``TweetTokenizer`` configured with
       ``preserve_case=False``, ``reduce_len=True`` and ``strip_handles=True``;
    3. drop English stopwords and tokens that are a single punctuation
       character;
    4. replace every remaining token with its Porter stem.

  Args:
    tweet: the raw text of one tweet (a string).

  Returns:
    A list of stemmed tokens, in the order they appear in the tweet.
  """
  # Remove the retweet marker, URLs and the '#' of hashtags before tokenizing.
  text = re.sub(r'^RT[\s]+', '', tweet)
  text = re.sub(r'https?://[^\s\n\r]+', '', text)
  text = re.sub(r'#', '', text)

  tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
  tokens = tokenizer.tokenize(text)

  # Sets give constant-time membership tests instead of scanning the stopword
  # list once per token.
  stopwords_english = set(stopwords.words('english'))
  # A set of individual characters, NOT the raw string: `token in
  # string.punctuation` is a substring test, which silently discarded
  # multi-character tokens that happen to be contiguous in that string
  # (e.g. the emoticon '/:' was dropped while ':/' was kept).
  punctuation_marks = set(string.punctuation)

  stemmer = PorterStemmer()
  return [
      stemmer.stem(token)
      for token in tokens
      if token not in stopwords_english and token not in punctuation_marks
  ]

In [31]:
def build_freqs(tweet, ys):
  """Count how often each processed word occurs under each label.

  Args:
    tweet: an iterable of raw tweet strings, paired position by position
      with the labels in ``ys``.
    ys: the labels, one per tweet; anything ``np.ravel`` accepts (a list, a
      1-D array, or a column/row vector such as shape (m, 1)).

  Returns:
    A dict mapping ``(word, label)`` to the number of times ``word`` (as
    produced by ``process_tweets``) appears in tweets carrying ``label``.
  """
  # np.ravel always produces a 1-D array, so a single label still becomes a
  # one-element list. np.squeeze(...).tolist() collapsed that case to a bare
  # scalar, which zip() cannot iterate.
  labels = np.ravel(ys).tolist()

  freqs = {}

  # Distinct loop names keep the `tweet` parameter from being rebound
  # while it is being iterated.
  for label, text in zip(labels, tweet):
    for word in process_tweets(text):
      pair = (word, label)
      freqs[pair] = freqs.get(pair, 0) + 1

  return freqs


In [32]:
# Put the positive tweets first and the negative tweets after them, then build
# the matching label vector: a 1.0 per positive tweet followed by a 0.0 per
# negative tweet.
tweet = [*all_positive_tweets, *all_negative_tweets]
labels = np.concatenate([
    np.ones(len(all_positive_tweets)),
    np.zeros(len(all_negative_tweets)),
])

In [33]:
# Build the (word, label) -> count table over the combined tweet list and its
# label vector.
freqs_0 = build_freqs(tweet, labels)

In [34]:
# Display the frequency table; keys are (stemmed word, label) pairs and values
# are occurrence counts.
freqs_0

{('followfriday', 1.0): 25,
 ('top', 1.0): 32,
 ('engag', 1.0): 7,
 ('member', 1.0): 16,
 ('commun', 1.0): 33,
 ('week', 1.0): 83,
 (':)', 1.0): 3691,
 ('hey', 1.0): 77,
 ('jame', 1.0): 7,
 ('odd', 1.0): 2,
 (':/', 1.0): 5,
 ('pleas', 1.0): 99,
 ('call', 1.0): 37,
 ('contact', 1.0): 7,
 ('centr', 1.0): 2,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 8,
 ('assist', 1.0): 1,
 ('mani', 1.0): 33,
 ('thank', 1.0): 643,
 ('listen', 1.0): 17,
 ('last', 1.0): 47,
 ('night', 1.0): 68,
 ('bleed', 1.0): 2,
 ('amaz', 1.0): 51,
 ('track', 1.0): 5,
 ('scotland', 1.0): 2,
 ('congrat', 1.0): 21,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 2,
 ('verifi', 1.0): 2,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 69,
 ('blue', 1.0): 9,
 ('tick', 1.0): 1,
 ('mark', 1.0): 1,
 ('fb', 1.0): 6,
 ('profil', 1.0): 2,
 ('15', 1.0): 5,
 ('day', 1.0): 246,
 ('one', 1.0): 131,
 ('irresist', 1.0): 2,
 ('flipkartfashionfriday', 1.0): 17,
 ('like', 1.0): 233,
 ('keep', 1.0): 68,
 ('love', 1.0): 401,


In [37]:
# For a handful of stems, collect [stem, positive count, negative count] rows
# from the frequency table.
keys = ['happi', 'merri', 'nice', 'good', 'bad', 'sad', 'mad', 'best', 'pretti']
word_list = []

for word in keys:
  # A stem never seen under a label contributes 0. The integer labels 1 / 0
  # compare and hash equal to the float labels stored in the table's keys.
  pos = freqs_0.get((word, 1), 0)
  neg = freqs_0.get((word, 0), 0)
  word_list.append([word, pos, neg])

word_list

[['happi', 212, 25],
 ['merri', 1, 0],
 ['nice', 99, 19],
 ['good', 238, 101],
 ['bad', 18, 73],
 ['sad', 5, 123],
 ['mad', 4, 11],
 ['best', 65, 22],
 ['pretti', 20, 15]]