In [None]:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import pandas as pd
import numpy as np
nltk.download('stopwords')

def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    The raw text is stripped of ``$``-prefixed tokens (stock tickers), a
    leading ``RT`` retweet marker, hyperlinks and ``#`` characters. It is then
    tokenized with ``TweetTokenizer`` (case not preserved, handles stripped,
    ``reduce_len`` enabled). Tokens that are English stopwords or that occur
    inside ``string.punctuation`` are dropped, and every remaining token is
    replaced by its Porter stem.

    Args:
        tweet: Raw tweet text.

    Returns:
        list of str: Stemmed tokens in their original order.
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership checks instead of scanning the
    # stopword list once per token.
    stopwords_english = set(stopwords.words('english'))

    tweet = re.sub(r'\$\w*', '', tweet)      # "$" followed by word characters
    tweet = re.sub(r'^RT[\s]+', '', tweet)   # retweet marker at the very start
    # Strip only the link itself. Matching `.*` after the scheme would also
    # throw away every word that follows the URL on the same line.
    tweet = re.sub(r'https?://\S*', '', tweet)
    tweet = re.sub(r'#', '', tweet)          # keep the hashtag word, drop "#"

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so single punctuation characters are filtered out.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each label.

    Args:
        tweets: Iterable of raw tweet strings.
        ys: Labels aligned with ``tweets``: a list or an array that
            ``np.squeeze`` reduces to at most one dimension.

    Returns:
        dict: Maps ``(word, label)`` pairs to the number of times ``word``
        (as returned by ``process_tweet``) appeared in tweets with that label.
    """
    # np.squeeze turns a single label into a 0-d array whose .tolist() is a
    # bare scalar that zip() cannot iterate; atleast_1d keeps one-tweet input
    # working while leaving longer inputs untouched.
    yslist = np.atleast_1d(np.squeeze(ys)).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Quick check of the preprocessing pipeline on a short string.
# NOTE(review): the recorded output lists a second token ('buddi') that this
# input cannot produce -- the cell text looks edited after execution; re-run.
process_tweet('hello, ')

['hello', 'buddi']

In [None]:
# Toy corpus for the worked example: one sentence per entry, with a parallel
# list of labels (1 = positive, 0 = negative).
data = [
    'I love the Movie',
    'I hated the movie',
    'A great movie, a good movie',
    'Poor Acting',
    'Great Acting, a good movie',
]
label = [1, 0, 1, 0, 1]

# Training data and Naive Bayes classification of a toy example

In [None]:
# Count (stemmed word, label) occurrences over the toy corpus; the bare name
# on the last line displays the resulting table.
freq = build_freqs(data,label)
freq

{('love', 1): 1,
 ('movi', 1): 4,
 ('hate', 0): 1,
 ('movi', 0): 1,
 ('great', 1): 2,
 ('good', 1): 2,
 ('poor', 0): 1,
 ('act', 0): 1,
 ('act', 1): 1}

In [None]:
# Summary statistics for the smoothed likelihoods, derived from the
# (word, label) frequency table and the label list.

# Every distinct word seen under any label.
vocab = {word: 1 for word, _ in freq}

# Total word occurrences per class (label 0 is negative, anything else positive).
n_neg = sum(count for (_, sentiment), count in freq.items() if sentiment == 0)
n_pos = sum(count for (_, sentiment), count in freq.items() if sentiment != 0)

# Number of training sentences per class.
count_neg = sum(1 for y in label if y == 0)
count_pos = len(label) - count_neg

vocabulary = len(vocab)
n_pos, n_neg, count_pos, count_neg, vocabulary

(10, 4, 3, 2, 7)

In [None]:
# Sentence to classify with the counts gathered from the toy corpus.
Test = 'I love the movie'

In [None]:
# Start from the class priors: the share of training sentences in each class.
Prob_pos = count_pos / len(label)
Prob_neg = count_neg / len(label)

testing = process_tweet(Test)

# Add-one smoothing denominators: class word total plus vocabulary size.
pos_denominator = n_pos + vocabulary
neg_denominator = n_neg + vocabulary

# Fold in the smoothed likelihood of each token, printing the running products.
for token in testing:
    Prob_pos *= (1 + freq.get((token, 1), 0)) / pos_denominator
    Prob_neg *= (1 + freq.get((token, 0), 0)) / neg_denominator
    print(Prob_pos, Prob_neg)

print("")
verdict = (
    f'"{Test}" :is a Positive sentance'
    if Prob_pos > Prob_neg
    else f'"{Test}" :is a negative sentance'
)
print(verdict)

0.07058823529411765 0.03636363636363637
0.020761245674740483 0.006611570247933885

"I love the movie" :is a Positive sentance


# Tweet Preprocessing

In [None]:
import nltk
from nltk.corpus import twitter_samples

In [None]:
# Fetch the NLTK twitter_samples corpus, then load the tweet texts from its
# positive and negative JSON files.
nltk.download('twitter_samples')
pos_tweet = twitter_samples.strings('positive_tweets.json')
neg_tweet = twitter_samples.strings('negative_tweets.json')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [None]:
# Preview the first five positive tweets.
pos_tweet[:5]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days']

In [None]:
import pandas as pd
import numpy as np
# One label per training tweet: ones for the positive slice followed by
# zeros for the negative slice.
positive_labels = np.ones(len(pos_tweet[:100]))
negative_labels = np.zeros(len(neg_tweet[:100]))
labels = np.append(positive_labels, negative_labels)

In [None]:
# Training tweets: the first 100 positive tweets followed by the first 100
# negative tweets.
tweet = pos_tweet[:100] + neg_tweet[:100]

In [None]:
# Rebuild the (word, label) frequency table from the tweet sample; this
# rebinds `freq`, replacing the toy-corpus table.
freq = build_freqs(tweet,labels)
freq

In [None]:
# Summary statistics for the smoothed likelihoods, derived from the
# (word, label) frequency table and the label array.

# Every distinct word seen under any label.
vocab = {word: 1 for word, _ in freq}

# Total word occurrences per class (label 0 is negative, anything else positive).
n_neg = sum(count for (_, sentiment), count in freq.items() if sentiment == 0)
n_pos = sum(count for (_, sentiment), count in freq.items() if sentiment != 0)

# Number of training tweets per class.
count_neg = sum(1 for y in labels if y == 0)
count_pos = len(labels) - count_neg

vocabulary = len(vocab)
n_pos, n_neg, count_pos, count_neg, vocabulary

(672, 589, 100, 100, 667)

In [None]:
# Tweet to classify: index 105 lies outside the first 100 negative tweets
# that were used to build the frequency table.
Test = neg_tweet[105]

In [None]:
# Class priors: the share of training tweets in each class. They must be
# normalised by `labels` (the tweet label array), not by the toy-example
# list `label`, whose length of 5 produced "priors" of 100/5 = 20.
Prob_pos = count_pos/len(labels)
Prob_neg = count_neg/len(labels)

testing = process_tweet(Test)

# Multiply each prior by the add-one-smoothed likelihood of every token:
# (1 + count of the token in the class) / (class word total + vocabulary size).
# The running products are printed after each token.
for x in testing:
  Prob_pos = Prob_pos*((1+freq.get((x,1),0))/(n_pos+vocabulary))
  Prob_neg = Prob_neg*((1+freq.get((x,0),0))/(n_neg+vocabulary))
  print(Prob_pos,Prob_neg)

print("")
# The class with the larger score wins; ties fall to the negative class.
if Prob_pos > Prob_neg:
  print(f'"{Test}" :is a Positive sentance')
else:
  print(f'"{Test}" :is a negative sentance')

0.014936519790888725 0.01592356687898089
1.1154981173180527e-05 0.001026917927704978
2.4992489559030304e-08 8.176098150517341e-07
7.466016298440718e-11 6.509632285443743e-10
1.6727445030113633e-13 1.0365656505483668e-12
1.2492490687164776e-16 2.4758733691441884e-15
9.329716719316488e-20 3.942473517745523e-18

"@subharrie ohh no :(( and yeah i hope she comes back soon too" :is a negative sentance
