In [1]:
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
import re
import numpy as np
import pandas as pd
import string
import nltk
from nltk.tokenize import TweetTokenizer

In [2]:
#getting the positive and negative tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [3]:
#split data for training and testing
train_positive = positive_tweets[:4000]
test_positive = positive_tweets[4000:]

train_negative = negative_tweets[:4000]
test_negative = negative_tweets[4000:]

train_x = train_positive + train_negative
test_x = test_positive + test_negative

train_y = np.append(np.ones(len(train_positive)), np.zeros(len(train_negative)))
test_y = np.append(np.ones(len(test_positive)), np.zeros(len(test_negative)))

In [4]:
train_positive[0:10]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 'Jgh , but we have to go to Bayan :D bye',
 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing 

In [5]:
train_negative[0:10]

['hopeless for tmr :(',
 "Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
 '@Hegelbon That heart sliding into the waste basket. :(',
 '“@ketchBurning: I hate Japanese call him "bani" :( :(”\n\nMe too',
 'Dang starting next week I have "work" :(',
 "oh god, my babies' faces :( https://t.co/9fcwGvaki0",
 '@RileyMcDonough make me smile :((',
 '@f0ggstar @stuartthull work neighbour on motors. Asked why and he said hates the updates on search :( http://t.co/XvmTUikWln',
 'why?:("@tahuodyy: sialan:( https://t.co/Hv1i0xcrL2"',
 'Athabasca glacier was there in #1948 :-( #athabasca #glacier #jasper #jaspernationalpark #alberta #explorealberta #… http://t.co/dZZdqmf7Cz']

In [6]:
def clean_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    tweet = re.sub(r'\$\w*','',tweet)
    # remove retweet text "RT"
    tweet = re.sub(r'RT[\s]+','',tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet)
    #remove # sign
    tweet = re.sub(r'#','',tweet)
    #tokenizing the tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    cleaned_tweet=[]
    for word in tweet_tokens:
        if(word not in stopwords_english and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            cleaned_tweet.append(stem_word)
    return cleaned_tweet
    

In [7]:
clean_tweet(train_negative[0])

['hopeless', 'tmr', ':(']

In [8]:
def count_tweets(result, tweets, sentiment):
    #result= preexisting dictionary
    #tweets = group of tweets
    #sentiment = group of tags of the tweets supplies
    for tag,tweet in zip(sentiment,tweets):
        cleaned_tweet = clean_tweet(tweet)
        for word in cleaned_tweet:
            pair = (word,tag)
            
            result[pair] = result.get(pair,0) + 1
            
#             if pair in result:
#                 result[pair] += 1
#             else:
#                 result[pair] += 1
    return result
    

In [9]:
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [10]:
#getting the frequencies of all the words
freqs = count_tweets({},train_x,train_y)

In [11]:
freqs

{('followfriday', 1.0): 23,
 ('top', 1.0): 30,
 ('engag', 1.0): 7,
 ('member', 1.0): 14,
 ('commun', 1.0): 27,
 ('week', 1.0): 72,
 (':)', 1.0): 2847,
 ('hey', 1.0): 60,
 ('jame', 1.0): 7,
 ('odd', 1.0): 2,
 (':/', 1.0): 5,
 ('pleas', 1.0): 80,
 ('call', 1.0): 27,
 ('contact', 1.0): 4,
 ('centr', 1.0): 1,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 6,
 ('assist', 1.0): 1,
 ('mani', 1.0): 28,
 ('thank', 1.0): 504,
 ('listen', 1.0): 14,
 ('last', 1.0): 39,
 ('night', 1.0): 55,
 ('bleed', 1.0): 2,
 ('amaz', 1.0): 41,
 ('track', 1.0): 5,
 ('scotland', 1.0): 2,
 ('congrat', 1.0): 15,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 2,
 ('verifi', 1.0): 2,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 57,
 ('blue', 1.0): 8,
 ('tick', 1.0): 1,
 ('mark', 1.0): 1,
 ('fb', 1.0): 4,
 ('profil', 1.0): 2,
 ('15', 1.0): 4,
 ('day', 1.0): 187,
 ('one', 1.0): 90,
 ('irresist', 1.0): 2,
 ('flipkartfashionfriday', 1.0): 16,
 ('like', 1.0): 187,
 ('keep', 1.0): 55,
 ('love', 1.0): 335,
 

In [12]:
def train_naive_bayes(freqs, train_x, train_y):
    
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    loglikelihood = {}
    # count N_pos and N_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1]>0:
            N_pos += freqs[pair]
        else:
            N_neg += freqs[pair]
    
    D = len(train_y)
    D_pos = sum(train_y)
    D_neg = D - D_pos

    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos = freqs.get((word,1.0),0)
        freq_neg = freqs.get((word,0.0),0)
        p_w_pos = (freq_pos)+1/(N_pos+V)
        p_w_neg = (freq_neg)+1/(N_neg+V)
        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)
    return logprior, loglikelihood

In [13]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

In [14]:
len(loglikelihood)

9091

In [15]:
loglikelihood

{'dose': -10.489494712712904,
 'mixtur': -10.489494712712904,
 '61': -10.489494712712904,
 '2ish': 10.494712888195956,
 '9-11': 10.494712888195956,
 'teamzip': -10.489494712712904,
 '9pm': 1.4482526596205106e-07,
 'robinhood': -10.489494712712904,
 'tama': -10.489494712712904,
 'less': 0.6931425908643907,
 'client_amends_edit': 10.494712888195956,
 'clock': -0.6931331944252911,
 'mypark': 11.593306625193023,
 'suppos': 0.4054628253267776,
 'morn': 1.0840126948661175,
 '😘': 1.6092093435560173e-08,
 'hind': -10.489494712712904,
 '.\n.': -11.182628051963462,
 '5878e503': 10.494712888195956,
 'shock': -1.0985936887210854,
 'mag': -11.182628051963462,
 'art': 0.2231421961288511,
 'rhi': 10.494712888195956,
 'quickest': -10.489494712712904,
 'milf': -10.489494712712904,
 'context': 1.4482526596205106e-07,
 'lit': -0.6931331944252911,
 'custom': 1.386273635291035,
 'hubba': -10.489494712712904,
 'swipe': -10.489494712712904,
 'bubbl': 1.0985938818223335,
 'canon': 1.4482526596205106e-07,
 "ev

In [18]:
def sentiment_predictor(tweet, logprior, loglikelihood):
    words_in_tweet = clean_tweet(tweet)
    
    p=logprior
    
    for word in words_in_tweet:
        if word in loglikelihood:
            p += loglikelihood[word]
    if p>1:
        sentiment='positive'
    else:
        sentiment='negative'
    return sentiment

In [21]:
my_tweet = 'I am very happy today'
p = sentiment_predictor(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is positive
