In [1]:
import re
import math
from collections import defaultdict
import nltk                           
from nltk.corpus import twitter_samples 

In [2]:
# Download the 'twitter_samples' corpus package that the following cells read from.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/mani/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Load the tweet texts from the corpus' positive and negative sample files.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
# Use every loaded tweet for building the classifier. These are aliases of the
# loaded lists (no copy is made).
positive_tweets = all_positive_tweets
negative_tweets = all_negative_tweets

In [5]:
# positive_tweets = [
#     "I am happy because I am Learning NLP",
#     "I am not sad, I am happy"
# ]

# negative_tweets = [
#     "I am sad, I am not learning NLP",
#     "I am sad, I am not happy"
# ]

##### Preprocessing the tweets

In [6]:
def preprocess_tweet(tweet):
    """Normalise a raw tweet and split it into tokens.

    The text is lower-cased and three substitutions are then applied in
    order: every character that is neither a word character nor whitespace
    is deleted, every run that starts with ``http`` and extends to the next
    whitespace is deleted, and every standalone word of one or two
    characters is deleted.

    Args:
        tweet: Raw tweet text.

    Returns:
        list[str]: The whitespace-separated tokens that remain.
    """
    # Order matters: punctuation is stripped first, so by the time the URL
    # pattern runs a link has already collapsed into a single "http..." word.
    cleanup_patterns = (
        r'[^\w\s]',      # punctuation and other symbols
        r'http\S+',      # (collapsed) URLs
        r'\b\w{1,2}\b',  # words of length one or two
    )

    text = tweet.lower()
    for pattern in cleanup_patterns:
        text = re.sub(pattern, '', text)
    return text.split()

In [7]:
# for tweet in positive_tweets:
#     print(preprocess_tweet(tweet))
# for tweet in negative_tweets:
#     print(preprocess_tweet(tweet))

In [8]:
def build_vocab_and_frequencies(positive_tweets, negative_tweets):
    """Collect the vocabulary and the per-class token counts.

    Every tweet is tokenised with ``preprocess_tweet`` and each resulting
    token is counted in the table of the class the tweet came from.

    Args:
        positive_tweets: Iterable of raw positive tweets.
        negative_tweets: Iterable of raw negative tweets.

    Returns:
        tuple: ``(vocab, word_freq_pos, word_freq_neg)`` where ``vocab`` is
        the set of all tokens seen in either class and the two frequency
        tables are ``defaultdict(int)`` instances mapping token -> count.
    """
    word_freq_pos = defaultdict(int)
    word_freq_neg = defaultdict(int)

    # Same counting procedure for both classes; positives are processed first.
    for corpus, counts in ((positive_tweets, word_freq_pos),
                           (negative_tweets, word_freq_neg)):
        for tweet in corpus:
            for token in preprocess_tweet(tweet):
                counts[token] += 1

    # Every counted token belongs to the vocabulary, so the vocabulary is
    # exactly the union of the two tables' keys.
    vocab = set(word_freq_pos) | set(word_freq_neg)
    return vocab, word_freq_pos, word_freq_neg

In [9]:
# Build the shared vocabulary and the per-class token counts from both tweet sets.
vocab, word_freq_pos, word_freq_neg = build_vocab_and_frequencies(positive_tweets, negative_tweets)

##### Calculation of conditional probabilities with Laplace (add-one) smoothing

In [10]:
def calculate_conditional_probabilities(vocab, word_freq_pos, word_freq_neg):
    """Estimate P(word | class) for both classes with add-one smoothing.

    For each word in ``vocab`` the probability is
    ``(count_in_class + 1) / (N_class + V)``, where ``N_class`` is the total
    number of counted tokens in that class and ``V`` is the vocabulary size.

    Args:
        vocab: Collection of all known words.
        word_freq_pos: Mapping word -> count in the positive class. Words
            missing from the mapping are treated as having count 0.
        word_freq_neg: Mapping word -> count in the negative class. Words
            missing from the mapping are treated as having count 0.

    Returns:
        tuple[dict, dict]: ``(P_w_pos, P_w_neg)``, each mapping every word
        in ``vocab`` to its smoothed conditional probability.
    """
    V = len(vocab)
    N_pos = sum(word_freq_pos.values())
    N_neg = sum(word_freq_neg.values())

    # The denominators do not depend on the word, so compute them once.
    pos_denominator = N_pos + V
    neg_denominator = N_neg + V

    P_w_pos = {}
    P_w_neg = {}
    for word in vocab:
        # .get() instead of indexing: works with plain dicts (no KeyError)
        # and does not insert zero-count entries into a caller's defaultdict.
        P_w_pos[word] = (word_freq_pos.get(word, 0) + 1) / pos_denominator
        P_w_neg[word] = (word_freq_neg.get(word, 0) + 1) / neg_denominator

    return P_w_pos, P_w_neg

In [11]:
# Smoothed per-class word probabilities for every word in the vocabulary.
P_w_pos, P_w_neg = calculate_conditional_probabilities(vocab, word_freq_pos, word_freq_neg)

In [12]:
# P_w_pos

In [13]:
# P_w_neg

##### Log-likelihood ratio

In [14]:
def calculate_lambda(P_w_pos, P_w_neg):
    """Compute the per-word log-likelihood ratio.

    Args:
        P_w_pos: Mapping word -> P(word | positive).
        P_w_neg: Mapping word -> P(word | negative). Must contain every key
            of ``P_w_pos``; extra keys are ignored.

    Returns:
        dict: word -> ``log(P_w_pos[word] / P_w_neg[word])`` for every word
        in ``P_w_pos``. Positive values mean the word is more likely under
        the positive class, negative values the opposite.
    """
    return {
        word: math.log(p_pos / P_w_neg[word])
        for word, p_pos in P_w_pos.items()
    }

In [15]:
# Per-word log-likelihood ratios; these are the weights summed by the classifier.
lambda_values = calculate_lambda(P_w_pos, P_w_neg)

In [16]:
# print(lambda_values)

In [17]:
def calculate_logprior(positive_tweets, negative_tweets):
    """Return the log of the ratio between the two class sizes.

    Args:
        positive_tweets: Sized collection of positive tweets.
        negative_tweets: Sized collection of negative tweets.

    Returns:
        float: ``log(len(positive_tweets) / len(negative_tweets))``; 0.0
        when both collections have the same length.

    Raises:
        ZeroDivisionError: If ``negative_tweets`` is empty.
        ValueError: If ``positive_tweets`` is empty (logarithm of zero).
    """
    class_ratio = len(positive_tweets) / len(negative_tweets)
    return math.log(class_ratio)

In [18]:
# Log ratio of the number of positive tweets to the number of negative tweets.
logprior = calculate_logprior(positive_tweets, negative_tweets)

In [19]:
# A value of 0.0 means both classes contain the same number of tweets (log(1) = 0).
print(logprior)

0.0


In [20]:
def sentiment_analysis(tweet, lambda_values, logprior):
    """Classify a tweet from the summed log-likelihood ratios of its tokens.

    The tweet is tokenised with ``preprocess_tweet``. The score is the sum of
    ``lambda_values`` for every token that has an entry (tokens without one
    are ignored), plus ``logprior``.

    Args:
        tweet: Raw tweet text.
        lambda_values: Mapping word -> log-likelihood ratio.
        logprior: Log prior ratio added once to the score.

    Returns:
        str: ``"Positive"`` if the score is above zero, ``"Negative"`` if it
        is below zero, otherwise ``"Neutral"``.
    """
    known_weights = [
        lambda_values[token]
        for token in preprocess_tweet(tweet)
        if token in lambda_values
    ]

    # Accumulate left to right starting from 0.0, adding the prior last.
    score = 0.0
    for weight in known_weights:
        score += weight
    score += logprior

    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

In [None]:
# Interactive check: read one tweet from the user (no prompt text is shown)
# and print the label the classifier assigns to it.
test_tweet = input()
result = sentiment_analysis(test_tweet, lambda_values, logprior)
print(f"Sentiment: {result}")