<a href="https://colab.research.google.com/github/pavinduLakshan/nlp-bigram-sentiment-analysis/blob/master/NLP_Bigram_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Environment setup and data loading

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import nltk

negative_comments_df = pd.read_csv("/content/drive/My Drive/nlp/dataset/negative_comments_dataset.csv")
positive_comments_df = pd.read_csv("/content/drive/My Drive/nlp/dataset/positive_comments_dataset.csv")

In [None]:
def tokenizeSentence(row):
    tokenizer = nltk.RegexpTokenizer(r"<s>|<\/s>|\w+")
    comment = '<s> '+row["comment"] + ' </s>'
    print(comment)
    lowerCaseComment = comment.lower()
    new_words = tokenizer.tokenize(lowerCaseComment)
    return [str(word) for word in new_words]

In [None]:
negative_comments_df["comment"] = negative_comments_df.apply(tokenizeSentence, axis = 1)
positive_comments_df["comment"] = positive_comments_df.apply(tokenizeSentence, axis = 1);

<s> Mokada yako sinhalen liyanne israilaye inna eun sinhala dannwadha anunge ledawal asse atha dannathiwa wadak naththam kuburakwath kotapiyau </s>
<s> Wena ratawala prasna eun bera gani ape rate minissu beraganna balanna vesel vetharane balti nane dan jvp </s>
<s> Ambulance buyyek ehema dakke nadda? </s>
<s> Oya anunta kade yana unwa koheta hari yawanna bada boru kiyanawata hodak sedda wanna epa </s>
<s> muta kawuda acharya pattama dunne </s>
<s> Oya 5 nhe ne. pls Conta wela inna </s>
<s> Ubalath ehema ne giya aanduwen monawa hari wadak patan gatthama ohoma ne ahuwe.. </s>
<s> me kiyana widihata highway eka hadala thiyenne deshapaluwone..minissunge badu salli walin newei wage </s>
<s> devi mathiniya obatumiya ratnapure maruna korona marana 5ta mokada une? </s>
<s> Kalakanni. un Hariyata gewal asseth mask dagena innawa wage kathawa. pa#£@# </s>
<s> Munge puduma loku kamak thiyenne.. me welawe kochchara minissu duk widinawada covid nisa .. parawal hadana eka hodai eth me welawe karanna 

In [None]:
positive_train = positive_comments_df.sample(frac=0.8,random_state=200).drop(["0"],axis=1)
positive_test = positive_comments_df.sample(frac=0.2,random_state=200).drop(["0"],axis=1)
positive_train = positive_train.reset_index()
positive_test = positive_test.reset_index();

In [None]:
negative_train =negative_comments_df.sample(frac=0.8,random_state=200).drop(["0"],axis=1)
negative_test = negative_comments_df.sample(frac=0.2,random_state=200).drop(["0"],axis=1)
negative_train = negative_train.reset_index()
negative_test = negative_test.reset_index();

## Calculate Unigram and Bigram Frequencies

### Positive Unigram Frequency

In [None]:
positive_train_unigram_freq = {}

def countPositiveUnigramFrequency(row):
  fdist = nltk.FreqDist(row["comment"])
  for k, v in fdist.items():
    if k in positive_train_unigram_freq.keys():
      positive_train_unigram_freq[k] += v
    else:
	    positive_train_unigram_freq[k] = v

In [None]:
positive_train.apply(countPositiveUnigramFrequency, axis =1);

### Positive Bigram Frequency

In [None]:
positive_train_bigram_freq = {}

#Count bigram frequencies in positive and negative training datasets
def countPositiveBigramFrequency(row):
  bgs = nltk.bigrams(row["comment"])
  #compute frequency distribution for all the bigrams in the text
  fdist = nltk.FreqDist(bgs)
  for k, v in fdist.items():
    if k in positive_train_bigram_freq.keys():
      positive_train_bigram_freq[k] += v
    else:
	    positive_train_bigram_freq[k] = v

In [None]:
positive_train.apply(countPositiveBigramFrequency, axis = 1);

### Negative Unigram Frequency

In [None]:
negative_train_unigram_freq = {}

def countNegativeUnigramFrequency(row):
  fdist = nltk.FreqDist(row["comment"])
  for k, v in fdist.items():
    if k in negative_train_unigram_freq.keys():
      negative_train_unigram_freq[k] += v
    else:
	    negative_train_unigram_freq[k] = v

In [None]:
negative_train.apply(countNegativeUnigramFrequency, axis = 1);

### Negative Bigram Frequency

In [None]:
negative_train_bigram_freq = {}

#Count bigram frequencies in positive and negative training datasets
def countNegativeBigramFrequency(row):
  bgs = nltk.bigrams(row["comment"])
  #compute frequency distribution for all the bigrams in the text
  fdist = nltk.FreqDist(bgs)
  for k, v in fdist.items():
    if k in negative_train_bigram_freq.keys():
      negative_train_bigram_freq[k] += v
    else:
	    negative_train_bigram_freq[k] = v

In [None]:
negative_train.apply(countNegativeBigramFrequency, axis = 1);

## Prediction

In [None]:
# Calculate probability of being positive
def calculatePositiveProbability(comment):
  prob = 1
  V = sum(positive_train_unigram_freq.values()) - len(positive_train)

  bgs = nltk.bigrams(comment)

  for bigram in list(bgs):
    unigram_count = 0
    bigram_count = 0

    prob_item = 0

    if bigram in positive_train_bigram_freq.keys():
      bigram_count = positive_train_bigram_freq[bigram]
    if bigram[0] in positive_train_unigram_freq.keys():
      unigram_count = positive_train_unigram_freq[bigram[0]]
 
    prob_item = (bigram_count + 1) / (unigram_count + V)
    prob = prob * prob_item
    
  return prob

In [None]:
# Calculate probability of being negative
def calculateNegativeProbability(comment):
  prob = 1
  V = sum(negative_train_unigram_freq.values()) - len(negative_train)

  bgs = nltk.bigrams(comment)

  for bigram in list(bgs):
    unigram_count = 0
    bigram_count = 0

    prob_item = 0

    if bigram in negative_train_bigram_freq.keys():
      bigram_count = negative_train_bigram_freq[bigram]
    if bigram[0] in negative_train_unigram_freq.keys():
      unigram_count = negative_train_unigram_freq[bigram[0]]
 
    prob_item = (bigram_count + 1) / (unigram_count + V)
    prob = prob * prob_item

  return prob

In [None]:
def getPrediction(row):
  positive_prob = calculatePositiveProbability(row["comment"])
  negative_prob = calculateNegativeProbability(row["comment"])
  return "negative" if negative_prob > positive_prob else "positive"

In [None]:
positive_test["predicted"] = positive_test.apply(getPrediction,axis=1)

In [None]:
negative_test["predicted"] = negative_test.apply(getPrediction,axis=1);

## Model Evaluation using Perplexity

In [None]:
# Calculate unigram frequency in positive test dataset
positive_test_unigram_freq = {}

def countPositiveTestUnigramFrequency(row):
  fdist = nltk.FreqDist(row["comment"])
  for k, v in fdist.items():
    if k in positive_test_unigram_freq.keys():
      positive_test_unigram_freq[k] += v
    else:
	    positive_test_unigram_freq[k] = v

positive_test.apply(countPositiveTestUnigramFrequency,axis=1);

In [None]:
positive_test_bigram_freq = {}

#Count bigram frequencies in positive and negative training datasets
def countPositiveTestBigramFrequency(row):
  bgs = nltk.bigrams(row["comment"])
  #compute frequency distribution for all the bigrams in the text
  fdist = nltk.FreqDist(bgs)
  for k, v in fdist.items():
    if k in positive_test_bigram_freq.keys():
      positive_test_bigram_freq[k] += v
    else:
	    positive_test_bigram_freq[k] = v

positive_test.apply(countPositiveTestBigramFrequency,axis=1);

In [None]:
# Calculate unigram frequency in negative test dataset
negative_test_unigram_freq = {}

def countNegativeTestUnigramFrequency(row):
  fdist = nltk.FreqDist(row["comment"])
  for k, v in fdist.items():
    if k in negative_test_unigram_freq.keys():
      negative_test_unigram_freq[k] += v
    else:
	    negative_test_unigram_freq[k] = v

negative_test.apply(countNegativeTestUnigramFrequency,axis=1);

In [None]:
negative_test_bigram_freq = {}

#Count bigram frequencies in negative test datasets
def countNegativeTestBigramFrequency(row):
  bgs = nltk.bigrams(row["comment"])
  #compute frequency distribution for all the bigrams in the text
  fdist = nltk.FreqDist(bgs)
  for k, v in fdist.items():
    if k in negative_test_bigram_freq.keys():
      negative_test_bigram_freq[k] += v
    else:
	    negative_test_bigram_freq[k] = v

negative_test.apply(countNegativeTestBigramFrequency,axis=1);

In [None]:
# Calculate probabilities for positive test dataset
def calculatePositiveTestProbability(comment):
  prob = 1
  # total number of word tokens in positive test dataset - total number of <s>
  V = sum(positive_test_unigram_freq.values()) - len(positive_test)

  bgs = nltk.bigrams(comment)

  for bigram in list(bgs):
    unigram_count = 0
    bigram_count = 0

    prob_item = 0

    if bigram in positive_test_bigram_freq.keys():
      bigram_count = positive_test_bigram_freq[bigram]
    if bigram[0] in positive_test_unigram_freq.keys():
      unigram_count = positive_test_unigram_freq[bigram[0]]
 
    prob_item = (bigram_count + 1) / (unigram_count + V)
    prob = prob * prob_item

  return prob

In [None]:
def calculateNegativeTestProbability(comment):
  prob = 1
  # total number of word tokens in negative test dataset - total number of <s>
  V = sum(negative_test_unigram_freq.values()) - len(negative_test)

  bgs = nltk.bigrams(comment)

  for bigram in list(bgs):
    unigram_count = 0
    bigram_count = 0

    prob_item = 0

    if bigram in negative_test_bigram_freq.keys():
      bigram_count = negative_test_bigram_freq[bigram]
    if bigram[0] in negative_test_unigram_freq.keys():
      unigram_count = negative_test_unigram_freq[bigram[0]]
 
    prob_item = (bigram_count + 1) / (unigram_count + V)
    prob = prob * prob_item

  return prob

In [None]:
import numpy as np
import math

## calculate the log of the bigram probability of each word in the positive and negative test sets
total_positive_prob_log = 0
total_negative_prob_log = 0

for comment in positive_test["comment"]:
  p_prob = calculatePositiveTestProbability(comment)
  total_positive_prob_log = total_positive_prob_log + np.log(p_prob)

for comment in negative_test["comment"]:
  n_prob = calculateNegativeTestProbability(comment)
  total_negative_prob_log = total_negative_prob_log + np.log(n_prob)

print(total_positive_prob_log,total_positive_prob_log)

-2169.697980183959 -2169.697980183959


In [None]:
positive_e_power = total_positive_prob_log / sum(positive_test_unigram_freq.values())
negative_e_power = total_negative_prob_log / sum(negative_test_unigram_freq.values())

In [None]:
# Perplexity on test data
positive_perplexity = 1 / math.exp(positive_e_power)
negative_perplexity = 1 / math.exp(negative_e_power)
print(positive_perplexity,negative_perplexity)

140.10111607053304 139.086106422674
