# **CS 557 - Assignment 3 (Detecting Spelling Errors, Minimum Edit Distance, Human Morphology and Sentiment Analysis with Naïve Bayesian Classification)**

#### **Group 3 Members:**
#### Parth Parab - CWID 10444835

#### Sejal Vyas - CWID 10450395

#### Shiwani Deo - CWID 10454959

# **1. Respond to J&M 2nd Exercises 3.10 and 3.11**

### **3.10) Add an option to your program to generate random sentences.**

In [None]:
import nltk, random, re, string, collections
nltk.download('abc') #Using abc corpus for N-gram computation
nltk.download('punkt')
nltk.download('stopwords')
from nltk.util import bigrams,trigrams
from nltk.corpus import abc, stopwords
from collections import Counter, defaultdict

[nltk_data] Downloading package abc to /root/nltk_data...
[nltk_data]   Package abc is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#We will be using trigrams to generate random sentences
def computeProbabilities():
  model = defaultdict(lambda: defaultdict(lambda: 0)) #Nested dictionary

  for sentence in abc.sents():
      for word1, word2, word3 in trigrams(sentence, pad_right=True, pad_left=True): #Create Trigrams
          model[(word1, word2)][word3] += 1

  for wordTuple in model: #Compute probabilities of the trigrams
      wordCount = float(sum(model[wordTuple].values()))
      for word in model[wordTuple]:
          model[wordTuple][word] /= wordCount
  return model

def generateSentence(noOfSentences=5):
  model = computeProbabilities()
  sentences = []
  sentenceCount = 0
  sentenceFinished = False
  sentences = []
  for i in range(0, noOfSentences):
    paragraph=[None, None]
    sentenceFinished = False
    while not sentenceFinished:
      randomProbability = random.uniform(0.6, 1.0) #Generate random probabilities between 0.6 and 1.0
      probability = 0

      for word in model[tuple(paragraph[-2:])].keys(): #Create trigram using the last 2 words from the generated text and a word in the model
          probability += model[tuple(paragraph[-2:])][word]
          if probability >= randomProbability: 
              paragraph.append(word)
              break

      if paragraph[-2:] == [None, None]: #If next words are 'None', sentence has ended. Append current line to sentences[] 
          sentenceFinished = True
          sentences.append(paragraph)

  finalParagraph = ""
  for i in range (0, len(sentences)):
    finalParagraph = finalParagraph + ' '.join([word for word in sentences[i] if word]) #Ignore words that are 'None' and create sentences from array of words

  return finalParagraph                                  
  
generateSentence(10) #Parameter is an integer representing the number of sentences you want to generate. Default number = 5





---



### **3.11) Add an option to your program to compute the perplexity of a test set.**

In [None]:
#Perplexity using Unigrams
#Corpus used: 'abc'

def computeUnigramProbabilities():
  model = collections.defaultdict(lambda: 0.01)

  for word in abc.words(): #Compute probabilities of the Unigram
    model[word] += 1
  
  wordCount = float(sum(model.values()))

  for word in model: #Compute probabilities of the unigrams
    model[word] /= wordCount
  return model

def unigramPerplexity(testData): #testData is a string (sentences)
  model = computeUnigramProbabilities();
  
  testData = testData.split(' ')
  perplexity = 1

  for word in testData:
    perplexity = perplexity*(1/model[word]) #Calculate P(w1 w2....wn)
  perplexity = pow(perplexity, 1/len(testData))
  return perplexity


print("The perplexity of the test set is:",unigramPerplexity("This is a test sentence for the abc corpus."))

The perplexity of the test set is: 404.85380039594196




---



### **Q2. Find Python packages that apply Bayesian logic to classification and apply one to sentiment data**

The following packages exist for Naïve Bayes classifier in python:
- NLTK classifier, scikit-learn classifier

In [None]:
#Dataset used: https://www.cs.jhu.edu/~mdredze/datasets/sentiment/
#Type of Data: Unprocessed data

#Instructions: Upload attached two XML files 'positive.review' and 'negative.review' for the training data

from bs4 import BeautifulSoup
import string

negRev = []
posRev = []
stopWords = set(stopwords.words('english')) 

#Get contents of negative review file
f = open("/content/negative.review", "r", encoding="ISO-8859-1")
soup = BeautifulSoup(f)
for review in soup.find_all('review_text'):
  negRev.append(review.text)

#Get contents of positive review file
f = open("/content/positive.review", "r", encoding="ISO-8859-1")
soup = BeautifulSoup(f)
for review in soup.find_all('review_text'):
  posRev.append(review.text)  

def isStopWord(word): #Check if input word is a stop word
  return word in stopWords

def processReviews(reviews, label):
  documents = []
  allWords = []
  reviewWords = []
  for line in reviews:
    wordList = line.split(" ") #Tokenize
    for word in wordList:
      word = word.strip('\n') #Remove trailing new line character
      word = re.sub(r'[^\w\s]','',word) #Remove punctuation
      if(not isStopWord(word)): #Filter out stop words
        reviewWords.append(word)
        allWords.append(word)
    
    documents.append((reviewWords, label))
  return documents, allWords

def trainData():
  negDocuments,allWords = processReviews(negRev, "neg")
  posDocuments,posWords = processReviews(posRev, "pos")
  negDocuments.extend(posDocuments) #Combine both negative and positive reviews
  allWords.extend(posWords)
  reviews = negDocuments 
  random.shuffle(reviews) #Shuffle all the reviews

  freqWord = nltk.FreqDist(word.lower() for word in allWords)
  wordFeatures = list(freqWord.keys())[:2000]

  def documentFeatures(document):
    documentWords = set(document) #Searching in a set is faster so convert to set
    features = {}
    for word in wordFeatures:
      features['contains({})'.format(word)] = (word in documentWords) #Create bag of words 
    return features

  featureSets = [(documentFeatures(review), label) for (review, label) in reviews] #Create feature set using bag of words
  trainSetLength = int(len(featureSets)*0.75)
  testSetLength =  int(len(featureSets)*0.25)
  trainSet, testSet = featureSets[trainSetLength:], featureSets[:testSetLength]
  classifier = nltk.NaiveBayesClassifier.train(trainSet) #Train Classifier

  classifier.show_most_informative_features(5) #Display top 5 most informative features as interpreted by the classifier

  print("\nAccuracy is: ",nltk.classify.accuracy(classifier, testSet)) #Accuracy is 100% since it's trained on the same vocab

trainData()

Most Informative Features
         contains(stars) = True              neg : pos    =      1.0 : 1.0
    contains(advertised) = True              neg : pos    =      1.0 : 1.0
          contains(does) = False             neg : pos    =      1.0 : 1.0
         contains(price) = True              neg : pos    =      1.0 : 1.0

Accuracy is:  1.0




---


### **Q3. Briefly report on SentiWordNet and how it can be used with Python and WordNet to classify a corpus of reviews**


#### **Introduction**
SentiWordNet 3.0 is a resource for sentiment classification that utilizes synsets of Wordnet and assigns a numerical score of either positivity, negativity, or objective (neutrality). 

```
e.g: The synset [estimable] has an objective score of 1.0, positive = 0.0, and negative = 0.0
```
One of the major differences between SentiWordNet version 1.0 and version 3.0 is that the semi-supervised learning algorithm used in version 1.0, for annotation, is just one of the steps in version 3.0, thus, making it more accurate.

Steps involved in annotation generation process for SentiWordNet 3.0:

**a) Step 1: Semi-Supervised Learning Step**
- **Seed set expansion:** There are two sets, each having all the synsets containing 7 positive and 7 negative terms respectively. These sets are then expanded with a certain radius 'k'. This implies that synsets that are within distance 'k' from the original seed sets members are added to the same.
- **Classifier Training:** The expanded seed sets in the previous set are used as the training set. If the classifier needs to predict an 'Object' label, another set of synsets with the *Obj* property is also included in the training. The only difference is, glosses of synsets are used for training instead of the synsets themselves.
- **Synset Classification:** This step is straightforward, all the WordNet synsets are classified as either *Pos*, *Neg*, or *Obj* by using the trained classifier in step 2.
- **Classifier Combination:** As mentioned in step 1, we create the synsets using the 'k' parameter. We can train the classifier in step 2 with different synsets depeneding upon the value of 'k'. Hence, in this step, we create a committee of ternary classifiers, each having a different combination of 'k' and learning algorithms.

**b) Step 2: Random-walk Step**
- This consists of a graph that runs iteratively using a random-walk. In the graph, a direct link exists only if synset s1 occurs in the gloss of synset s2. Two random-walk processes are executed, one for positive synsets, and one for negative synsets. First the pos and neg scores are fitted using the function $F_{Pos}(x) = a*x^b$ and similarly $F_{Neg}(x) = c*x^d$.
Then the final values are determined with a resulting function. Once they are computed, the *Obj(s)* values are assigned such that the sum is 1. 

**Note:** If *Pos(s) + Neg(s) > 1*, the values are normalized.

Testing has been done on both SentiWordNet 3.0 and SentiWordNet 1.0 to evaluate how it ranks positivity/negativity of the synsets in Micro-WN(Op)-3.0. Results state that SentiWordNet 3.0 is substantially more accurate than SentiWordNet 1.0 with an approximate 20% improvement in ranking positivity, and a 22% improvement in ranking negativity.

---

#### **Usage of SentiWordNet with Python and WordNet:**

Reference - [Reviews Classification Using SentiWordNet Lexicon](https://www.researchgate.net/publication/267249616_Reviews_Classification_Using_SentiWordNet_Lexicon
)

As dicussed above, SentiWordNet is a lexicon that assigns to each synset of WordNet three numerical scores - positivity, negativity, and objectivity (neutrality). Such lexicons that assign sentiment polarity are known as opinion lexicons. This can be used to classify a corpus of reviews efficiently. An important approach in detecting sentiment is to use a dictionary of opinionated terms.

WordNet keeps track of the senses of a word. For example, there are 4 senses of the noun 'good', 21 senses of the adjective 'good', and 2 senses of the adverb 'good' in WordNet. The word sentiment interpretation is done by computing the number of times the 'word#sense' entry is positive as compared to negative along with the total number of entries. This helps determine the sentiment polarity. 

The following is a SentiWordNet Fragment:
```
Category | WNT Number |  pos  |  neg  | Synonyms
-------- | ---------- | ----- | ----- | --------
    A    | 01123148   | 0.875 | 0     | good#1
    A    | 00106020   | 0     | 0     | good#2 full #6
    A    | 01125429   | 0     | 0.625 | bad#1
    A    | 01510444   | 0.25  | 0.25  | big#3 bad #2
```
**Technique to use SentiWordNet in Review Classification:**

- **SentiWordNet Interpretation Phase:** As demonstrated by the table above, each word has multiple senses, this makes identifying positivity/negativity of a corpus difficult. Hence the first step is to perform word sense disambiguation(WSD). 
- **Sentiment Polarity Phase:** We need to take into account the magnitude of the positive and negative scores assigned by SentiWordNet.

**Implementation:**

- Dataset used: [Amazon Product Review Set](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)

Before we use SentiWordNet for classification, we need to perform the following:
- Tokenization - Split the text into tokens for processing
- Sentence Splitting - Segment the text into sentences for the tagger
- Speech tagging - Produce a part-of-speech tag on each word or symbol

Once the above steps are completed, we can proceed with the classifier.

There are 3 methods we can use:
1. **Term Counting** - Sentiment polarity is determined by calculating the positive and negative scores. Then, the sentiment is assigned by categorizing the review based on the highest score. 
2. **Sum on Review** - The magnitude of scores is taken into account in this method, thus, the summation of the positive and negative scores for each term in the review is calculated and the sentiment is assigned accordingly.
3. **Average on Sentence and Average on Review** - The average of positive and negative scores for each term in a sentence in a review is computed. Then, the average of the positive and negative scores of a sentence is computed. The sentiment is assigned based on which score has the highest value.

The results of executing the above 3 methods on the corpus is as follows:
```
                Method                    | Accuracy (%)
----------------------------------------- | ---------- | 
Term Counting                             | 56.77
Sums on Review                            | 67.00
Average on Sentence and Average on Review | 68.63
```

`from nltk.corpus import wordnet`

`from nltk.corpus import sentiwordnet `

The above modules in Python are available to carry out sentiment classification using SentiWordNet and WordNet.





---



### **Q4. (Exercise 4.3 from BKL) Train two models, multinominal naive Bayes and binarized naive Bayes, both with add-1 smoothing, on the following document counts for key sentiment words, with positive or negative class assigned as noted.**

In [None]:
#Reference - https://kenzotakahashi.github.io/naive-bayes-from-scratch-in-python.html

import numpy as np
import pandas as pd

sentence = "A good, good plot and great characters, but poor acting." #Test sentence
document = pd.DataFrame([[3,0,3, 'positive'], [0,1,2, 'positive'],[1,3,0, 'negative'],[1,5,2, 'negative'],[0,2,0, 'negative']], columns = ['good', 'poor', 'great', 'class'])
classID = {'positive': 1, 'negative': 0}
document['ID'] = document['class'].map(classID)

class mainNaiveBayes(object):
  def __init__(self, alpha=1.0): #Add-1 smoothing
    self.alpha = alpha

  def fitModel(self, x, y):
    sampleCount = x.shape[0] #Get number of samples
    separated = [[freq for freq, token in zip(x, y) if token == sClass] for sClass in np.unique(y)] #Grouping by class
    self.computeLog = [np.log(len(freq)/sampleCount) for freq in separated] #Calculate log probability of each word (Count/length)
    count = np.array([np.array(i).sum(axis=0) for i in separated]) + self.alpha
    self.featureProb = np.log(count/count.sum(axis=1)[np.newaxis].T)
    return self
  
  def predictLog(self, X):
    return [(self.featureProb * freq).sum(axis=1) + self.computeLog for freq in X] #Predict log probability of each class
  
  def predict(self, X):
    return np.argmax(self.predictLog(X), axis=1) #Pick maximum value of log probability

def countFeatures(sentence):
  goodCount = sentence.count("good") 
  poorCount = sentence.count("poor") 
  greatCount = sentence.count("great") 
  sentenceCount = np.array([[goodCount, poorCount, greatCount]])
  return sentenceCount

def getPrediction(prediction):
  if prediction==1:
    return "Positive"
  else:
    return "Negative"

def multinomialNaiveBayes():
  print("\nDataset for Multinomial Naive Bayes: \n",document)
  wordCount = document.iloc[:, :-2].values #Extract the frequences from the document
  labels = document.iloc[:, -1].values #Extract the classes(labels)
  classifier = mainNaiveBayes().fitModel(wordCount, labels) #Fit the model
  testSet = countFeatures(sentence) #Create feature set from the test sentence
  print("\nThe feature test set is: ", testSet)
  predictedSentiment = classifier.predict(testSet)
  print("Test Sentence: ", sentence)
  print("Sentiment using Multinomial Naïve Bayes - ", getPrediction(predictedSentiment))


def binarizedNaiveBayes():
  for row in range(document.shape[0]):
    for col in range(document.shape[1] - 2):
      if(document.iloc[row, col] > 0): 
        document.iloc[row, col] = 1 #Since frequency does not matter in binarized Naive Bayes, set values to either 0 or 1
  print("\nDataset for Binarized Naive Bayes: \n",document)
  wordCount = document.iloc[:, :-2].values #Extract the frequences from the document
  labels = document.iloc[:, -1].values #Extract the classes(labels)
  classifier = mainNaiveBayes().fitModel(wordCount, labels) #Fit the model
  testSet = countFeatures(sentence) #Create feature set from the test sentence
  print("\nThe feature test set is: ", testSet)
  predictedSentiment = classifier.predict(testSet)
  print("Test Sentence: ", sentence)
  print("Sentiment using Binarized Naïve Bayes - ", getPrediction(predictedSentiment))

multinomialNaiveBayes()
binarizedNaiveBayes()



Dataset for Multinomial Naive Bayes: 
    good  poor  great     class  ID
0     3     0      3  positive   1
1     0     1      2  positive   1
2     1     3      0  negative   0
3     1     5      2  negative   0
4     0     2      0  negative   0

The feature test set is:  [[2 1 1]]
Test Sentence:  A good, good plot and great characters, but poor acting.
Sentiment using Multinomial Naïve Bayes -  Positive

Dataset for Binarized Naive Bayes: 
    good  poor  great     class  ID
0     1     0      1  positive   1
1     0     1      1  positive   1
2     1     1      0  negative   0
3     1     1      1  negative   0
4     0     1      0  negative   0

The feature test set is:  [[2 1 1]]
Test Sentence:  A good, good plot and great characters, but poor acting.
Sentiment using Binarized Naïve Bayes -  Negative


**Result:** As demonstrated by the results above, the two models disagree.
The Multinomial Naïve Bayes model classifies the test sentence as positive, whereas the Binarized Naïve Bayes model classifies the test sentence as negative.