<center>
    <h1> Natural Language Processing and Large Language Models for Research Data Exploration and Analysis
 </h1> </center>


<center> <h1> Day-1: Text Classification and Sentiment Analysis using TextBlob </h1> </center>

<center> <h2> Exercise - 02 (part - 02) </h2> </center>

<center> <h4> Raghava Mukkamala (rrm.digi@cbs.dk)  </h4> </center>


### Instructions

#### Please use Python 3 for working on the following questions.




# Exercise 02: Text Classification using NaiveBayesClassifier using NLTK

Source: https://www.nltk.org/book/ch06.html

adapted by Raghava Mukkamala


In [1]:
# !pip install prettytable

In [2]:
import nltk
from nltk.corpus import movie_reviews
import random
from prettytable import PrettyTable
import textwrap

In [3]:
nltk.download('movie_reviews')

# if you get error then you can download movie reviews by using
# nltk.download('movie_reviews') and then unpack the downloaded zip file.

print(movie_reviews.words('pos/cv957_8737.txt'))



[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


['capsule', ':', 'the', 'best', 'place', 'to', 'start', ...]


In [4]:
movie_reviews.categories()

['neg', 'pos']

In [5]:
type(movie_reviews.words('pos/cv957_8737.txt'))

## Loading and Transforming movie review documents

    Load the documents from ../nltk_data/corpora/movie_reviews and
    transform them in the following format.

    [
    ([ 'gotten', 'a', 'four', 'star', 'rating', 'out', 'of', 'me', '.'], 'pos'),
    ([ 'free', 'tickets',  'definitely', 'worth', 'checking', 'out', '.'], 'pos')
    ]


In [6]:
print ('movie_reviews.categories(): ', movie_reviews.categories())

documents = [(list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)]


random.shuffle(documents)

print('number of documents: ', len(documents))

tab = PrettyTable(['Document Features', 'Category'])

tab.horizontal_char = '-'

for (doc, cat) in documents[0:5]:
    feats = textwrap.fill(','.join(doc[:50]), width=80)
    tab.add_row([ feats, cat])
    tab.add_row([ '\n', '\n'])
#     print(cat)

print(tab)


movie_reviews.categories():  ['neg', 'pos']
number of documents:  2000
+----------------------------------------------------------------------------------+----------+
|                                Document Features                                 | Category |
+----------------------------------------------------------------------------------+----------+
|     there,was,a,huge,crowd,-,so,many,over,100,people,could,not,be,admitted,-     |   pos    |
| ,at,a,premiere,screening,of,",the,nephew,",(,first,screening,with,a,major,genera |          |
| l,audience,admittance,),.,this,was,a,movie,premiere,at,the,santa,barbara,interna |          |
|                          tional,film,festival,.,pierce                           |          |
|                                                                                  |          |
|                                                                                  |          |
| rated,:,r,for,strong,language,,,sexual,dialogue,,,drug,use,,,cr

## Generate a Frequency distribution of words

    Load all words from all the documents from the movie reviews to use
    most common words as features.

In [7]:
print('total words from movie review corpus: ', len(movie_reviews.words()))

# load all the words in freq distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

most_freq_words = all_words.most_common(2000)

print('most freq words: ', most_freq_words[100:110])

word_features = [word for (word, count) in most_freq_words]

print('word_features[:25]: ', word_features[:25])



total words from movie review corpus:  1583820
most freq words:  [('off', 1581), ('too', 1577), ('any', 1574), ('does', 1568), ('really', 1558), ('had', 1546), ('while', 1539), ('films', 1536), ('how', 1517), ('plot', 1513)]
word_features[:25]:  [',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he']


## Converting documents into training set containing features

    Extarcting features from a document and transforming them feature sets.

In [8]:
def get_document_features(document):
    """
        This function will convert given document into a feature set.

    """
    document_words = set(document)

    features = {}

    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


# test code for the above function

words_doc = movie_reviews.words('pos/cv957_8737.txt')

feat_dict = get_document_features(words_doc)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}

print('transformed document features, printing the first 25 features \n\n', feat_dict_25)

# print(documents[1][1])


transformed document features, printing the first 25 features 

 {'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True}


## Preparing training set and training Naive Bayes Classifier

In [19]:

featuresets = [(get_document_features(d), c) for (d,c) in documents]

print(len(featuresets))


train_set, test_set = featuresets[100:], featuresets[:100]

print(len(test_set))

classifier = nltk.NaiveBayesClassifier.train(train_set)


print('accuracy: ', nltk.classify.accuracy(classifier, test_set))

classifier.show_most_informative_features(20)


2000
100
accuracy:  0.8
Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.0 : 1.0
         contains(mulan) = True              pos : neg    =      9.0 : 1.0
         contains(damon) = True              pos : neg    =      7.8 : 1.0
        contains(seagal) = True              neg : pos    =      7.8 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.6 : 1.0
        contains(poorly) = True              neg : pos    =      5.7 : 1.0
        contains(wasted) = True              neg : pos    =      5.4 : 1.0
          contains(lame) = True              neg : pos    =      5.1 : 1.0
         contains(awful) = True              neg : pos    =      5.1 : 1.0
           contains(era) = True              pos : neg    =      5.0 : 1.0
         contains(flynt) = True              pos : neg    =      5.0 : 1.0
    contains(ridiculous) = True              neg : pos    =      4.9 : 1.0
         contains(waste) = True              neg :

## Testing the classifier



In [20]:
sample_review = """great."""

sample_review_doc_feats = get_document_features(sample_review.split())

# print('Sample review features: \n\n',sample_review_doc_feats)

print('result of sample review: ', classifier.classify(sample_review_doc_feats))





result of sample review:  neg




```
# This is formatted as code
```

## <font color='red'>Task - 02:</font>

    Use Reuters Corpus from nltk and build a Naive Bayes classifier for the categories of Reuters Corpus.
    Please refer to https://www.nltk.org/book/ch02.html for an example on how to access Reuters Corpus.
    Use some test documents to test the accuracy of the classifier.
    



In [25]:
# make sure that NLTK and reuters corpus is accessible

# If you get an error saying that 'Resource reuters not found.' ,
# you can download using the following code

nltk.download('reuters')
nltk.download('punkt')

from nltk.corpus import reuters

# Check how many fields in the reuters corpus to see that we have access.
print(len(reuters.fileids()))

# Check how many fileids in the reuters corpus to see that we have access.
print('number of documents: ', len(reuters.fileids()))

print('Categories: \n')
print(reuters.categories())


# Let's make a pretty table to look at the files and categories of the first 10 docs
tab = PrettyTable(['fileid', 'Category'])

index = 0

print('printing the categories for first 20 docs!')

for id in reuters.fileids():
    index += 1
    cats = textwrap.fill(','.join(reuters.categories(id)), width=40)
    tab.add_row([id, cats])
    if index == 20:
        break

print(tab)

10788
number of documents:  10788
Categories: 

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']
printing the categories for first 20 docs!

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
# Let's look at the total numebr of words
print('total number of words in the reuter corpus: ', len(reuters.words()))

# And the distirbution of the words
all_words = nltk.FreqDist(w.lower() for w in reuters.words())

most_freq_words = all_words.most_common(2000)

print('most freq words: ', most_freq_words[0:110])

word_features = [word for (word, count) in most_freq_words]

print('word_features[:25]: ', word_features[:25])

total number of words in the reuter corpus:  1720901
most freq words:  [('.', 94687), (',', 72360), ('the', 69277), ('of', 36779), ('to', 36400), ('in', 29253), ('and', 25648), ('said', 25383), ('a', 25103), ('mln', 18623), ('s', 15680), ('vs', 14341), ('for', 13782), ('-', 13705), ('dlrs', 12417), ("'", 11272), ('it', 11104), ('000', 10277), ('1', 9977), ('pct', 9810), ('on', 9244), (';', 8762), ('&', 8698), ('lt', 8696), ('cts', 8361), ('from', 8217), ('is', 7668), ('that', 7540), ('year', 7529), ('>', 7449), ('its', 7402), ('by', 7101), ('at', 7017), ('net', 6989), ('"', 6816), ('2', 6528), ('u', 6392), ('be', 6357), ('with', 6179), ('will', 5952), ('billion', 5829), ('was', 5816), ('he', 5215), ('loss', 5124), ('3', 5091), ('has', 4864), ('5', 4683), ('would', 4673), ('company', 4670), ('as', 4575), ('an', 4557), ('/', 4495), ('1986', 4392), ('not', 4389), ('4', 4363), ('shr', 4182), ('inc', 4121), ('which', 3666), ('bank', 3654), ('but', 3601), ('this', 3516), ('7', 3450), ('corp'

In [28]:
# Let's get the features from documents

def get_document_features(document):
    """
        This function will convert given document into a feature set.

    """
    document_words = set(document)

    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

documents = [(list(reuters.words(fileid)), category)
    for category in reuters.categories()
    for fileid in reuters.fileids(category)]


In [30]:
featuresets = [(get_document_features(d), c) for (d,c) in documents]

#you can experiment with different train/test splits
train_set,test_set  = featuresets[1000:3000], featuresets[:1000]



In [31]:
# YOUR SOLUTION HERE

# Instantiate the Naive Bayes Classifier



# Check the accuracy



# Check the most informative features






In [32]:
# SOLUTION

# Instantiate the Naive Bayes Classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Check the accuracy
print(nltk.classify.accuracy(classifier, test_set))

# Check the most informative features
print(classifier.show_most_informative_features(20))

0.956
Most Informative Features
          contains(palm) = True           coconu : acq    =    627.9 : 1.0
   contains(agriculture) = True           copra- : acq    =    570.8 : 1.0
       contains(farmers) = True           copra- : acq    =    570.8 : 1.0
         contains(wheat) = True           barley : acq    =    500.6 : 1.0
        contains(tonnes) = True           copra- : acq    =    479.5 : 1.0
        contains(quotas) = True           coffee : acq    =    407.7 : 1.0
  contains(agricultural) = True           copra- : acq    =    342.5 : 1.0
      contains(calendar) = True           copra- : acq    =    342.5 : 1.0
   contains(commodities) = True           copra- : acq    =    342.5 : 1.0
    contains(employment) = True           copra- : acq    =    342.5 : 1.0
         contains(fresh) = True           copra- : acq    =    342.5 : 1.0
        contains(modest) = True           copra- : acq    =    342.5 : 1.0
        contains(rubber) = True           copra- : acq    =    342.5