In [1]:
# Start writing code here...

In [60]:
!pip install prettytable
from prettytable import PrettyTable

import random, nltk, re, math
import numpy as np
from collections import Counter

nltk.download('punkt')
nltk.download('gutenberg')
from nltk import ngrams, bigrams, trigrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import PlaintextCorpusReader, gutenberg

from collections import Counter, defaultdict
from operator import mul
from functools import reduce
from decimal import Decimal
from math import log, exp


You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


### Unigram

In [6]:
# UNIGRAM
def word_frequency(corpus):
    """Compute unigram relative frequencies and raw counts for a corpus.

    Tokens are read from ``corpus.words()``. Every character outside
    ``[a-zA-Z0-9]`` is stripped from each token, and tokens that end up empty
    (e.g. pure punctuation) are dropped before counting.

    Input: corpus object exposing a ``words()`` method
    Output: tuple ``(word_freq, word_count)`` of two Counters keyed by word,
            holding the relative frequency and the raw count respectively.
            Both are empty when the corpus has no usable tokens.
    Raises: ValueError if the frequencies do not sum to approximately 1.
    """
    # strip special characters and drop tokens that become empty
    stripped = (re.sub(r'[^a-zA-Z0-9]', '', token) for token in corpus.words())
    text = [token for token in stripped if token]

    word_count = Counter(text)
    if not text:
        # nothing to normalise; avoid dividing by zero
        return Counter(), word_count

    total = float(len(text))
    word_freq = Counter({word: count / total for word, count in word_count.items()})

    # Sanity check: the frequencies form a probability distribution and must
    # sum to ~1. (The previous condition combined ">" and "<" with "and", so
    # it could never fire.)
    freq_sum = sum(word_freq.values())
    if not 0.98 <= freq_sum <= 1.01:
        raise ValueError('Error: the sum of frequencies is not close to 1.')

    return word_freq, word_count

### Bigram

In [12]:
#BIGRAM
def create_bigram(corpus):
    """Build a bigram model P(w2 | w1) from the sentences of a corpus.

    Every token of every sentence in ``corpus.sents()`` is stripped of
    non-alphanumeric characters; tokens that become empty are dropped. Each
    sentence is padded on both sides, so ``None`` appears as the first and
    last element of the generated bigrams.

    Output: tuple ``(model, table)`` where ``model[w1][w2]`` is the relative
    frequency of ``w2`` among all words observed after ``w1``, and ``table``
    is a PrettyTable with one row per bigram encountered.
    """
    counts = defaultdict(lambda: defaultdict(lambda: 0))

    tab = PrettyTable()
    tab.title = "bigrams for the sample's first sentences"
    tab.field_names = ['#1','#2']

    # pass 1: collect raw bigram counts (and log every bigram in the table)
    for raw_sentence in corpus.sents():
        cleaned = [re.sub(r'[^a-zA-Z0-9]','',token) for token in raw_sentence]
        words = [token for token in cleaned if token]
        for first, second in bigrams(words, pad_right=True, pad_left=True):
            tab.add_row((first, second))
            counts[first][second] += 1

    # pass 2: turn the counts of each context into conditional probabilities
    for followers in counts.values():
        context_total = float(sum(followers.values()))
        for word in followers:
            followers[word] = 0 if context_total == 0 else followers[word] / context_total

    return counts, tab

### Trigram

In [13]:
#TRIGRAM
def create_trigram(corpus):
    """Build a trigram model P(w3 | w1, w2) from the sentences of a corpus.

    Every token of every sentence in ``corpus.sents()`` is stripped of
    non-alphanumeric characters; tokens that become empty are dropped. Each
    sentence is padded on both sides, so ``None`` fills the missing positions
    at the sentence boundaries.

    Output: tuple ``(model, table)`` where ``model[(w1, w2)][w3]`` is the
    relative frequency of ``w3`` among all words observed after the pair
    ``(w1, w2)``, and ``table`` is a PrettyTable with one row per trigram.
    """
    counts = defaultdict(lambda: defaultdict(lambda: 0))

    tab = PrettyTable()
    tab.title = "trigrams for the sample's first sentences"
    tab.field_names = ['#1','#2', '#3']

    # pass 1: collect raw trigram counts (and log every trigram in the table)
    for raw_sentence in corpus.sents():
        cleaned = [re.sub(r'[^a-zA-Z0-9]','',token) for token in raw_sentence]
        words = [token for token in cleaned if token]
        for first, second, third in trigrams(words, pad_right=True, pad_left=True):
            tab.add_row((first, second, third))
            counts[(first, second)][third] += 1

    # pass 2: turn the counts of each context into conditional probabilities
    for followers in counts.values():
        context_total = float(sum(followers.values()))
        for word in followers:
            followers[word] = 0 if context_total == 0 else followers[word] / context_total

    return counts, tab

In [14]:
from itertools import islice

# ---- unigram: train on the Gutenberg corpus and show a few entries ----
word_frequencies, word_counts = word_frequency(gutenberg)
print('\n------------------- unigram output -----------------------\n')
print("probability of the word '{0}' is {1}".format('Emma', word_frequencies['Emma']))
print(dict(islice(word_frequencies.items(), 0, 5)))


# ---- bigram: None marks the sentence boundary padding ----
bigram_model_gutenberg, bigram_tab_gutenberg = create_bigram(gutenberg)
print('\n------------------- bigram output -----------------------\n')
print("probability of the word '{0}' given '{1}' is {2}".format('world', 'the', bigram_model_gutenberg["the"]["world"]))
print("probability of the word '{0}' given '{1}' is {2}".format('lived', 'had', bigram_model_gutenberg["had"]["lived"]))
print("probability of the word '{0}' given '{1}' is {2}".format('the', 'None', bigram_model_gutenberg[None]["the"]))

print(bigram_tab_gutenberg[:20])


# ---- trigram ----
# The format string now references {2} (it used to print a literal "2"), and
# the last label matches the word that is actually looked up ("The").
trigram_model_gutenberg, trigram_tab_gutenberg = create_trigram(gutenberg)
print('\n------------------- trigram output -----------------------\n')
print("probability of the word '{0}' given '{1}' and '{2}' is {3:.7f}".format('world', 'in', 'the', trigram_model_gutenberg["in", "the"]["world"]))
print("probability of the word '{0}' given '{1}' and '{2}' is {3:.7f}".format('lived', 'had', 'nearly', trigram_model_gutenberg["had", "nearly"]["lived"]))
print("probability of the word '{0}' given '{1}' and '{2}' is {3:.7f}".format('The', 'None', 'None', trigram_model_gutenberg[None, None]["The"]))

print(trigram_tab_gutenberg[:20])



------------------- unigram output -----------------------

probability of the word 'Emma' is 0.0003937790190355324
{'Emma': 0.0003937790190355324, 'by': 0.0036431379913541406, 'Jane': 0.00013777718564407198, 'Austen': 1.3641305509314055e-06, '1816': 4.5471018364380185e-07}

------------------- bigram output -----------------------

probability of the word 'world' given 'the' is 0.006640159045725646
probability of the word 'lived' given 'had' is 0.0015710919088766694
probability of the word 'the' given 'None' is 0.0017258357613473701
+---------------------------------------+
| bigrams for the sample's first sentences |
+-------------------+-------------------+
|         #1        |         #2        |
+-------------------+-------------------+
|        None       |        Emma       |
|        Emma       |         by        |
|         by        |        Jane       |
|        Jane       |       Austen      |
|       Austen      |        1816       |
|        1816       |        None   

### Interpolation

In [58]:
def interpolate(unigram, bigram, trigram, counts, lambda_list, corpus):
    """Compute an interpolated n-gram probability for every sentence of a corpus.

    For each trigram (w1, w2, w3) of a sentence the weighted sum
    ``lambda1*P(w3|w1,w2) + lambda2*P(w3|w2) + lambda3*P(w3)`` is computed and
    the sentence probability is the product of these terms. Trigrams whose
    last element has a count of zero in ``counts`` are skipped.

    Input:
        unigram, bigram, trigram: trained models as returned by
            word_frequency / create_bigram / create_trigram; any model passed
            as None is trained on ``corpus``
        counts: word counts as returned by word_frequency; recomputed from
            ``corpus`` when None
        lambda_list: three weights [trigram, bigram, unigram]; None selects
            equal weights of 1/3 each
        corpus: object exposing ``sents()`` (and ``words()`` when models have
            to be trained)
    Output: list with one probability per sentence of ``corpus``
    """
    # train whatever was not supplied by the caller
    if unigram is None:
        unigram, counts = word_frequency(corpus)
    elif counts is None:
        _, counts = word_frequency(corpus)
    if bigram is None:  # previously tested `unigram`, so the bigram was never built
        bigram, _ = create_bigram(corpus)
    if trigram is None:
        trigram, _ = create_trigram(corpus)

    # equal weights unless the caller provides its own
    if lambda_list is None:
        lambda1, lambda2, lambda3 = 1.0/3.0, 1.0/3.0, 1.0/3.0
    else:
        lambda1, lambda2, lambda3 = lambda_list[0], lambda_list[1], lambda_list[2]

    probs_list = []
    for sentence in corpus.sents():
        # same token clean-up as used when training the models
        sentence = list(filter(None, [re.sub(r'[^a-zA-Z0-9]','',str_element) for str_element in sentence]))
        probability_of_sentence = 1.0

        # product of the interpolated probabilities over the sentence's trigrams
        for w1, w2, w3 in trigrams(sentence, pad_left=True, pad_right=True):
            if counts[w3] > 0:
                prob_tri, prob_bi, prob_uni = trigram[w1, w2][w3], bigram[w2][w3], unigram[w3]
                probability_of_sentence *= lambda1*prob_tri + lambda2*prob_bi + lambda3*prob_uni

        probs_list.append(probability_of_sentence)

    return probs_list

interpolated_probabilities = interpolate(word_frequencies, bigram_model_gutenberg, trigram_model_gutenberg, word_counts, None, gutenberg)
interpolated_probabilities2 = interpolate(word_frequencies, bigram_model_gutenberg, trigram_model_gutenberg, word_counts, [0.1, 0.2, 0.7], gutenberg)

# List indices are 0-based: index 1 is the 2nd sentence, index 5 the 6th, and
# index 10 the 11th. The labels are adjusted to match the values printed.
for label, index in (('2nd', 1), ('6th', 5), ('11th', 10)):
    print('The probability of sentence {0} being in the input corpus is: {1}'.format(label, (interpolated_probabilities[index])))

The probability of sentence 1st being in the input corpus is: 4.7060609797916105e-06
The probability of sentence 5th being in the input corpus is: 7.082434712502128e-63
The probability of sentence 10th being in the input corpus is: 2.0509775328704013e-26


### Maximise probability

In [70]:
def maximum_probability_model(unigram, bigram, trigram, counts, corpus):
    """Score random lambda combinations by the average interpolated sentence probability.

    Draws up to 20 random weight triples (rounded to one decimal, summing to
    about 1) with a fixed seed, evaluates each distinct triple with
    interpolate() and records the mean of the per-sentence probabilities.

    Input: unigram, bigram, trigram models, word counts and a text corpus;
           all are forwarded unchanged to interpolate()
    Output: dictionary mapping "l1,l2,l3" strings to the average probability
    Note: calls random.seed(32), i.e. it reseeds the global random generator.
    """
    # generate the candidate weight triples
    random.seed(32)
    candidates = []
    for _ in range(20):
        random1, random2 = round(random.uniform(0, 1),1), round(random.uniform(0, 1),1)
        # keep the pair only if a non-negative third weight remains
        if random2 < (1 - random1):
            candidates.append([random1, random2, round((1 - random1)-random2,1)])

    score_dict = {} # key: lambda combination, value: average sentence probability

    for lambdas in candidates:
        k = ','.join(map(str, lambdas)) #generates key for dict
        if k not in score_dict:
            # use the models passed in by the caller rather than notebook globals
            probabilities = interpolate(unigram, bigram, trigram, counts, lambdas, corpus)
            score_dict[k] = sum(probabilities)/len(probabilities)

    return score_dict


def get_optimal_lambdas(score_dict):
    """Pick the best-scoring entry of a {lambda combination: score} dictionary.

    Returns the tuple (key, score) with the highest score; when several
    entries share the maximum, the first one in iteration order is returned.
    """
    best_key, best_score = max(score_dict.items(), key=lambda entry: entry[1])
    return best_key, best_score


In [71]:
# evaluate the random lambda combinations on the Gutenberg models, then pick the best one
score_dict = maximum_probability_model(
    unigram=word_frequencies,
    bigram=bigram_model_gutenberg,
    trigram=trigram_model_gutenberg,
    counts=word_counts,
    corpus=gutenberg,
)
opt_lambda, opt_score = get_optimal_lambdas(score_dict=score_dict)

In [73]:
# "\n" (not "/n") produces the intended blank line after the summary
print('Optimal combination of lambda is: {0} with average probability {1}\n'.format(opt_lambda, opt_score))

for key, value in score_dict.items():
    print('For lambdas {0} average probability is {1}'.format(key, value))

optimal combination of lambda is: 0.8,0.1,0.1 with average probability0.0010263715293983948
 For lambdas 0.1,0.2,0.7 average probability is 0.0009772861404375428
 For lambdas 0.1,0.5,0.4 average probability is 0.0010012735926912607
 For lambdas 0.4,0.0,0.6 average probability is 0.0009856610552172662
 For lambdas 0.8,0.1,0.1 average probability is 0.0010263715293983948
 For lambdas 0.0,0.7,0.3 average probability is 0.0010091968823131693
 For lambdas 0.1,0.1,0.8 average probability is 0.0009693307705192458
 For lambdas 0.2,0.0,0.8 average probability is 0.0009694690803112595
 For lambdas 0.0,0.0,1.0 average probability is 0.0009533332748930593
 For lambdas 0.4,0.2,0.4 average probability is 0.0010016506577498429


### Randomly generated text from trigram

In [None]:
def generate_text_trigram(trigram_model):
    """Sample one sentence from a trigram model and return it with its probability.

    Starting from the context (None, None), the next word is repeatedly drawn
    from ``trigram_model[(w1, w2)]`` by walking the cumulative distribution
    until it reaches a uniform random threshold. Generation stops once the
    last two tokens are both None.

    Input: mapping from (w1, w2) tuples to {word: conditional probability}
    Output: tuple (text, prob) where text is the token list including the
            None padding and prob is the product of the conditional
            probabilities of the drawn words
    Raises: ValueError if, after at least one word was generated, the current
            context has no continuation in the model (this used to loop
            forever on a defaultdict model)
    """
    text = [None, None]
    prob = 1.0

    while True:
        context = tuple(text[-2:])
        continuations = trigram_model[context]
        if not continuations and len(text) > 2:
            raise ValueError(
                'trigram model has no continuation for context {0!r}'.format(context))

        r = random.random()
        accumulator = .0

        # walk the cumulative distribution until it reaches the threshold
        for word, word_prob in continuations.items():
            accumulator += word_prob
            if accumulator >= r:
                prob *= word_prob  # conditional probability of the chosen word
                text.append(word)
                break

        # stop argument: sentence end reached (or nothing could be generated)
        if text[-2:] == [None, None]:
            break

    return text, prob

# sample one sentence and report its probability
generated_text, prob_gt = generate_text_trigram(trigram_model=trigram_model_gutenberg)
print("Probability of text=", prob_gt)
# falsy tokens (the None padding) are left out of the printed sentence
print(' '.join(filter(None, generated_text)))
 

Probability of text= 9.19625508578272e-62
Well it looks at in all Macedonia but we were among the tribes from the Bashee isles we emerged at last it sat for some time though to his keeping his bullet and had almost told her how to be the king of creation no primal solitude Dense joyous modern populous millions cities and dwelt in the world to be a book


In [None]:
# sample a second sentence and report its probability
generated_text, prob_gt = generate_text_trigram(trigram_model=trigram_model_gutenberg)
print("Probability of text=", prob_gt)
# falsy tokens (the None padding) are left out of the printed sentence
print(' '.join(filter(None, generated_text)))


Probability of text= 2.897215706541899e-17
This man only nodded and was just wishing I had better not talk about wasting IT


## Question 2: Naive Bayes classifier

In [26]:
# load data
# training sentences: the first three are labelled "entertaining", the last three "boring"
Xtrain = [
    "the actor gives a convincing, charismatic performance as the multifaceted",
    "Spielberg gives us a visually spicy and historically accurate real life story",
    "His innovative mind entertains us now and will continue to entertain generations to come",
    "Unfortunately, the film has two major flaws, one in the disastrous ending",
    "If director actually thought this movie was worth anything",
    "His efforts seem fruitless, creates drama where drama shouldn't be",
]
ytrain = ["entertaining"] * 3 + ["boring"] * 3
# sentence to classify
Xtest = "film is a innovative drama, entertains, but disastrous ending"
categories = ["entertaining", "boring"]


In [27]:
# step 1. calculate prior probabilities of sentences P(c) = Nc / total N
def prob_counter(cat, labels):
    """Compute the prior probability of each category.

    Input: list of categories and list of training labels
    Output: dictionary whose keys are "prob_" + category name and whose values
            are the share of labels equal to that category
    """
    n_labels = len(labels)
    return {"prob_" + name: labels.count(name) / n_labels for name in cat}

# priors P(c) for both categories
prob_counts = prob_counter(cat=categories, labels=ytrain)


# step 2. calculate the vocabulary size
# Find negations ("not" / "n't") and prefix the words that follow them with "not_"
def neg_remover(train_set):
    """Tokenize each sentence and mark the words that follow a negation.

    For every sentence the first "n't" token (or, if there is none, the first
    "not" token) is located; an "n't" token is rewritten to "not". Every token
    after it gets the prefix "not_" until a comma is reached or the sentence
    ends. Sentences without a negation are only tokenized.

    Input: sequence of sentence strings
    Output: list of token lists, one per input sentence
    """
    tokenized_fully = []
    for sentence in train_set:
        tokens = nltk.word_tokenize(sentence)

        # position of the negation token, if any ("n't" takes precedence)
        if "n't" in tokens:
            negation_at = tokens.index("n't")
            tokens[negation_at] = "not"
        elif "not" in tokens:
            negation_at = tokens.index("not")
        else:
            negation_at = None

        if negation_at is not None:
            # tag everything after the negation, stopping at the first comma
            for position in range(negation_at + 1, len(tokens)):
                if tokens[position] == ",":
                    break
                tokens[position] = "not_" + tokens[position]

        tokenized_fully.append(tokens)
    return tokenized_fully

# remove commas
def comma_remover(train_set):
    """Strip every "," token from each token list of a tokenized corpus.

    The inner lists are modified in place and the same outer object is returned.
    """
    for tokens in train_set:
        tokens[:] = [token for token in tokens if token != ","]
    return train_set


# remove the
def the_remover(train_set):
    """Strip every "the" token from each token list of a tokenized corpus.

    The inner lists are modified in place and the same outer object is returned.
    """
    for tokens in train_set:
        tokens[:] = [token for token in tokens if token != "the"]
    return train_set

# remove a
def a_remover(train_set):
    """Strip every "a" token from each token list of a tokenized corpus.

    The inner lists are modified in place and the same outer object is returned.
    """
    for tokens in train_set:
        tokens[:] = [token for token in tokens if token != "a"]
    return train_set

# preprocessing pipeline: tag negations, then drop commas, "the" and "a"
separ2 = a_remover(
    the_remover(
        comma_remover(
            neg_remover(Xtrain)
        )
    )
)

# flatten the token lists to count all tokens and the distinct ones
combined = [token for sentence_tokens in separ2 for token in sentence_tokens]
total_count = len(combined)
print('Total count in train data set corpus: ', total_count)
vocabulary = set(combined)
length_voc = len(vocabulary)
print('Vocabulary of train data set corpus: ', length_voc)

Total count in train data set corpus:  62
Vocabulary of train data set corpus:  56


In [28]:
# step 3. drop unknown words from the test set
# NOTE(review): the value returned by this call is neither assigned nor used,
# so it has no effect on the following statements -- confirm whether the
# negation-tagged tokens were meant to be passed on to word_removal().
neg_remover([Xtest]) #check negative sentences to change words

def word_removal(test_str):
    """Tokenize a test string and drop every token unknown to the training data.

    Input: test_str, the sentence to classify
    Output: list of the tokens of ``test_str`` that occur in the module-level
            ``vocabulary`` set, in their original order
    """
    # tokenize the argument itself (the global Xtest used to be read here,
    # which made the parameter meaningless)
    return [token for token in word_tokenize(test_str) if token in vocabulary]

# tokens of the test sentence that are known from training
final_test = word_removal(test_str=Xtest)
print(final_test)

['film', 'innovative', 'drama', 'entertains', 'disastrous', 'ending']


In [29]:
# step 4. calculate the smoothed likelihoods P(w|c) = (count(w, c) + 1) / (count(c) + |V|)

# create template
def template(cat, bag_words):
    """Create the count table used for the add-one smoothed likelihoods.

    Input: list of categories and the words of the test sentence
    Output: nested defaultdict ``model[word][category]`` initialised to 1 for
            every given word/category pair (lookups of other pairs yield 0)
    """
    model = defaultdict(lambda: defaultdict(lambda: 0))
    pairs = ((word, label) for word in bag_words for label in cat)
    for word, label in pairs:
        model[word][label] = 1  # start at one instead of zero (add-one smoothing)
    return model

# smoothed count table for the words of the test sentence
model_ent = template(cat=categories, bag_words=final_test)

# count appearance 
def words_quantity(categories, bag_words, labels, t_train_set, model):
    """Add the per-category occurrence counts of the test words to the model.

    Input:
        categories: list of category names
        bag_words: words of the test sentence to count
        labels: label of each training sentence, aligned with t_train_set
        t_train_set: tokenized training sentences (list of token lists)
        model: nested mapping model[word][category], updated in place
    Output: the same ``model`` object with the counts added
    """
    for target in bag_words:
        for cat in categories:
            # Pair each sentence with its own label by position. Looking the
            # label up through t_train_set.index(sent) returned the label of
            # the first equal sentence, which is wrong for duplicates.
            for sent, label in zip(t_train_set, labels):
                if label != cat:
                    continue
                hits = sum(1 for word in sent if word == target)
                if hits:
                    model[target][cat] += hits
    return model

# add the training-set occurrence counts to the smoothed table
model_ent = words_quantity(
    categories=categories,
    bag_words=final_test,
    labels=ytrain,
    t_train_set=separ2,
    model=model_ent,
)

# count number of words in each category
def category_length(category, t_train_set, labels):
    """Count the number of tokens belonging to each category.

    Input:
        category: list of category names
        t_train_set: tokenized training sentences (list of token lists)
        labels: label of each training sentence, aligned with t_train_set
    Output: dictionary whose keys are "length_" + category name and whose
            values are the summed lengths of the sentences with that label
    """
    length = {"length_" + cat: 0 for cat in category}
    # Pair sentences and labels by position; t_train_set.index(sent) would map
    # duplicate sentences to the label of their first occurrence.
    for sent, label in zip(t_train_set, labels):
        for cat in category:
            if label == cat:
                length["length_" + cat] += len(sent)
    return length
    

# number of training tokens per category
length_dict = category_length(category=categories, t_train_set=separ2, labels=ytrain)
    
#count probabilities
def words_probability(model, category, cat_length, vocabulary):
    """Turn the (smoothed) counts of the model into probabilities, in place.

    Input:
        model: nested mapping model[word][category] holding counts
        category: list of category names
        cat_length: dictionary with a "length_" + category entry per category
        vocabulary: size of the training vocabulary
    Output: the same ``model`` object; each count whose category is listed in
            ``category`` is divided by that category's length plus ``vocabulary``
    """
    for per_category in model.values():
        for label in per_category:
            for cat in category:
                if cat == label:
                    per_category[label] /= cat_length["length_"+cat]+vocabulary

    return model

# convert the smoothed counts into likelihoods P(w|c)
model_ent = words_probability(
    model=model_ent,
    category=categories,
    cat_length=length_dict,
    vocabulary=length_voc,
)

In [30]:
# step 4. calculate the smoothed likelihoods P(w|c) = (count(w, c) + 1) / (count(c) + |V|)

# create template
def template(cat, bag_words):
    """Create the count table used for the add-one smoothed likelihoods.

    Input: list of categories and the words of the test sentence
    Output: nested defaultdict ``model[word][category]`` initialised to 1 for
            every given word/category pair (lookups of other pairs yield 0)
    """
    model = defaultdict(lambda: defaultdict(lambda: 0))
    pairs = ((word, label) for word in bag_words for label in cat)
    for word, label in pairs:
        model[word][label] = 1  # start at one instead of zero (add-one smoothing)
    return model

# smoothed count table for the words of the test sentence
model_ent = template(cat=categories, bag_words=final_test)

# count appearance 
def words_quantity(categories, bag_words, labels, t_train_set, model):
    """Add the per-category occurrence counts of the test words to the model.

    Input:
        categories: list of category names
        bag_words: words of the test sentence to count
        labels: label of each training sentence, aligned with t_train_set
        t_train_set: tokenized training sentences (list of token lists)
        model: nested mapping model[word][category], updated in place
    Output: the same ``model`` object with the counts added
    """
    for target in bag_words:
        for cat in categories:
            # Pair each sentence with its own label by position. Looking the
            # label up through t_train_set.index(sent) returned the label of
            # the first equal sentence, which is wrong for duplicates.
            for sent, label in zip(t_train_set, labels):
                if label != cat:
                    continue
                hits = sum(1 for word in sent if word == target)
                if hits:
                    model[target][cat] += hits
    return model

# add the training-set occurrence counts to the smoothed table
model_ent = words_quantity(
    categories=categories,
    bag_words=final_test,
    labels=ytrain,
    t_train_set=separ2,
    model=model_ent,
)

# count number of words in each category
def category_length(category, t_train_set, labels):
    """Count the number of tokens belonging to each category.

    Input:
        category: list of category names
        t_train_set: tokenized training sentences (list of token lists)
        labels: label of each training sentence, aligned with t_train_set
    Output: dictionary whose keys are "length_" + category name and whose
            values are the summed lengths of the sentences with that label
    """
    length = {"length_" + cat: 0 for cat in category}
    # Pair sentences and labels by position; t_train_set.index(sent) would map
    # duplicate sentences to the label of their first occurrence.
    for sent, label in zip(t_train_set, labels):
        for cat in category:
            if label == cat:
                length["length_" + cat] += len(sent)
    return length
    

# number of training tokens per category
length_dict = category_length(category=categories, t_train_set=separ2, labels=ytrain)
    
#count probabilities
def words_probability(model, category, cat_length, vocabulary):
    """Turn the (smoothed) counts of the model into probabilities, in place.

    Input:
        model: nested mapping model[word][category] holding counts
        category: list of category names
        cat_length: dictionary with a "length_" + category entry per category
        vocabulary: size of the training vocabulary
    Output: the same ``model`` object; each count whose category is listed in
            ``category`` is divided by that category's length plus ``vocabulary``
    """
    for per_category in model.values():
        for label in per_category:
            for cat in category:
                if cat == label:
                    per_category[label] /= cat_length["length_"+cat]+vocabulary

    return model

# convert the smoothed counts into likelihoods P(w|c)
model_ent = words_probability(
    model=model_ent,
    category=categories,
    cat_length=length_dict,
    vocabulary=length_voc,
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2abc63bb-dda8-43c2-995b-f7ae2a0ae9ba' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>