In [25]:
import numpy as np
from nltk.corpus import brown
import operator

In [26]:
brown_sentences = brown.sents()
print(brown_sentences[:400])

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]


In [27]:
brown_sentences = list(map(lambda x: ' '.join(x).lower(), brown_sentences))

In [42]:
def get_vocab_limited(vocab_size = 2000):
    """ 
    -----------------------------
    Description :
    Function to return a limited vocabulary
    
    Input : 
    vocab_size : The vocabulary size required
    
    Return :
    sentence_small : The sentences consisting of the limited vocabulary (list structure)
    word2idx_small : Limited vocabularies (dict structure)
    ------------------------------
    
    """
    
    word2idx = {"START" : 0, "END" : 1}
    idx2word = ["START", "END"]
    word2idx_count = {0:float('inf'), 1:float('inf')}
    
    indexed_sentences = []
    sentences_small = []
    word2idx_small = {}
    n = 0
    
    for sentences in brown_sentences:
        # sentences : a string value  eg. "The quick brown fox jumps over the lazy dog"
        
        words = sentences.split()
        
        # words : list structure ["The", "quick", "brown" ....]
        indexed_sentence = []
        
        for token in words:
            
            if token not in word2idx:
                word2idx[token] = n
                n += 1
                idx2word.append(token)
                word2idx_count[word2idx[token]] = 1
                
            word2idx_count[word2idx[token]] += 1
            indexed_sentence.append(word2idx[token])
            
        indexed_sentences.append(indexed_sentence)
            
    
    new_sorted_vocab = sorted([(k,v) for k,v in word2idx_count.items()], key=operator.itemgetter(1), reverse = True)
    # new_sorted_vocab : list structure [(1, 200), (2, 500) ....] where the first index of every tuple is the word-index and the second index is it's count
    
    new_idx = 0
    new_idx2idx_map = {}
    
    for k,v in new_sorted_vocab[:vocab_size]:
        if k not in word2idx_small:
            word = idx2word[k]
            word2idx_small[word] = new_idx
            
            #Creating a new map of index to index as we are selecting a certain amount of word

            new_idx2idx_map[word2idx[word]] = new_idx
            new_idx += 1
        
    word2idx_small["UNKNOWN"] = new_idx
    unknown = new_idx
    
    # indexed_sentence : list of list structure 
    for sentence in indexed_sentences:
        if len(sentence) > 1:
            new_sentence = [new_idx2idx_map[idx] if idx in new_idx2idx_map else unknown for idx in sentence]
            sentences_small.append(new_sentence)
            
    
    return sentences_small, word2idx_small
        
    
    
        

In [40]:
def bigram_prob(sentences, V, start_idx, end_idx, smoothning=1):
    """
    --------------------------------------------------
    Description :
    Function to form the bigram model of a sentence
    
    Input :
    sentences : list of list structure 
    V : Vocab size
    start_idx : Index value of "START" token
    end_idx : Index value of "END" token
    smoothning : smoothning value to be added
    
    Return :
    mat : n-dimensional matrix (The Bigram Model)
    -------------------------------------------------
    
    """
    mat = np.ones((V, V)) * smoothning # Add-one smoothining hence intializing the matrix with ones
    for sentence in sentences:
        # sentence : list of integers
        words = sentence
        for i in range(len(words)):
            if i == 0:
                mat[start_idx, words[i]] += 1
            
            elif i == len(words) - 1:
                mat[words[i], end_idx] += 1
            
            else:
                mat[words[i - 1], words[i]] += 1
                
    mat /= mat.sum(axis = 1, keepdims = True)
    return mat

In [43]:
sentences, word2idx = get_vocab_limited(vocab_size=10000)
print("Vocab size : {}".format(len(word2idx)))

Vocab size : 10001


In [44]:
start_idx = word2idx["START"]
end_idx = word2idx["END"]
bigram_mat = bigram_prob(sentences, len(word2idx), start_idx, end_idx)
print("Shape of Bigram probabilities : {}".format(bigram_mat.shape))

Shape of Bigram probabilities : (10001, 10001)


In [57]:
def get_score(sentence):
    """ 
    --------------------------------------
    Description:
    Function to determine the log-probability score of a sentence 
    
    Input:
    sentence : list of integers containing indices of each word in a sentence
    
    Return:
    prob_score : log probability score of a sentence
    ---------------------------------------
    
    """
    score = 0
    # score calculation = (log p(w_1) + summation(p(w_t | w_t-1))) / T
    for i in range(len(sentence)):
        if i == 0:
            score += np.log(bigram_mat[start_idx, sentence[i]])
        else:
            score += np.log(bigram_mat[sentence[i - 1], sentence[i]])
    
    # For the end 
    score += np.log(bigram_mat[sentence[-1], end_idx])
    prob_score = score / (len(sentence) + 1) 
    return prob_score

In [47]:
idx2word = {v:k for k,v in word2idx.items()}

In [48]:
def get_words(sentence):
    return ' '.join([idx2word[i] for i in sentence])

In [70]:
# For fake sentence sampling :
sample_p = np.ones(len(word2idx))

# When we sample fake sentence we need to make sure not to sample "START" or "END" token

sample_p[start_idx] = 0
sample_p[end_idx] = 0
sample_p /= sample_p.sum()
while True:
    real_idx = np.random.choice(len(sentences))
    
    #sentence : list of list structure (containing indices of words in a sentence)
    real = sentences[real_idx]
    # Test our bigram model for fake sentence 
    fake = np.random.choice(len(word2idx), size=len(real), p=sample_p)
    
    print("Real Sentence : {} | Score : {:.7f}".format(get_words(real), get_score(real)))
    print("Fake Sentence : {} | Score : {:.7f}".format(get_words(fake), get_score(fake)))
    
    
    custom_sentence = input("Enter a text : ")
    custom_sentence = custom_sentence.strip().lower().split()
    bad_sentence = False
    for words in custom_sentence:
        if words not in word2idx:
            print("You entered have entered words which are not in the vocabulary")
            bad_sentence = True
            break
    if not bad_sentence:
        custom_sentence_list = [word2idx[words] for words in custom_sentence]
        print("Custom Sentence : {} | Score : {:.7f}".format(get_words(custom_sentence_list),get_score(custom_sentence_list)))
    
    cont = input("Do you want to continue Y/N? : ")
    if cont in ('N', 'n'):
        break

Real Sentence : `` i don't UNKNOWN '' , i told UNKNOWN , `` except that i UNKNOWN be here '' . | Score : -4.5254852
Fake Sentence : two-inch armour lucifer vientiane hegel's wackers' ratification friday supported perpetually avaliable surprisingly cloakrooms omissions commodity impute 3-inch cut based | Score : -9.3078275


Enter a text :  i don't know


You entered a words which is not in the vocabulary


Do you want to continue Y/N? :  U


Real Sentence : u. s. UNKNOWN to UNKNOWN a neutral laos UNKNOWN have led premier UNKNOWN to believe that other areas UNKNOWN be `` neutralized '' on UNKNOWN terms . | Score : -6.0889485
Fake Sentence : anti-communists hershey's assigned handed hobbled conspire naked accredited stagecoach lush sprinkle frick gassing holidays york scots inverted seat viewed underwriters coalition tax wilsonian pilot delegating revelation distinctions | Score : -9.2811271


Enter a text :  a neutral laos


Custom Sentence : a neutral laos | Score : -7.4247563


Do you want to continue Y/N? :  N
