# Language Modelling Lab (week 2)
This notebook provides the "starter" code in the week 2 lab


1.
We need to get the names of files in the training directory and split them into training and testing 50:50.

In [1]:
import os,random,math
TRAINING_DIR="sentence-completion/Holmes_Training_Data"  #this needs to be the parent directory for the training corpus

def get_training_testing(training_dir=TRAINING_DIR,split=0.5):
    
    filenames=os.listdir(training_dir)
    n=len(filenames)
    print("There are {} files in the training directory: {}".format(n,training_dir))
    random.seed(53)  #if you want the same random split every time
    random.shuffle(filenames)
    index=int(n*split)
    return(filenames[:index],filenames[index:])
    
trainingfiles,heldoutfiles=get_training_testing()


There are 522 files in the training directory: sentence-completion/Holmes_Training_Data


In [2]:
len(trainingfiles)

261

## 1  Building a unigram model

In [3]:
from nltk import word_tokenize as tokenize
import operator
import random

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        #store the names of the files containing training data and run the training method
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #convert the accumulated counts to probabilities
        self._convert_to_probs()
        
    def _processline(self,line):
        #process each line of a file
        #each line is tokenized and has a special start and end token added
        #counts of tokens are added to the self.unigram count model
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
    
    
    def _processfiles(self):
        #process each file in turn
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))
      
            
    def _convert_to_probs(self):
        #self.unigram initially counts counts for each token {token:freq(token)}
        #sum all of the frequencies and divide each frequency by that sum to get probabilities
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
    
    
    def get_prob(self,token,method="unigram"):
        #simple look up method
        if method=="unigram":
            return self.unigram.get(token,0)
        else:
            print("Not implemented: {}".format(method))
            return 0  
    
    def get_common_words(self,k=10,stopword="__END"):
        #keep returning one of the highly probable words until a stopword is encountered or the max length is exceeded
        
        blacklist = ["__START"]
        sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
        filtered = [w for (w,p) in sorted_unigram if w not in blacklist]
        
        chosen_words = []
        for i in range(k):
            rand_word = random.choice(filtered[:k])
            if rand_word == stopword:
                break
            chosen_words.append(rand_word)
                
        return " ".join(chosen_words[:-1])
        
        
    def get_sample_from_dist(self,k=10,stopword='__END'):
        #return one of the highly probable words
        
        blacklist = ["__START"]
        sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
        filtered = [(w,p) for (w,p) in sorted_unigram if w not in blacklist]
        values,dist = zip(*filtered[:k])
        
        chosen_words = []
        for i in range(k):
            rand_word = random.choices(values,dist)[0]
            if rand_word == stopword:
                break
            chosen_words.append(rand_word)
                
        return " ".join(chosen_words[:-1])

In [4]:
MAX_FILES=5
mylm=language_model(files=trainingfiles[:MAX_FILES])

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT


In [5]:
mylm.get_prob('the')

0.03527550407241448

In [6]:
mylm.get_common_words()

"of . . , '' I the , ."

Make sure you look up some probabilities of words in your model.  Pick some words which you would expect to have high probabilities and some words which you would expect to have low probabilities.

As an extension, see how these change if you use a bigger portion of the training data to train your model.


In [7]:
mylm.get_sample_from_dist(k=100)

'you . at . , ,'

## Adding Bigrams

In [8]:
from nltk import word_tokenize as tokenize
import operator
import random
import math

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        #store the names of the files containing training data and run the training method
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        #initialise an empty dictionary which will be the bigram model
        self.bigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #words in the corpus
        self.word_count = sum(self.unigram.values())
        #convert the accumulated counts to probabilities
        self._convert_to_probs()
        
    def _processline(self,line):
        #process each line of a file
        #each line is tokenized and has a special start and end token added
        #counts of tokens are added to the self.unigram count model
        
        #unigram
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
        
        #bigram
        tokens_bg = ["__START"]+tokenize(line)
        for index in range(len(tokens_bg)-1):
            if self.bigram.get(tokens_bg[index],0) == 0:
                self.bigram[tokens_bg[index]] = {}
            
            self.bigram[tokens_bg[index]][tokens_bg[index+1]] = self.bigram[tokens_bg[index]].get(tokens_bg[index+1],0)+1            
    
    def _processfiles(self):
        #process each file in turn
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))
      
            
    def _convert_to_probs(self):
        #self.unigram initially counts counts for each token {token:freq(token)}
        #sum all of the frequencies and divide each frequency by that sum to get probabilities
        
        #unigram
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        
        #bigram
        for outer_key,inner_dict in self.bigram.items():
            inner_dict_probs = {k:v/sum(inner_dict.values()) for (k,v) in inner_dict.items()}
            self.bigram[outer_key] = inner_dict_probs
            
                   
    def get_prob(self,token,method="unigram"):
        #simple look up method
        if method=="unigram":
            return self.unigram.get(token,0)
        else:
            print("Not implemented: {}".format(method))
            return 0
    
    
    def get_common_words(self,k=10,stopword='.'):
        
        sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
        
        chosen_words = []
        for i in range(k):
            rand_word = random.choice(sorted_unigram[:k])
            chosen_words.append(rand_word)
            if rand_word[0] == stopword:
                break
                
        return chosen_words
        
    def get_sample_from_dist(self,k=10,stopword='__END', method="unigram"):
        #return one of the highly probable words
        
        current_word = "__START"
        blacklist = ["__START"]

        if method == "unigram":
            sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
            filtered = [(w,p) for (w,p) in sorted_unigram if w not in blacklist]
            values,dist = zip(*filtered[:k])
        
            chosen_words = []
            for i in range(k):
                current_word = random.choices(values,dist)[0]
                if current_word == stopword:
                    break
                chosen_words.append(current_word)

            return " ".join(chosen_words[:-1])
        
        
        elif method == "bigram":       
            sorted_bigram = sorted(self.bigram[current_word].items(), key=lambda item: item[1], reverse=True)
            filtered = [(w,p) for (w,p) in sorted_bigram if w not in blacklist]
            values,dist = zip(*filtered[:k])
            
            chosen_words = []
            for i in range(k):
                sorted_bigram = sorted(self.bigram[current_word].items(), key=lambda item: item[1], reverse=True)
                filtered = [(w,p) for (w,p) in sorted_bigram if w not in blacklist]
                values,dist = zip(*filtered[:k])
                current_word = random.choices(values,dist)[0]
                if current_word == stopword:
                    break
                chosen_words.append(current_word)
            
            return " ".join(chosen_words[:-1])


In [9]:
lang_ml = language_model(files=trainingfiles[:MAX_FILES])

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT


In [10]:
lang_ml.get_sample_from_dist(method="bigram", k=40)

"was a light in the house , for me in the next meet ? '' said nothing to a hand ? '' said Berry . Then suddenly conquered a girl 's not even , I should n't it ,"

## Computing Perplexity

In [11]:
from nltk import word_tokenize as tokenize
import operator
import random
import math

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        #store the names of the files containing training data and run the training method
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        #initialise an empty dictionary which will be the bigram model
        self.bigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #words in the corpus
        self.word_count = sum(self.unigram.values())
        #convert the accumulated counts to probabilities
        self._convert_to_probs()
        
    def _processline(self,line):
        #process each line of a file
        #each line is tokenized and has a special start and end token added
        #counts of tokens are added to the self.unigram count model
        
        #unigram
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
        
        #bigram
        tokens_bg = ["__START"]+tokenize(line)
        for index in range(len(tokens_bg)-1):
            if self.bigram.get(tokens_bg[index],0) == 0:
                self.bigram[tokens_bg[index]] = {}
            
            self.bigram[tokens_bg[index]][tokens_bg[index+1]] = self.bigram[tokens_bg[index]].get(tokens_bg[index+1],0)+1            
    
    def _processfiles(self):
        #process each file in turn
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))
      
            
    def _convert_to_probs(self):
        #self.unigram initially counts counts for each token {token:freq(token)}
        #sum all of the frequencies and divide each frequency by that sum to get probabilities
        
        #unigram
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        
        #bigram
        for outer_key,inner_dict in self.bigram.items():
            inner_dict_probs = {k:v/sum(inner_dict.values()) for (k,v) in inner_dict.items()}
            self.bigram[outer_key] = inner_dict_probs
            
                   
    def get_prob(self,token,context="",method="unigram"):
        #simple look up method
        if method=="unigram":
            return self.unigram.get(token,0)
        elif method=="bigram":
            return self.bigram.get(context[-1],{}).get(token,0)
    
    
    def get_common_words(self,k=10,stopword='.'):
        
        sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
        
        chosen_words = []
        for i in range(k):
            rand_word = random.choice(sorted_unigram[:k])
            chosen_words.append(rand_word)
            if rand_word[0] == stopword:
                break
                
        return chosen_words
        
    def get_sample_from_dist(self,k=10,stopword='__END', method="unigram"):
        #return one of the highly probable words
        
        current_word = "__START"
        blacklist = ["__START"]

        if method == "unigram":
            sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
            filtered = [(w,p) for (w,p) in sorted_unigram if w not in blacklist]
            values,dist = zip(*filtered[:k])
        
            chosen_words = []
            for i in range(k):
                current_word = random.choices(values,dist)[0]
                if current_word == stopword:
                    break
                chosen_words.append(current_word)

            return " ".join(chosen_words[:-1])
        
        
        elif method == "bigram":       
            sorted_bigram = sorted(self.bigram[current_word].items(), key=lambda item: item[1], reverse=True)
            filtered = [(w,p) for (w,p) in sorted_bigram if w not in blacklist]
            values,dist = zip(*filtered[:k])
            
            chosen_words = []
            for i in range(k):
                sorted_bigram = sorted(self.bigram[current_word].items(), key=lambda item: item[1], reverse=True)
                filtered = [(w,p) for (w,p) in sorted_bigram if w not in blacklist]
                values,dist = zip(*filtered[:k])
                current_word = random.choices(values,dist)[0]
                if current_word == stopword:
                    break
                chosen_words.append(current_word)
            
            return " ".join(chosen_words[:-1])
        
    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens
    
        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],method))
        return acc,len(tokens[1:])
       
    
    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files
        
        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N
    
    def compute_perplexity(self,filenames=[],method="unigram"):
        
        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data
        
        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp   

In [12]:
my_ml = language_model(files=trainingfiles[:MAX_FILES])
my_ml.compute_perplexity(trainingfiles[:MAX_FILES])

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT
Processing file 0:DYNMT10.TXT
Processing file 1:09TOM10.TXT
Processing file 2:PRSIT10.TXT
Processing file 3:NWIND10.TXT
Processing file 4:BDAPH10.TXT


487.89637689524403

In [13]:
my_ml.compute_perplexity(heldoutfiles[:MAX_FILES])

Processing file 0:GGIRL10.TXT


ValueError: math domain error

## Tackling Unknown Words

In [14]:
from nltk import word_tokenize as tokenize
import operator
import random
import math

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        #store the names of the files containing training data and run the training method
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        #initialise an empty dictionary which will be the bigram model
        self.bigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #words in the corpus
        self.word_count = sum(self.unigram.values())
        #consider rare words as unknown
        self.create_unknown()
        #convert the accumulated counts to probabilities
        self._convert_to_probs()
        
    def _processline(self,line):
        #process each line of a file
        #each line is tokenized and has a special start and end token added
        #counts of tokens are added to the self.unigram count model
        
        #unigram
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
        
        #bigram
        tokens_bg = ["__START"]+tokenize(line)
        for index in range(len(tokens_bg)-1):
            if self.bigram.get(tokens_bg[index],0) == 0:
                self.bigram[tokens_bg[index]] = {}
            
            self.bigram[tokens_bg[index]][tokens_bg[index+1]] = self.bigram[tokens_bg[index]].get(tokens_bg[index+1],0)+1            
    
    def _processfiles(self):
        #process each file in turn
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))
    
    
    def create_unknown(self):
        
        unk_keys = []
        self.unigram["_UNK"] = 0
        for w,f in self.unigram.items():
            if f <= 2:
                self.unigram["_UNK"] = self.unigram.get("_UNK", 0) + f
                unk_keys.append(w)
        
        for i in unk_keys:
            del self.unigram[i]
        
        unk_outer_keys = []
        self.bigram["_UNK"] = {}
        for outer,inner in self.bigram.items():
            unk_inner_keys = []
            inner["_UNK"] = 0
            for w,f in inner.items():
                if f <= 2:
                    inner["_UNK"] = inner.get("_UNK", 0) + f
                    unk_inner_keys.append(w)
            for i in unk_inner_keys:
                del self.bigram[outer][i]
            
            
            if outer not in self.unigram.keys():
                self.bigram["_UNK"] |= self.bigram[outer]
                unk_outer_keys.append(outer)
        for i in unk_outer_keys:
            del self.bigram[i] 
    
    def _convert_to_probs(self):
        #self.unigram initially counts counts for each token {token:freq(token)}
        #sum all of the frequencies and divide each frequency by that sum to get probabilities
        
        #unigram
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        
        #bigram
        for outer_key,inner_dict in self.bigram.items():
            inner_dict_probs = {k:v/sum(inner_dict.values()) for (k,v) in inner_dict.items()}
            self.bigram[outer_key] = inner_dict_probs
            
                   
    def get_prob(self,token,context="",method="unigram"):
        #simple look up method
        if method=="unigram":
            return self.unigram.get(token,self.unigram["_UNK"])
        elif method=="bigram":
            return self.bigram.get(context[-1],self.bigram["_UNK"]).get(token,self.bigram[context[-1]]["_UNK"])
    
    
    def get_common_words(self,k=10,stopword='.'):
        
        sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
        
        chosen_words = []
        for i in range(k):
            rand_word = random.choice(sorted_unigram[:k])
            chosen_words.append(rand_word)
            if rand_word[0] == stopword:
                break
                
        return chosen_words
        
    def get_sample_from_dist(self,k=10,stopword='__END', method="unigram"):
        #return one of the highly probable words
        
        current_word = "__START"
        blacklist = ["__START"]

        if method == "unigram":
            sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
            filtered = [(w,p) for (w,p) in sorted_unigram if w not in blacklist]
            values,dist = zip(*filtered[:k])
        
            chosen_words = []
            for i in range(k):
                current_word = random.choices(values,dist)[0]
                if current_word == stopword:
                    break
                chosen_words.append(current_word)

            return " ".join(chosen_words[:-1])
        
        
        elif method == "bigram":       
            sorted_bigram = sorted(self.bigram[current_word].items(), key=lambda item: item[1], reverse=True)
            filtered = [(w,p) for (w,p) in sorted_bigram if w not in blacklist]
            values,dist = zip(*filtered[:k])
            
            chosen_words = []
            for i in range(k):
                sorted_bigram = sorted(self.bigram[current_word].items(), key=lambda item: item[1], reverse=True)
                filtered = [(w,p) for (w,p) in sorted_bigram if w not in blacklist]
                values,dist = zip(*filtered[:k])
                current_word = random.choices(values,dist)[0]
                if current_word == stopword:
                    break
                chosen_words.append(current_word)
            
            return " ".join(chosen_words[:-1])
        
    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens
    
        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],method))
        return acc,len(tokens[1:])
       
    
    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files
        
        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N
    
    def compute_perplexity(self,filenames=[],method="unigram"):
        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data
        
        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp   

In [15]:
my_new_ml = language_model(files=trainingfiles[:MAX_FILES])

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT


In [16]:
my_new_ml.compute_perplexity(heldoutfiles[:MAX_FILES])

Processing file 0:GGIRL10.TXT
Processing file 1:SBRUN10.TXT
Processing file 2:TARZ510.TXT
Processing file 3:ASPRN10.TXT
Processing file 4:TBTAS10.TXT


314.40734307861635

## Smoothing

In [60]:
from nltk import word_tokenize as tokenize
import operator
import random
import math

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        #store the names of the files containing training data and run the training method
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        #initialise an empty dictionary which will be the bigram model
        self.bigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #words in the corpus
        self.word_count = sum(self.unigram.values())
        #consider rare words as unknown
        self.create_unknown()
        #apply discount to the bigrams
        self.apply_discount()
        #convert the accumulated counts to probabilities
        self._convert_to_probs()
        
    def _processline(self,line):
        #process each line of a file
        #each line is tokenized and has a special start and end token added
        #counts of tokens are added to the self.unigram count model
        
        #unigram
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
        
        #bigram
        tokens_bg = ["__START"]+tokenize(line)
        for index in range(len(tokens_bg)-1):
            if self.bigram.get(tokens_bg[index],0) == 0:
                self.bigram[tokens_bg[index]] = {}
            
            self.bigram[tokens_bg[index]][tokens_bg[index+1]] = self.bigram[tokens_bg[index]].get(tokens_bg[index+1],0)+1            
    
    def _processfiles(self):
        #process each file in turn
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))
    
    
    def create_unknown(self):
        
        unk_keys = []
        self.unigram["_UNK"] = 0
        for w,f in self.unigram.items():
            if f <= 2:
                self.unigram["_UNK"] = self.unigram.get("_UNK", 0) + f
                unk_keys.append(w)
        
        for i in unk_keys:
            del self.unigram[i]
        
        unk_outer_keys = []
        self.bigram["_UNK"] = {}
        for outer,inner in self.bigram.items():
            unk_inner_keys = []
            self.bigram[outer]["_UNK"] = 0
            for w,f in inner.items():
                if f <= 2:
                    self.bigram[outer]["_UNK"] = self.bigram[outer].get("_UNK", 0) + f
                    unk_inner_keys.append(w)
            for i in unk_inner_keys:
                del self.bigram[outer][i]
            
            
            if outer not in self.unigram.keys():
                self.bigram["_UNK"] |= self.bigram[outer]
                unk_outer_keys.append(outer)
        for i in unk_outer_keys:
            del self.bigram[i] 
    
    
    def _convert_to_probs(self):
        #self.unigram initially counts counts for each token {token:freq(token)}
        #sum all of the frequencies and divide each frequency by that sum to get probabilities
        
        #unigram
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        
        #bigram
        for outer_key,inner_dict in self.bigram.items():
            inner_dict_probs = {k:v/sum(inner_dict.values()) for (k,v) in inner_dict.items()}
            self.bigram[outer_key] = inner_dict_probs
            
                   
    def get_prob(self,token,context="",method="unigram"):
        #simple look up method
        if method=="unigram":
            return self.unigram.get(token,self.unigram["_UNK"])
        elif method=="bigram":
            #return self.bigram.get(context[-1],self.bigram["_UNK"]).get(token,self.bigram[context[-1]]["_UNK"])
            
            bigram=self.bigram.get(context[-1],self.bigram.get("_UNK",{}))
            big_p=bigram.get(token,bigram.get("_UNK",0))
            lmbda=bigram["__DISCOUNT"]
            uni_p=self.unigram.get(token,self.unigram.get("_UNK",0))
            #print(big_p,lmbda,uni_p)
            p=big_p+lmbda*uni_p            
            return p
    
    def get_common_words(self,k=10,stopword='.'):
        
        sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
        
        chosen_words = []
        for i in range(k):
            rand_word = random.choice(sorted_unigram[:k])
            chosen_words.append(rand_word)
            if rand_word[0] == stopword:
                break
                
        return chosen_words
        
    def get_sample_from_dist(self,k=10,stopword='__END', method="unigram"):
        #return one of the highly probable words
        
        current_word = "__START"
        blacklist = ["__START"]

        if method == "unigram":
            sorted_unigram = sorted(self.unigram.items(), key=lambda item: item[1], reverse=True)
            filtered = [(w,p) for (w,p) in sorted_unigram if w not in blacklist]
            values,dist = zip(*filtered[:k])
        
            chosen_words = []
            for i in range(k):
                current_word = random.choices(values,dist)[0]
                if current_word == stopword:
                    break
                chosen_words.append(current_word)

            return " ".join(chosen_words[:-1])
        
        
        elif method == "bigram":       
            sorted_bigram = sorted(self.bigram[current_word].items(), key=lambda item: item[1], reverse=True)
            filtered = [(w,p) for (w,p) in sorted_bigram if w not in blacklist]
            values,dist = zip(*filtered[:k])
            
            chosen_words = []
            for i in range(k):
                sorted_bigram = sorted(self.bigram[current_word].items(), key=lambda item: item[1], reverse=True)
                filtered = [(w,p) for (w,p) in sorted_bigram if w not in blacklist]
                values,dist = zip(*filtered[:k])
                current_word = random.choices(values,dist)[0]
                if current_word == stopword:
                    break
                chosen_words.append(current_word)
            
            return " ".join(chosen_words[:-1])
        
    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens
    
        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],method))
        return acc,len(tokens[1:])
       
    
    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files
        
        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N
    
    def compute_perplexity(self,filenames=[],method="unigram"):
        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data
        
        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp   
    
    
    def apply_discount(self, discount=0.75):
        """ for outer,inner in self.bigram.items():
            total_discount = 0
            for w,c in inner.items():
                self.bigram[outer][w] -= 0.75
                total_discount += 0.75
            
            self.bigram[outer]["_DISCOUNT"] = total_discount"""
            
            
        self.bigram={k:{kk:value-discount for (kk,value) in adict.items()}for (k,adict) in self.bigram.items()}
        
        #for each word, store the total amount of the discount so that the total is the same 
        #i.e., so we are reserving this as probability mass
        for k in self.bigram.keys():
            lamb=len(self.bigram[k])
            self.bigram[k]["__DISCOUNT"]=lamb*discount

In [35]:
model_2=language_model(files=trainingfiles[:MAX_FILES])

Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT


In [37]:
model_2.bigram["grass"]

{'in': 0.05508474576271186,
 '.': 0.1228813559322034,
 ',': 0.17372881355932204,
 'and': 0.05508474576271186,
 'grew': 0.038135593220338986,
 '_UNK': 0.4788135593220339,
 '_DISCOUNT': 0.07627118644067797}

In [61]:
MAX_FILES=5

filesets={"training":trainingfiles[:MAX_FILES],"testing":heldoutfiles[:MAX_FILES]}


my_lm_2=language_model(files=filesets["training"])
methods=["unigram","bigram"]
#methods=["bigram"]

for f,names in list(filesets.items()):
    for m in methods:

        p=my_lm.compute_perplexity(filenames=names,method=m)
        
        print("Perplexity on {} with {} method is {}".format(f,m,p))


Processing DYNMT10.TXT
Processing 09TOM10.TXT
Processing PRSIT10.TXT
Processing NWIND10.TXT
Processing BDAPH10.TXT


RuntimeError: dictionary changed size during iteration