In [46]:
import pandas as pd
import numpy
from tqdm.notebook import trange, tqdm
import random
import math
import numpy
from dataset import Dataset
import pickle
import os
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [31]:
# Corpus location, relative to the notebook's working directory.
data_file = 'data/ukwac-10L.txt'
# Dataset is imported from the local `dataset` module (not shown in this notebook).
dataset = Dataset(data_file)
# Three-way split; presumably the fractions are train / validation / test in that
# order, matching the names on the left — confirm against Dataset.partition.
# NOTE(review): shuffle=True is used with no visible seed, so the split may differ
# between runs — confirm whether Dataset.partition is deterministic.
training, validation, testing = dataset.partition((0.7, 0.1, 0.2), shuffle=True)

Index file not found, creating now....
29.473777770996094 Seconds
Index file found.
Index file found.
Index file found.


In [32]:
class SequenceState:
    """Accumulates adjacent-pair (bigram) statistics over segmented units.

    Units are whatever elements the caller passes in a ``split`` (single
    characters or already-joined multi-character strings).

    The same attributes hold two different kinds of values over the object's
    lifetime: raw counts while ``add_split_to_sequences`` is being called, and
    negative log relative frequencies once ``normalize_probs`` has run.
    """

    def __init__(self):
        # unit -> {following unit -> count}; after normalize_probs() the inner
        # values are -log(count / total followers of that unit).
        self.sequences = {}
        # unit -> number of pairs in which it was the left element; after
        # normalize_probs() the values are -log(count / sum of all counts).
        self.sequence_counts = {}
        # Total number of adjacent pairs recorded so far.
        self.num_chars = 0

    def normalize_probs(self):
        """Replace the raw counts in place with negative log relative frequencies.

        Call this once, after all counting is finished: the counts are
        overwritten, so a second call would take logs of the already
        converted values.
        """
        total = sum(self.sequence_counts.values())
        self.sequence_counts = {
            unit: -math.log(count / total)
            for unit, count in self.sequence_counts.items()
        }

        normalized = {}
        for unit, followers in self.sequences.items():
            follower_total = sum(followers.values())
            normalized[unit] = {
                nxt: -math.log(count / follower_total)
                for nxt, count in followers.items()
            }
        self.sequences = normalized

    def add_split_to_sequences(self, split):
        """Count every adjacent pair ``(split[i], split[i + 1])``.

        Args:
            split: sequence of units for one line. A split with fewer than two
                elements contributes nothing.
        """
        for prev, cur in zip(split, split[1:]):
            # Explicit lookups instead of nested bare ``except:`` blocks, so
            # unrelated errors (and KeyboardInterrupt) are no longer swallowed
            # and an existing pair count can never be reset by accident.
            followers = self.sequences.setdefault(prev, {})
            followers[cur] = followers.get(cur, 0) + 1
            self.sequence_counts[prev] = self.sequence_counts.get(prev, 0) + 1
            self.num_chars += 1
    

In [62]:
from datetime import datetime
def timestamp_now():
    """Return the current time as a string of whole seconds since the Unix epoch."""
    return str(int(datetime.now().timestamp()))
class Model:
    """Directory-backed store of named, pickled components.

    A model lives in ``<models_dir>/<model_name>/``; each component is stored
    as ``<component name>.pkl`` inside it. ``self.components`` maps a component
    name to a ``(payload, loaded)`` tuple: when ``loaded`` is False the payload
    is the path of the pickle file on disk, when True it is the unpickled
    object itself.
    """

    def __init__(self, models_dir, model_name=None):
        """Open the model directory, creating it if it does not exist yet.

        Args:
            models_dir: existing parent directory that holds all models.
            model_name: sub-directory name of this model. When omitted, the
                current Unix timestamp (whole seconds) is used.

        Raises:
            FileNotFoundError: if ``models_dir`` does not exist.
        """
        if not os.path.exists(models_dir):
            raise FileNotFoundError("Models Directory does not exist")

        if model_name is None:
            # Computed per call. A ``model_name=<call>()`` default would be
            # evaluated only once, when the ``def`` statement runs, so every
            # default-named model would share the same stale name.
            # (``datetime`` is imported just above this class.)
            model_name = str(int(datetime.now().timestamp()))

        self.model_name = model_name
        self.model_path = os.path.join(models_dir, model_name)
        self.components = {}

        if self.model_dir_exists():
            self.load_component_index()
        else:
            self.create_model_dir()

    def model_dir_exists(self):
        """Return True if this model's directory path exists."""
        return os.path.exists(self.model_path)

    def create_model_dir(self):
        """Create this model's directory; a no-op if it is already there."""
        os.makedirs(self.model_path, exist_ok=True)

    def load_component_index(self):
        """Register every ``*.pkl`` file in the model directory as not-yet-loaded.

        Only regular files with the ``.pkl`` extension written by ``save`` are
        indexed; other entries (sub-directories, dotfiles, notes) are ignored.
        """
        for file_name in os.listdir(self.model_path):
            component_name, extension = os.path.splitext(file_name)
            file_path = os.path.join(self.model_path, file_name)
            if extension != '.pkl' or not os.path.isfile(file_path):
                continue
            self.components[component_name] = (file_path, False)

    def save(self, item, value):
        """Pickle ``value`` to ``<item>.pkl`` and keep it cached in memory.

        Args:
            item: component name (used as the file stem).
            value: any picklable object.
        """
        file_name = os.path.join(self.model_path, item + '.pkl')
        with open(file_name, 'wb') as f:
            pickle.dump(value, f)
        # Cache only after the write succeeded, so a failed dump does not leave
        # a component marked as saved.
        self.components[item] = (value, True)

    def load(self, item):
        """Return the component called ``item``, unpickling it on first access.

        Raises:
            KeyError: if no such component is registered for this model.
        """
        if item not in self.components:
            raise KeyError(f"No Such Component={item} in the Model.")

        payload, loaded = self.components[item]
        if not loaded:
            # SECURITY: pickle.load can execute arbitrary code; only open model
            # directories that were written by trusted code.
            with open(payload, 'rb') as f:
                payload = pickle.load(f)
            self.components[item] = (payload, True)
        return payload
    
    

In [57]:
# Open a model under the existing 'models' directory (Model raises if that
# directory is missing). No model_name is given, so Model's timestamp-based
# default name is used.
seg_model = Model('models')

In [58]:
# Smoke test of Model.save: each call pickles the value to '<name>.pkl' inside
# the model's directory.
seg_model.save('some_numbers', list(range(10)))
seg_model.save('my_details', {'name':'Ganesh', 'age':29})

In [63]:
# Re-open a model by name; components already on disk are indexed and only
# unpickled when load() is called.
# NOTE(review): '1594058166' is hard-coded — presumably the timestamp name that an
# earlier run of the cell above produced. Confirm the directory exists before
# running this on a fresh checkout.
seg_model = Model('models', model_name='1594058166')

In [65]:
# Round-trip check: returns the list pickled under 'some_numbers'.
seg_model.load('some_numbers')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [33]:
def should_join(a, b, seq_state, theta=0.1):
    """Decide whether unit ``b`` should be merged onto the preceding unit ``a``.

    The score is ``sequences[a][b] / sequence_counts[a]`` when the pair
    ``(a, b)`` has been recorded. Otherwise it backs off to
    ``sequence_counts[a] / num_chars``.

    Args:
        a: unit currently on top of the segmentation stack.
        b: next unit in the line.
        seq_state: object exposing ``sequence_counts``, ``sequences`` and
            ``num_chars`` (see ``SequenceState``).
        theta: threshold the score must strictly exceed.

    Returns:
        True if the score is greater than ``theta``. False otherwise, including
        when ``a`` has never been seen or every usable denominator is zero.
    """
    count_a = seq_state.sequence_counts.get(a)
    if count_a is None:
        # ``a`` was never the left element of a pair: nothing to base a join on.
        return False

    followers = seq_state.sequences.get(a, {})
    try:
        score = followers[b] / count_a
    except (KeyError, ZeroDivisionError):
        # Pair unseen (or zero denominator): back off to the share of ``a``
        # among all recorded pairs.
        try:
            score = count_a / seq_state.num_chars
        except ZeroDivisionError:
            return False

    return score > theta


In [67]:
class MDLRunner:
    """Sweeps the join threshold ``theta`` and scores each resulting segmentation.

    For every threshold the runner rebuilds a ``SequenceState`` from the
    training data, converts it to negative log frequencies, scores the
    validation data, and saves the state through ``model``.

    NOTE(review): ``run`` calls ``normalize_probs()`` before ``get_mdl_cost()``.
    The segmentation performed during evaluation (``run_sentence`` →
    ``should_join``) therefore reads negative-log costs where ``should_join``
    divides them as if they were raw counts. Confirm this is intended.
    """

    def __init__(self, training_dataset, val_dataset, model, theta=0.1, inc=0.05):
        """
        Args:
            training_dataset: iterable of text lines that also exposes ``.rows``.
            val_dataset: same kind of object, used for scoring.
            model: object with a ``save(name, value)`` method (see ``Model``).
            theta: first join threshold to try.
            inc: amount added to ``theta`` after each sweep step.
        """
        self.theta = theta
        self.inc = inc
        self.seq_state = SequenceState()
        self.training_dataset = training_dataset
        self.val_dataset = val_dataset
        # theta -> score recorded by run() (last element of get_mdl_cost()).
        self.scores = {}
        self.model = model

    def evaluate(self, eval_file):
        """Placeholder; not implemented."""
        pass

    def run_sentence(self, line):
        """Greedily segment ``line`` into units using the current state and theta.

        Each element is merged onto the previous unit when ``should_join``
        says so, otherwise it starts a new unit.

        Returns:
            List of units; ``[]`` for an empty line.
        """
        if not line:
            return []
        stack = [line[0]]
        for current in line[1:]:
            if should_join(stack[-1], current, self.seq_state, self.theta):
                stack[-1] += current
            else:
                stack.append(current)
        return stack

    def run(self, times):
        """Run ``times`` sweep steps, advancing ``theta`` by ``inc`` after each.

        Each step records its score in ``self.scores[theta]`` and saves
        ``[seq_state, mdl_cost]`` as the component ``mdl_<theta>``.
        """
        for _ in range(times):
            self.do_reset()
            print(f"Running with theta={self.theta}")
            self.run_sequences()
            self.seq_state.normalize_probs()
            mdl_cost = self.get_mdl_cost()
            print(f"MDL cost : {mdl_cost}")
            self.scores[self.theta] = mdl_cost[-1]
            self.model.save(f'mdl_{self.theta}', [self.seq_state, mdl_cost])
            # Round away accumulated binary float error so score keys and saved
            # component names stay clean (0.15, not 0.15000000000000002).
            self.theta = round(self.theta + self.inc, 10)

    def do_reset(self):
        """Discard all collected statistics."""
        self.seq_state = SequenceState()

    def run_sequences(self):
        """Segment every non-empty training line and count its adjacent units.

        Lines are stripped and spaces are replaced by '@' before segmentation.
        """
        for line in tqdm(self.training_dataset, total=len(self.training_dataset.rows)):
            line = line.strip().replace(' ', '@')
            if len(line) > 0:
                split = self.run_sentence(line)
                self.seq_state.add_split_to_sequences(split)

    def eval_sequence(self, seq):
        """Sum the stored values along ``seq``.

        Adds ``sequence_counts[seq[0]]`` plus ``sequences[prev][cur]`` for
        every adjacent pair. Entries that are not present contribute 0.

        NOTE(review): unseen units and transitions therefore cost nothing,
        which favours segmentations full of unknown units. Confirm whether a
        penalty is wanted.

        Returns:
            The accumulated value; 0 for an empty sequence.
        """
        if not seq:
            return 0
        counts = self.seq_state.sequence_counts
        transitions = self.seq_state.sequences
        total = counts.get(seq[0], 0)
        for prev, cur in zip(seq, seq[1:]):
            total += transitions.get(prev, {}).get(cur, 0)
        return total

    def eval_sequences(self, eval_dataset):
        """Average ``eval_sequence`` score over ``eval_dataset``.

        The sum over non-empty lines is divided by ``len(eval_dataset.rows)``,
        i.e. by all rows including empty ones.

        Raises:
            ZeroDivisionError: if the dataset has no rows.
        """
        tot_prob = 0
        total = len(eval_dataset.rows)
        for line in tqdm(eval_dataset, total=total):
            line = line.strip().replace(' ', '@')
            if len(line) > 0:
                split = self.run_sentence(line)
                tot_prob += self.eval_sequence(split)
        return tot_prob / total

    def best_model(self):
        """Return the ``(theta, score)`` pair with the lowest recorded score.

        The result is also stored on ``self.best``.

        Raises:
            IndexError: if no scores have been recorded yet.
        """
        if not self.scores:
            raise IndexError("no scores recorded yet; call run() first")
        # min() returns the first minimal entry, the same pair a stable sort
        # would put first.
        self.best = min(self.scores.items(), key=lambda entry: entry[1])
        return self.best

    def get_mdl_cost(self):
        """Score the current state on the validation data.

        Returns:
            ``(data_cost, model_size, data_cost)`` where ``data_cost`` is the
            ``eval_sequences`` average and ``model_size`` is the number of
            distinct adjacent pairs stored in the state.

        NOTE(review): the third element repeats the first, and it is the one
        ``run`` records as the score. Presumably it was meant to combine data
        cost and model size — confirm.
        """
        prob = self.eval_sequences(self.val_dataset)
        model_size = sum(len(followers) for followers in self.seq_state.sequences.values())
        return prob, model_size, prob

In [68]:
# Named model directory that receives one 'mdl_<theta>' component per sweep step
# of MDLRunner.run.
mdl_model = Model('models', 'mdl_model_eng_ukwac_10L')

In [None]:
# Sweep the join threshold: five steps starting at the runner's initial theta
# (0.1). Each step retrains on `training`, scores on `validation`, and saves its
# state into `mdl_model`.
mdl_runner = MDLRunner(training, validation, mdl_model)
# Same value the runner already starts with; set explicitly to make the step
# size of the sweep visible.
mdl_runner.inc = 0.05
mdl_runner.run(times=5)
#mdl_runner.best_model()

Running with theta=0.1


HBox(children=(FloatProgress(value=0.0, max=700000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


MDL cost : (7.012857343147176, 567229, 7.012857343147176)
Running with theta=0.15000000000000002


HBox(children=(FloatProgress(value=0.0, max=700000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


MDL cost : (5.937478417456796, 157143, 5.937478417456796)
Running with theta=0.2


HBox(children=(FloatProgress(value=0.0, max=700000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


MDL cost : (5.8212130376330355, 56621, 5.8212130376330355)
Running with theta=0.25


HBox(children=(FloatProgress(value=0.0, max=700000.0), HTML(value='')))

In [36]:
# Show how the two best-scoring thresholds segment one held-out line.
line = testing[0]
print(line)
# Lowest score first.
sorted_scores = sorted(mdl_runner.scores.items(), key = lambda x : x[1])
for theta, eval_score in sorted_scores[:2]:
    print(f"Running with theta={theta} which has an eval_score={eval_score}")
    mdl_runner.theta = theta
    # run() resets the state, retrains on the full training set, re-scores on
    # validation, overwrites scores[theta] and the saved component, and then
    # advances mdl_runner.theta by inc. The state left behind is the normalised
    # one, and it is what run_sentence uses on the next line.
    mdl_runner.run(times=1)
    print(mdl_runner.run_sentence(line))

inees will usually be at E grade or above - A medical prescriber willing to contribute to and supervise the nurse 's 12/13 day
Running with theta=0.25 which has an eval_score=5.241017240638504
Running with theta=0.25


HBox(children=(FloatProgress(value=0.0, max=700000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


MDL cost : (5.241017240638504, 29659, 5.241017240638504)
['ine', 'es', ' ', 'wi', 'll', ' ', 'us', 'ua', 'll', 'y', ' ', 'be', ' ', 'at', ' ', 'E', ' ', 'gr', 'ad', 'e', ' ', 'or', ' ', 'ab', 'ov', 'e', ' ', '-', ' ', 'A', ' ', 'me', 'di', 'ca', 'l', ' ', 'pr', 'es', 'cr', 'ib', 'er', ' ', 'wi', 'll', 'in', 'g', ' ', 'to', ' ', 'co', 'nt', 'ri', 'bu', 'te', ' ', 'to', ' ', 'an', 'd', ' ', 'su', 'pe', 'rv', 'is', 'e', ' ', 'th', 'e', ' ', 'nu', 'rs', 'e', ' ', "'", 's', ' ', '12', '/1', '3', ' ', 'da', 'y']
Running with theta=0.3 which has an eval_score=5.61046574164318
Running with theta=0.3


HBox(children=(FloatProgress(value=0.0, max=700000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


MDL cost : (5.61046574164318, 21235, 5.61046574164318)
['in', 'ee', 's', ' ', 'wi', 'll', ' ', 'us', 'ua', 'll', 'y', ' ', 'b', 'e', ' ', 'at', ' ', 'E', ' ', 'gr', 'ad', 'e', ' ', 'or', ' ', 'ab', 'ov', 'e', ' ', '-', ' ', 'A', ' ', 'm', 'ed', 'ic', 'al', ' ', 'pr', 'es', 'cr', 'ib', 'er', ' ', 'wi', 'll', 'in', 'g', ' ', 'to', ' ', 'co', 'nt', 'ri', 'bu', 'te', ' ', 'to', ' ', 'an', 'd', ' ', 'su', 'pe', 'rv', 'is', 'e', ' ', 'th', 'e', ' ', 'nu', 'rs', 'e', ' ', "'", 's', ' ', '12', '/1', '3', ' ', 'da', 'y']


In [13]:
# Compare scoring the word as seven single-character units (a) with scoring it
# as one unsegmented unit (b). eval_sequence adds 0 for anything it has no entry
# for, so a single unit that is absent from sequence_counts scores 0.
a = mdl_runner.eval_sequence(list('winnAru'))
b = mdl_runner.eval_sequence(['winnAru'])
print(a,b)
#mdl_runner.seq_state.sequences['c']

16.50114689111918 0


In [15]:
# Reference point: the negative natural log of a probability of 0.001.
-math.log(0.001)

6.907755278982137

In [30]:
# Multi-character units ranked by ascending value in sequence_counts (after
# normalisation these values are negative log frequencies, so the most frequent
# units come first).
# Alternative kept for reference — total follower mass of units with 3+ characters:
#sorted({ k : sum(v.values())  for k, v  in mdl_runner.seq_state.sequences.items() if len(k) >= 3}.items(), key = lambda x : x[1], reverse = True)
sorted(
    (
        (unit, value)
        for unit, value in mdl_runner.seq_state.sequence_counts.items()
        if len(unit) > 1
    ),
    key=lambda pair: pair[1],
)

[(',@', 4.9351380080344205),
 ('y@', 5.08415547369465),
 ('.\n', 5.395311980039943),
 ('d@', 5.614434038691709),
 ('ve', 5.958959565986072),
 (')@', 6.73172735708251),
 ('(@', 6.8000202564755465),
 (':@', 7.001847132816082),
 ('"@', 7.070991507977287),
 ('qu', 7.151686785430531),
 (';@', 7.914844131563125),
 ('?\n', 8.004153561564205),
 ('Th', 8.157555458129222),
 ('The@', 8.304434734612006),
 ('!\n', 8.321484327938116),
 ('The', 8.825571973963589),
 ('[@', 9.02259967814724),
 (']@', 9.045072533999297),
 ('|@', 9.104050784556232),
 ('%@', 9.261421042180679),
 ('&@', 9.424072764985866),
 ('6@', 10.60140062111945),
 ('__', 10.632037590581339),
 ('Qu', 10.696279859309692),
 ('You', 10.9064744362831),
 ('$@', 11.024257471939483),
 ('`@', 11.150298192834848),
 ('**', 11.180603542330177),
 ('*@', 11.219824255483458),
 ('”@', 11.219824255483458),
 ('5@', 11.285964057988004),
 ('}@', 11.384698898673692),
 ('–@', 11.4330174759445),
 ('You@', 11.442967806797668),
 ('’s@', 11.59379069653225),
 ('