In [1]:
import pandas as pd
import numpy
from tqdm.notebook import trange, tqdm
import random
import math
import numpy
from dataset import Dataset
import pickle
import os
%load_ext line_profiler

In [2]:
data_file = 'data/ukwac-1L.txt'
dataset = Dataset(data_file)
training, validation, testing = dataset.partition((0.7, 0.1, 0.2), shuffle=True)

Index file not found, creating now....
3.128654956817627 Seconds
Index file found.
Index file found.
Index file found.


In [8]:
class SequenceState:
    def __init__(self):
        self.sequences = {}
        self.sequence_counts = {}
        self.num_chars = 0
    
    def normalize_probs(self):
        N = sum(self.sequence_counts.values())
        self.sequence_counts = {k:-math.log(v/N) for k, v in self.sequence_counts.items()}
        sq = {}
        for k, v in self.sequences.items():
            T = sum(v.values())
            sq[k] = {x:-math.log(y/T) for x,y in v.items()}
        self.sequences = sq
    
    def add_split_to_sequences(self, split):
        for p, c in zip(split, split[1:]):
            #print(p , c)
            try:
                self.sequences[p][c] += 1
                self.sequence_counts[p] += 1
            except:
                #print(f"{c} not in {p}")
                try:
                    self.sequences[p][c] = 1
                    self.sequence_counts[p] += 1
                except:
                 #   print(f"{p} not exists")
                    self.sequences[p] = {c: 1}
                    self.sequence_counts[p] = 1
            self.num_chars += 1
    

In [3]:
from datetime import datetime
timestamp_now = lambda : str(int(datetime.now().timestamp()))
class Model:
    def __init__(self, models_dir, model_name = timestamp_now()):
        if not os.path.exists(models_dir):
            raise Exception("Models Directory does not exist")
            
        self.model_name = model_name
        self.model_path = os.path.join(models_dir, model_name)
        
        
        self.components = {}
        
        if self.model_dir_exists():
            self.load_component_index()
        else:
            self.create_model_dir()
            
    def model_dir_exists(self):
        if not os.path.exists(self.model_path):
            return False
        return True
    
    def create_model_dir(self):
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
            
    def load_component_index(self):
        for file_name in os.listdir(self.model_path):
            component_name = '.'.join(file_name.split('.')[:-1])
            self.components[component_name] = (os.path.join(self.model_path, file_name), False)
                     
    
    def save(self, item, value):
        self.components[item] = (value, True)
        file_name = os.path.join(self.model_path, item + '.pkl')
        with open(file_name, 'wb') as f:
            pickle.dump(value, f)
        
    
    def load(self, item):
        if item in self.components:
            loaded = self.components[item][1]
            if not loaded:
                file_name = self.components[item][0]
                with open(file_name,'rb') as f:
                    self.components[item] = (pickle.load(f), True)
            return self.components[item][0]
        else:
            raise Exception(f"No Such Component={item} in the Model.")
        
        #return self.components[item]
    
    

In [57]:
seg_model = Model('models')

In [58]:
seg_model.save('some_numbers', list(range(10)))
seg_model.save('my_details', {'name':'Ganesh', 'age':29})

In [63]:
seg_model = Model('models', model_name='1594058166')

In [65]:
seg_model.load('some_numbers')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [4]:
def should_join(a, b, seq_state, theta=0.1):
    try:
        num_chars = seq_state.sequence_counts[a]
        total_chars = seq_state.num_chars
        
        try:
            prob_ab = seq_state.sequences[a][b] / num_chars
        except:
            n_distinct = len(seq_state.sequences[a])
            
            # What is the estimated probability that b follows a ?. Think of prob mass of a = a/N distributed
            # across existing n_distinct chars which follow a.
            prob_ab = seq_state.sequence_counts[a] / (total_chars * n_distinct)
        
        #print(a, b, prob_ab, theta, seq_state.sequences[a])
        if prob_ab > theta:
            return True
    except Exception as e:
        #print(a, b)
        pass
    return False


In [114]:
class MDLRunner:
    def __init__(self, training_dataset,  val_dataset, tokeniser, model):
        self.theta = 0.1
        self.inc = 0.05
        self.seq_state = SequenceState()
        self.training_dataset = training_dataset
        self.val_dataset = val_dataset
        self.scores = {}
        self.model = model
        self.tokeniser=tokeniser
    
    def evaluate(self, eval_file):
        pass

    def run_sentence(self, line):
        stack = [line[0]]
        for i in range(1, len(line)):
            p,c = stack[-1], line[i]
            #print(p, c)
            s = should_join(p, c, self.seq_state, self.theta)         
            if s:
                stack[-1] += c
            else:
                stack.append(c)
        return stack

    def run(self, times, reset=True):
        for i in range(times):
            if reset:
                self.do_reset()
            print(f"Running with theta={self.theta}")
            self.run_sequences()
            self.seq_state.normalize_probs()
            mdl_cost = self.get_mdl_cost()
            print(f"MDL cost : {mdl_cost}")
            self.scores[self.theta] = mdl_cost[-1]
            self.model.save(f'mdl_{self.theta}', [self.seq_state, mdl_cost])
            self.theta += self.inc
            
                
    def do_reset(self):
        self.seq_state = SequenceState()
        
        
    def run_sequences(self):
        
        for line in tqdm(self.training_dataset, total=len(self.training_dataset.rows)):
            #line = line.strip().replace(' ', '@')
            #line = line.split('')
            print(line)
            line = self.tokeniser(line)
            print(line)
            #line = line.replace(' ', '')
            if len(line) > 0:
                #print(line)
                #line = self.run_sentence(line)
                split = self.run_sentence(line)                
                #print(split)
                self.seq_state.add_split_to_sequences(split)
                
        
    def eval_sequence(self, seq):
        tot_prob = 0
        if seq[0] in self.seq_state.sequence_counts:
            tot_prob = self.seq_state.sequence_counts[seq[0]]        
        #print(seq[0], tot_prob)
        for i in range(1, len(seq)):
            
            if (seq[i-1] in self.seq_state.sequences) and (seq[i] in self.seq_state.sequences[seq[i-1]]):
                tot_prob += self.seq_state.sequences[seq[i-1]][seq[i]]
                #print(seq[i-1], seq[i],tot_prob)
        return tot_prob
    
    def eval_sequences(self, eval_dataset):
        tot_prob = 0
        total = len(eval_dataset.rows)
        for line in tqdm(eval_dataset, total=total):
            #line = line.strip().replace(' ', '@')
            #line = line.split('')
            #line = self.run_sentence(line)
            line = self.tokeniser(line)
            if len(line) > 0:
                split = self.run_sentence(line)
                #print(line, split)
                
                tot_prob += self.eval_sequence(list(split))
        return tot_prob / total
    
    def best_model(self):
        sorted_scores = sorted(self.scores.items(), key = lambda x : x[1])
        self.best = (sorted_scores[0])
        return self.best
    
    def get_mdl_cost(self):
        prob = self.eval_sequences(self.val_dataset)
        s = [len(v) for k, v in self.seq_state.sequences.items()]
        s = sum(s)
        return prob, s, prob
    
    
    #def split_with(self, )

In [118]:
class SplitLearner:
    def __init__(self, epochs, data_file, model_params):
        self.data_file = data_file
        self.model_dir, self.model_prefix = model_params
        self.epochs = epochs
        self.best_model = {}
    
    def save_dataset(self, epoch, dataset, model_file_name):
        save_file = f"{self.model_dir}/{self.model_prefix}_epoch_{epoch}.dataset" 
        model = Model(self.model_dir, model_file_name)
        mdl_runner = MDLRunner(None, None, None, model = model)
        theta = self.best_model[epoch]
        mdl_runner.seq_state, score = model.load(f"mdl_{theta}")
        print(f"Saving epoch={epoch} with best_model={theta}")
        if os.path.exists(save_file):
            os.remove(save_file)
        with open(save_file, 'w') as s:
            for line in tqdm(dataset, total=len(dataset.rows)):
                line = line.strip()
                split_line = mdl_runner.run_sentence(line)
                line = ' '.join(split_line)
                s.write(line + '\n')
        return save_file
                
    
    def get_tokeniser(self, mdl_runner):
        def tokenise(sentence):
            se
            return [x for x in mdl_runner.run_sentence(sentence) if len(x) > 0]
        return tokenise
    
    def run(self):
        current_file = self.data_file
        current_tok = lambda x : x
        for i in range(self.epochs):
            print(f"Running epoch={i+1}/{self.epochs}")
            model_file_name = self.model_prefix + "_epoch_" + str(i+1)
            mdl_model = Model(self.model_dir, model_file_name)
        
            dataset = Dataset(current_file)
            training, validation, testing = dataset.partition((0.7, 0.1, 0.2), shuffle=True)
            mdl_runner = MDLRunner(training, validation,  tokeniser = current_tok, model = mdl_model)
            mdl_runner.theta = 0.25
            mdl_runner.run(times=2)
            theta, score = mdl_runner.best_model()
            self.best_model[i] = theta
            current_file =self.save_dataset(i, dataset, model_file_name)
            mdl_runner.seq_state, score = mdl_model.load(f"mdl_{theta}")

            #current_tok = self.get_tokeniser(mdl_runner)
            current_tok = lambda x : x.split(' ')


In [119]:
spl_learner = SplitLearner(2, 'data/test.txt', ('models', 'mdl_model_test'))

In [120]:
spl_learner.run()

Running epoch=1/2
Index file found.
Index file found.
Index file found.
Index file found.
Running with theta=0.25


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

cats are nice and they are sat on the mat

cats are nice and they are sat on the mat

I like the apples and oranges

I like the apples and oranges

the cat sat on the mat

the cat sat on the mat

the apples are on the mats.

the apples are on the mats.

Hi !

Hi !

I like the apples and oranges

I like the apples and oranges

the cat sat on the mat

the cat sat on the mat

Hi !

Hi !

the cat sat on the mat

the cat sat on the mat

My name is Ganesh.

My name is Ganesh.

I like the apples and oranges

I like the apples and oranges

Hi !

Hi !

the cat sat on the mat

the cat sat on the mat

I like the apples and oranges

I like the apples and oranges

cats are nice and they are sat on the mat

cats are nice and they are sat on the mat

cats are nice and they are sat on the mat

cats are nice and they are sat on the mat

My name is Ganesh.

My name is Ganesh.

the apples are on the mats.

the apples are on the mats.

My name is Ganesh.

My name is Ganesh.

cats are nice and they are sat

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


MDL cost : (5.941284969881812, 172, 5.941284969881812)
Running with theta=0.3


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

cats are nice and they are sat on the mat

cats are nice and they are sat on the mat

I like the apples and oranges

I like the apples and oranges

the cat sat on the mat

the cat sat on the mat

the apples are on the mats.

the apples are on the mats.

Hi !

Hi !

I like the apples and oranges

I like the apples and oranges

the cat sat on the mat

the cat sat on the mat

Hi !

Hi !

the cat sat on the mat

the cat sat on the mat

My name is Ganesh.

My name is Ganesh.

I like the apples and oranges

I like the apples and oranges

Hi !

Hi !

the cat sat on the mat

the cat sat on the mat

I like the apples and oranges

I like the apples and oranges

cats are nice and they are sat on the mat

cats are nice and they are sat on the mat

cats are nice and they are sat on the mat

cats are nice and they are sat on the mat

My name is Ganesh.

My name is Ganesh.

the apples are on the mats.

the apples are on the mats.

My name is Ganesh.

My name is Ganesh.

cats are nice and they are sat

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


MDL cost : (5.963344687946544, 171, 5.963344687946544)
Saving epoch=0 with best_model=0.25


HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))


Running epoch=2/2
Index file found.
Index file found.
Index file found.
Index file found.
Running with theta=0.25


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

n  t he m at s.
H i  !
I  l 
['n', '', 't', 'he', 'm', 'at', 's.\nH', 'i', '', '!\nI', '', 'l', '']
k e t h
['k', 'e', 't', 'h']
!
M y  n am e i s Ga ne sh .
th e  ca t  s
['!\nM', 'y', '', 'n', 'am', 'e', 'i', 's', 'Ga', 'ne', 'sh', '.\nth', 'e', '', 'ca', 't', '', 's']
a nd  t
['a', 'nd', '', 't']
e a pp les  a nd  o ra ng es
H i  !
th 
['e', 'a', 'pp', 'les', '', 'a', 'nd', '', 'o', 'ra', 'ng', 'es\nH', 'i', '', '!\nth', '']
 he m at
M y  n am e i s Ga ne sh .
I  l i
['', 'he', 'm', 'at\nM', 'y', '', 'n', 'am', 'e', 'i', 's', 'Ga', 'ne', 'sh', '.\nI', '', 'l', 'i']
s  a r e  ni ce  a nd  t he y  a r e  sa t  on  t he m at
H 
['s', '', 'a', 'r', 'e', '', 'ni', 'ce', '', 'a', 'nd', '', 't', 'he', 'y', '', 'a', 'r', 'e', '', 'sa', 't', '', 'on', '', 't', 'he', 'm', 'at\nH', '']
ts  a r e  ni ce  a nd  t he
['ts', '', 'a', 'r', 'e', '', 'ni', 'ce', '', 'a', 'nd', '', 't', 'he']
 he y  
['', 'he', 'y', '', '']
es  a r e  on  t he m at s.
ca t
['es', '', 'a', 'r', 'e', '', 'on', '', 't', 

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


MDL cost : (4.463926314372593, 150, 4.463926314372593)
Running with theta=0.3


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

n  t he m at s.
H i  !
I  l 
['n', '', 't', 'he', 'm', 'at', 's.\nH', 'i', '', '!\nI', '', 'l', '']
k e t h
['k', 'e', 't', 'h']
!
M y  n am e i s Ga ne sh .
th e  ca t  s
['!\nM', 'y', '', 'n', 'am', 'e', 'i', 's', 'Ga', 'ne', 'sh', '.\nth', 'e', '', 'ca', 't', '', 's']
a nd  t
['a', 'nd', '', 't']
e a pp les  a nd  o ra ng es
H i  !
th 
['e', 'a', 'pp', 'les', '', 'a', 'nd', '', 'o', 'ra', 'ng', 'es\nH', 'i', '', '!\nth', '']
 he m at
M y  n am e i s Ga ne sh .
I  l i
['', 'he', 'm', 'at\nM', 'y', '', 'n', 'am', 'e', 'i', 's', 'Ga', 'ne', 'sh', '.\nI', '', 'l', 'i']
s  a r e  ni ce  a nd  t he y  a r e  sa t  on  t he m at
H 
['s', '', 'a', 'r', 'e', '', 'ni', 'ce', '', 'a', 'nd', '', 't', 'he', 'y', '', 'a', 'r', 'e', '', 'sa', 't', '', 'on', '', 't', 'he', 'm', 'at\nH', '']
ts  a r e  ni ce  a nd  t he
['ts', '', 'a', 'r', 'e', '', 'ni', 'ce', '', 'a', 'nd', '', 't', 'he']
 he y  
['', 'he', 'y', '', '']
es  a r e  on  t he m at s.
ca t
['es', '', 'a', 'r', 'e', '', 'on', '', 't', 

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


MDL cost : (6.108462204363202, 150, 6.108462204363202)
Saving epoch=1 with best_model=0.25


HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [131]:
mdl_model = Model('models', 'mdl_model_test_epoch_2')

In [26]:
mdl_runner = MDLRunner(training, validation,  model = mdl_model)
#mdl_runner.inc = 0.05
mdl_runner.theta = 0.25
mdl_runner.run(times=1, epochs=1)
#mdl_runner.best_model()

Running with theta=0.25 epoch=1/1


HBox(children=(FloatProgress(value=0.0, max=70000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


MDL cost : (1.94509805027791, 16577, 1.94509805027791)


In [59]:
mdl_model.components

{'mdl_0.25': ([<__main__.SequenceState at 0x7f0e73937f28>,
   (6.939929530524779, 166, 6.939929530524779)],
  True),
 'mdl_0.3': ('models/mdl_model_test_epoch_1/mdl_0.3.pkl', False)}

In [143]:
r = random.randint(1,100)
line = testing[r]
line = "the cat sat on the mat"
print(line)
mdl_runner = MDLRunner(None, None, None, None)
for epoch, theta in spl_learner.best_model.items():
    print(f"Running with epoch={epoch} theta={theta}")
    mdl_model = Model('models', spl_learner.model_prefix + '_epoch_' + str(epoch+1))
    mdl_runner.seq_state, score = mdl_model.load(f"mdl_{theta}")
    #mdl_runner.run(times=1)
    #line = line.strip().replace(' ', '')
    if type(line) == list:
        line = ' '.join(line)
        line = [x for x in line.split(' ') if len(x) > 0]
    print(line)
    
    split_line = mdl_runner.run_sentence(line)
    print(split_line)
    print(' '.join(split_line))
    line = split_line

the cat sat on the mat
Running with epoch=0 theta=0.25
the cat sat on the mat
['th', 'e ', 'ca', 't ', 'sa', 't ', 'on', ' t', 'he m', 'at']
th e  ca t  sa t  on  t he m at
Running with epoch=1 theta=0.25
['th', 'e', 'ca', 't', 'sa', 't', 'on', 't', 'he', 'm', 'at']
['th', 'e', 'ca', 't', 'sa', 'ton', 'themat']
th e ca t sa ton themat


In [125]:
mdl_model_2 = Model('models', 'mdl_model_eng_ukwac_1L_split_1')
mdl_runner = MDLRunner(training, validation, tokeniser = lambda x : x, model = mdl_model)
#mdl_runner.inc = 0.05
mdl_runner.theta = 0.25
mdl_runner.run(times=1, epochs=1)


TypeError: run() got an unexpected keyword argument 'epochs'

In [158]:
theta, _ = sorted_scores[0]
os.remove('data/ukwac-1L.split_1.txt')
with open('data/ukwac-1L.split_1.txt', 'w') as f:
    for line in tqdm(dataset, total = len(dataset.rows)):
        #print(f"Running with theta={theta} which has an eval_score={eval_score}")
        line = line.strip()
        if len(line) > 0:
            mdl_runner.seq_state, score = mdl_model.load(f"mdl_{theta}")
            line = line.strip().replace(' ', '@')
            split_line = mdl_runner.run_sentence(line)
            split_line = ' '.join(split_line).replace('@',' ')
            f.write(split_line + '\n')

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [137]:
a = mdl_runner.eval_sequence(list('winnAru'))
b = mdl_runner.eval_sequence(['winnAru'])
print(a,b)
#mdl_runner.seq_state.sequences['c']

0 0


In [15]:
-math.log(0.001)

6.907755278982137

In [18]:
#sorted({ k : sum(v.values())  for k, v  in mdl_runner.seq_state.sequences.items() if len(k) >= 3}.items(), key = lambda x : x[1], reverse = True)
sorted({ k: v for k, v in mdl_runner.seq_state.sequence_counts.items() if len(k) > 3}.items(), key = lambda x : x[1])

[('ject', 8.4674505550976),
 ('just', 8.944582333077117),
 ('This', 9.478428607833488),
 ('quire', 9.754759820616579),
 ('quality', 10.073778045117306),
 ('quest', 10.202882693018477),
 ('question', 10.655372739771504),
 ('ment', 10.700835113848262),
 ('quired', 10.706666034159056),
 ('ques', 10.712531153611454),
 ('qual', 10.766925225677252),
 ('York', 10.950489790710805),
 ('Queen', 11.57166347164566),
 ('quir', 11.5856497136204),
 ('questions', 11.737455726488404),
 ('quirements', 11.787886580115297),
 ('quent', 11.956509292551088),
 ('qualified', 12.041667100891395),
 ('qualification', 12.110659972378347),
 ('quiry', 12.134757523957408),
 ('quiries', 12.159450136547779),
 ('quence', 12.18476794453207),
 ('qualifications', 12.38259368786199),
 ('quare', 12.44713220899956),
 ('Question', 12.481033760675242),
 ('quently', 12.516125080486512),
 ('quar', 12.552492724657386),
 ('quarter', 12.590233052640233),
 ('Quality', 12.590233052640233),
 ('quirement', 12.67027576031377),
 ('qualiti

### 