In [7]:
# Example exercise is credited to our instructor, Dr. Cassani.
# import the script where I implemented the algorithms
import BasicCL.basics as cl 
import pandas as pd
import numpy as np

# Minimum edit distance

Addressing the problem: find the minimum edit distance between two strings.

**Example 1**

In [8]:
# Source and target strings for the first example
str_A, str_B = "intention", "execution"

# Distance table plus the row/column indices of the trace; a substitution costs 2
D, row_trace_i, col_trace_i = cl.get_min_edit_distance_table(
    str_A,
    str_B,
    count_substitution=2,
)

In [9]:
# Minimum edit distance between the two strings when a substitution is counted twice
cl.get_min_edit_distance(
    str_A,
    str_B,
    count_substitution=2,
)

8

In [10]:
# Show D with "#" (the empty prefix) followed by the characters of each string as labels;
# blanks are skipped when building the labels
row_labels = [ch for ch in "#" + str_A if ch != " "]
col_labels = [ch for ch in "#" + str_B if ch != " "]
pd.DataFrame(D, index=row_labels, columns=col_labels)

Unnamed: 0,#,e,x,e.1,c,u,t,i,o,n
#,0,1,2,3,4,5,6,7,8,9
i,1,2,3,4,5,6,7,6,7,8
n,2,3,4,5,6,7,8,7,8,7
t,3,4,5,6,7,8,7,8,9,8
e,4,3,4,5,6,7,8,9,10,9
n,5,4,5,6,7,8,9,10,11,10
t,6,5,6,7,8,9,8,9,10,11
i,7,6,7,8,9,10,9,8,9,10
o,8,7,8,9,10,11,10,9,8,9
n,9,8,9,10,11,12,11,10,9,8


In [12]:
# Keep only the cells of D that lie on the trace leading to the minimum edit distance (8)
masked_D = cl.get_masked_distance_table(
    D,
    row_trace_i,
    col_trace_i,
    strA=str_A,
    strB=str_B,
)
masked_D

Unnamed: 0,#,e,x,e.1,c,u,t,i,o,n
#,-,-,-,-,-,-,-,-,-,-
i,1,-,-,-,-,-,-,-,-,-
n,-,3,-,-,-,-,-,-,-,-
t,-,-,5,-,-,-,-,-,-,-
e,-,-,-,5,6,-,-,-,-,-
n,-,-,-,-,-,8,-,-,-,-
t,-,-,-,-,-,-,8,-,-,-
i,-,-,-,-,-,-,-,8,-,-
o,-,-,-,-,-,-,-,-,8,-
n,-,-,-,-,-,-,-,-,-,8


**Example 2**

In [14]:
# Source and target strings for the second example
str_A, str_B = "plant", "mantle"

# Distance table plus the row/column indices of the trace; a substitution costs 1
D, row_trace_i, col_trace_i = cl.get_min_edit_distance_table(
    str_A,
    str_B,
    count_substitution=1,
)

In [16]:
# Minimum edit distance between the two strings when a substitution is counted once
cl.get_min_edit_distance(
    str_A,
    str_B,
    count_substitution=1,
)

4

In [17]:
# Show D with "#" (the empty prefix) followed by the characters of each string as labels;
# blanks are skipped when building the labels
row_labels = [ch for ch in "#" + str_A if ch != " "]
col_labels = [ch for ch in "#" + str_B if ch != " "]
pd.DataFrame(D, index=row_labels, columns=col_labels)

Unnamed: 0,#,m,a,n,t,l,e
#,0,1,2,3,4,5,6
p,1,1,2,3,4,5,6
l,2,2,2,3,4,4,5
a,3,3,2,3,4,5,5
n,4,4,3,2,3,4,5
t,5,5,4,3,2,3,4


In [18]:
# Keep only the cells of D that lie on the trace leading to the minimum edit distance (4)
masked_D = cl.get_masked_distance_table(
    D,
    row_trace_i,
    col_trace_i,
    strA=str_A,
    strB=str_B,
)
masked_D

Unnamed: 0,#,m,a,n,t,l,e
#,-,-,-,-,-,-,-
p,1,-,-,-,-,-,-
l,-,2,-,-,-,-,-
a,-,-,2,-,-,-,-
n,-,-,-,2,-,-,-
t,-,-,-,-,2,3,4


# Cocke-Kasami-Younger (CKY) parser

Addressing the problem: check if an input string is well-formed according to a context-free grammar (CFG). The grammar must be in Chomsky Normal Form (CNF) to be used as input to the CKY parsing algorithm. 

Every character in the input string must appear as a terminal symbol in the grammar (terminals are written as lower-case letters).

**Example**

In [19]:
def tokenize_grammar(grammar_text):
    """Flatten a textual grammar into a list of symbols.

    Rules are written one per line as ``LHS -> RHS1|RHS2``. The arrow and the
    alternative bar are treated as separators, and the text is then split on
    any run of whitespace, so blank lines, repeated spaces, and missing spaces
    around ``->`` or ``|`` do not produce empty or fused tokens.

    Parameters
    ----------
    grammar_text : str
        The grammar, one rule per line.

    Returns
    -------
    list of str
        Every left-hand and right-hand side, in reading order.
    """
    return grammar_text.replace("->", " ").replace("|", " ").split()


# a grammar is written in the following form, one rule per line
grammar = """
S -> AB|BC
A -> BA|a
B -> CC|b
C -> AB|a
"""

# output: grammar_input = ["S", "AB", "BC", "A", "BA", "a", "B", "CC", "b", "C", "AB", "a"]
grammar_input = tokenize_grammar(grammar)
grammar_input

['S', 'AB', 'BC', 'A', 'BA', 'a', 'B', 'CC', 'b', 'C', 'AB', 'a']

In [20]:
string_input = "abba"

# Build the CKY table and label its columns with a blank followed by the input characters
cky_columns = [" ", *string_input]
pd.DataFrame(cl.get_cky_table(string_input, grammar_input), columns=cky_columns)

Unnamed: 0,Unnamed: 1,a,b,b.1,a.1
0,-,"A,C","S,C",-,-
1,-,-,B,-,A
2,-,-,-,B,"A,S"
3,-,-,-,-,"A,C"
4,-,-,-,-,-


Is the string "abba" well-formed according to the grammar?

No. In this representation, the start symbol "S" must appear in the top-right cell of the table, cell (0, 4), for the string to be considered well-formed, and that cell is empty here. Other representation schemes may place that cell elsewhere, but the idea is the same.

# Hidden Markov Model with the Viterbi algorithm

Addressing the problem: part-of-speech tagging

**Example 1**

In [21]:
tags = ["Det", "Adj", "Noun", "Verb"]
unique_types = ["the", "dog", "chases", "cat", "fat"]

# Transition matrix: rows are the tags followed by #bos#, columns are the tags followed by #eos#
q = np.array(
    [
        [0.0, 0.2, 0.8, 0.0, 0.0],
        [0.0, 0.3, 0.6, 0.0, 0.1],
        [0.0, 0.0, 0.0, 0.5, 0.5],
        [0.5, 0.1, 0.2, 0.0, 0.2],
        [0.5, 0.2, 0.3, 0.0, 0.0],
    ]
)

# Emission matrix: one row per tag, one column per word type
o = pd.DataFrame(
    np.array(
        [
            [1.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0],
            [0.0, 0.5, 0.0, 0.4, 0.1],
            [0.0, 0.1, 0.8, 0.1, 0.0],
        ]
    ),
    index=tags,
    columns=unique_types,
)

# Print both matrices with their labels to give a complete idea of what they contain
transition_frame = pd.DataFrame(q, index=tags + ["#bos#"], columns=tags + ["#eos#"])
print("An example of a trasition matrix:\n")
print(transition_frame)
print("\n\nAn example of an emission matrix:\n")
print(o)

An example of a trasition matrix:

       Det  Adj  Noun  Verb  #eos#
Det    0.0  0.2   0.8   0.0    0.0
Adj    0.0  0.3   0.6   0.0    0.1
Noun   0.0  0.0   0.0   0.5    0.5
Verb   0.5  0.1   0.2   0.0    0.2
#bos#  0.5  0.2   0.3   0.0    0.0


An example of an emission matrix:

      the  dog  chases  cat  fat
Det   1.0  0.0     0.0  0.0  0.0
Adj   0.0  0.0     0.0  0.0  1.0
Noun  0.0  0.5     0.0  0.4  0.1
Verb  0.0  0.1     0.8  0.1  0.0


In [22]:
# Compute the trellis and the tag trace for the sentence below.
# NOTE(review): in the recorded trellis output, row 3 (presumably Verb) holds 0.04 at "dog",
# although q gives Det -> Verb = 0.0; 0.04 equals 0.5 * 0.8 * 0.1, i.e. the Det -> Noun
# transition. Confirm that cl.get_viberti_trellis uses the transition into each target tag.
sent = "The dog chases the fat cat"
trellis, trace = cl.get_viberti_trellis(
    state_transition=q,
    emission=o,
    sentence=sent,
)

In [23]:
# Show the trellis with one column per word, framed by the "bos" and "eos" columns
col_names = ["bos", *sent.split(" "), "eos"]
pd.DataFrame(trellis, columns=col_names)

Unnamed: 0,bos,The,dog,chases,the,fat,cat,eos
0,0.5,0.5,0.0,0.0,0.04,0.0,0.0,0.0
1,0.2,0.0,0.0,0.0,0.0,0.032,0.0,0.1
2,0.3,0.0,0.2,0.0,0.0,0.0032,0.00768,0.5
3,0.0,0.0,0.04,0.08,0.0,0.0,0.00192,0.2


In [24]:
# Pair every word of the sentence with the tag name selected by the trace
words = sent.split(" ")
[(word, tags[tag_i]) for word, tag_i in zip(words, trace)]

[('The', 'Det'),
 ('dog', 'Noun'),
 ('chases', 'Verb'),
 ('the', 'Det'),
 ('fat', 'Adj'),
 ('cat', 'Noun')]

**Example 2**

In [25]:
tags = ["A", "B", "C"]
unique_types = ["v", "w", "x", "y", "z"]

# Transition matrix. By analogy with Example 1 the rows are presumably A, B, C, #bos#
# and the columns A, B, C, #eos# — TODO confirm.
# NOTE(review): the last row sums to 1.6 rather than 1.0; verify the intended values.
q = np.array(
    [
        [0.0, 0.2, 0.8, 0.0],
        [0.2, 0.3, 0.4, 0.1],
        [0.1, 0.2, 0.2, 0.5],
        [0.7, 0.2, 0.2, 0.5],
    ]
)

# Emission matrix: one row per tag (positional, no index labels), one column per word type
o = pd.DataFrame(
    np.array(
        [
            [0.0, 0.8, 0.0, 0.2, 0.0],
            [0.4, 0.1, 0.5, 0.0, 0.0],
            [0.0, 0.0, 0.3, 0.1, 0.6],
        ]
    ),
    columns=unique_types,
)

# NOTE(review): the recorded result tags "y" as A although q[0][3] (A -> #eos#, if the
# layout matches Example 1) is 0.0 — worth checking against a hand computation.
sent = "w v y"
trellis, trace = cl.get_viberti_trellis(
    state_transition=q,
    emission=o,
    sentence=sent,
)

# Show the trellis with one column per word, framed by the "bos" and "eos" columns
col_names = ["bos", *sent.split(" "), "eos"]
pd.DataFrame(trellis, columns=col_names)

Unnamed: 0,bos,w,v,y,eos
0,0.7,0.56,0.0,0.014336,0.0
1,0.2,0.02,0.1792,0.0,0.1
2,0.2,0.0,0.0,0.007168,0.5


In [26]:
# Pair every word of the sentence with the tag name selected by the trace
words = sent.split(" ")
[(word, tags[tag_i]) for word, tag_i in zip(words, trace)]

[('w', 'A'), ('v', 'B'), ('y', 'A')]