In [25]:
import numpy as np

In [26]:
def deserialize_matrix(path):
    """Load a comma-separated matrix of floats from a text file.

    Every non-blank line of the file becomes one row. Spaces are ignored and
    empty fields (such as the one left by a trailing comma) are dropped.
    Lines that yield no values at all (blank lines, e.g. at the end of the
    file) are skipped so they cannot add an empty row and make the result
    ragged.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Array holding one row per non-blank line of the file.

    Raises
    ------
    ValueError
        If a non-empty field cannot be parsed as a float.
    """
    matrix = []
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            tokens = line.strip().replace(' ', '').split(',')
            row = [float(tok) for tok in tokens if tok != ""]
            if row:  # skip blank lines rather than appending an empty row
                matrix.append(row)
    return np.asarray(matrix)

In [27]:
input_matrix = "temp/mit_matrix_n2_k64.dat"

In [28]:
matrix = deserialize_matrix(input_matrix)

In [29]:
matrix.shape

(1599, 1599)

In [32]:
# Per-row total of the matrix entries (one aggregate value per row).
aggregated_jc = [entries.sum() for entries in matrix]

In [34]:
max(aggregated_jc)

448.19444333

In [35]:
min(aggregated_jc)

0.0

In [42]:
# Zero entries per row, derived as (number of rows - non-zero count of the row).
# Using the number of rows as the row length is valid here because the matrix
# is square.
num_zeros = [matrix.shape[0] - np.count_nonzero(entries) for entries in matrix]

In [44]:
max(num_zeros)

1599

In [46]:
min(num_zeros)

1133

In [117]:
# Mean value over the non-zero entries of each row: `len(matrix) - zeros`
# recovers the row's non-zero count from num_zeros.
# Rows without any non-zero entry are skipped. They would otherwise divide
# 0 by 0, emit a RuntimeWarning and put NaN into the list, which makes the
# results of the built-in max()/min() depend on element order.
avg_jc_per_column_norm_nonzeros = [
    aggr / (len(matrix) - zeros)
    for aggr, zeros in zip(aggregated_jc, num_zeros)
    if zeros < len(matrix)
]

  """Entry point for launching an IPython kernel.


In [118]:
max(avg_jc_per_column_norm_nonzeros)

1.0

In [119]:
min(avg_jc_per_column_norm_nonzeros)

0.06719495726666666

In [127]:
# Row aggregates divided by the total number of rows.
avg_jc_per_column = [
    total / len(matrix)
    for total in aggregated_jc
    if total > 0  # rows whose aggregate is zero are deliberately left out
]

In [128]:
max(avg_jc_per_column)

0.2802967125265791

In [129]:
min(avg_jc_per_column)

0.0006173730456535334

## Comparing JC across ngram sizes

In [52]:
im2 = "temp/mit_matrix_n2_k64.dat"
im3 = "temp/mit_matrix_n3_k64.dat"

In [53]:
m2 = deserialize_matrix(im2)
m3 = deserialize_matrix(im3)

#### Hypothesis 1: Non-zero JC values in m3 must never exceed the corresponding non-zero JC values in m2 (allowing for the 0.05 error of the lazo method)

In [54]:
m2.shape

(1599, 1599)

In [55]:
m3.shape

(1599, 1599)

In [90]:
# Count the cells that contradict the hypothesis: both values are non-zero and
# the m3 value exceeds the m2 value by more than 0.05. The 0.05 slack is the
# normal error (D value) of the lazo method, so the comparison is relaxed by it.
# Vectorised with NumPy instead of a nested Python loop over every cell; the
# shape check replaces zip()'s silent truncation on mismatched inputs.
assert m2.shape == m3.shape, "m2 and m3 must have the same shape"
fail = int(np.count_nonzero((m2 > 0) & (m3 > 0) & ((m3 - 0.05) > m2)))

In [93]:
# Fraction (not a percentage, despite the name) of failing cells among ALL
# cells of the matrix. The denominator also counts cells where either value is
# zero, which the failure check never tests.
total_els = m2.size  # taken from the data instead of the hard-coded 1599*1599
perc_fail = float(fail) / float(total_els)

In [94]:
perc_fail

0.0015887040094242767

#### About 1.6 per thousand of all matrix cells fail (n2 vs n3) --> need to understand why

In [95]:
im3 = "temp/mit_matrix_n3_k64.dat"
im4 = "temp/mit_matrix_n4_k64.dat"

In [96]:
m3 = deserialize_matrix(im3)
m4 = deserialize_matrix(im4)

In [97]:
m3.shape

(1599, 1599)

In [98]:
m4.shape

(1599, 1599)

In [99]:
# Count the cells that contradict the hypothesis: both values are non-zero and
# the m4 value exceeds the m3 value by more than 0.05 (the D value of the lazo
# method, tolerated as slack).
# Vectorised with NumPy instead of a nested Python loop over every cell; the
# shape check replaces zip()'s silent truncation on mismatched inputs.
assert m3.shape == m4.shape, "m3 and m4 must have the same shape"
fail = int(np.count_nonzero((m3 > 0) & (m4 > 0) & ((m4 - 0.05) > m3)))

In [100]:
# Fraction (not a percentage, despite the name) of failing cells among ALL
# cells of the matrix. The denominator also counts cells where either value is
# zero, which the failure check never tests.
total_els = m3.size  # taken from the data instead of the hard-coded 1599*1599
perc_fail = float(fail) / float(total_els)

In [101]:
perc_fail

0.0006867957263783924

#### About 0.7 per thousand of all matrix cells fail (n3 vs n4); the trend goes in the direction of supporting the hypothesis

In [103]:
im4 = "temp/mit_matrix_n4_k64.dat"
im5 = "temp/mit_matrix_n5_k64.dat"

In [104]:
m4 = deserialize_matrix(im4)
m5 = deserialize_matrix(im5)

In [105]:
m4.shape

(1599, 1599)

In [106]:
m5.shape

(1599, 1599)

In [107]:
# Count the cells that contradict the hypothesis: both values are non-zero and
# the m5 value exceeds the m4 value by more than 0.05 (the D value of the lazo
# method, tolerated as slack).
# Vectorised with NumPy instead of a nested Python loop over every cell; the
# shape check replaces zip()'s silent truncation on mismatched inputs.
assert m4.shape == m5.shape, "m4 and m5 must have the same shape"
fail = int(np.count_nonzero((m4 > 0) & (m5 > 0) & ((m5 - 0.05) > m4)))

In [108]:
# Fraction (not a percentage, despite the name) of failing cells among ALL
# cells of the matrix. The denominator also counts cells where either value is
# zero, which the failure check never tests.
total_els = m4.size  # taken from the data instead of the hard-coded 1599*1599
perc_fail = float(fail) / float(total_els)

In [109]:
perc_fail

0.0002718240488798307

#### About 0.3 per thousand of all matrix cells fail (n4 vs n5); the trend goes in the direction of supporting the hypothesis

In [110]:
im5 = "temp/mit_matrix_n5_k64.dat"
im6 = "temp/mit_matrix_n6_k64.dat"

In [111]:
m5 = deserialize_matrix(im5)
m6 = deserialize_matrix(im6)

In [112]:
m5.shape

(1599, 1599)

In [113]:
m6.shape

(1599, 1599)

In [114]:
# Count the cells that contradict the hypothesis: both values are non-zero and
# the m6 value exceeds the m5 value by more than 0.05 (the D value of the lazo
# method, tolerated as slack).
# Vectorised with NumPy instead of a nested Python loop over every cell; the
# shape check replaces zip()'s silent truncation on mismatched inputs.
assert m5.shape == m6.shape, "m5 and m6 must have the same shape"
fail = int(np.count_nonzero((m5 > 0) & (m6 > 0) & ((m6 - 0.05) > m5)))

In [115]:
# Fraction (not a percentage, despite the name) of failing cells among ALL
# cells of the matrix. The denominator also counts cells where either value is
# zero, which the failure check never tests.
total_els = m5.size  # taken from the data instead of the hard-coded 1599*1599
perc_fail = float(fail) / float(total_els)

In [116]:
perc_fail

0.0006957913423844875

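#### About 0.7 per thousand of all matrix cells fail (n5 vs n6) — higher than the ~0.27 per thousand measured for n4 vs n5, so the downward trend does not continue at this step; needs investigation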

## TESTING

In [152]:
# Strictly positive entries of the first row of m2, kept in their original order.
non_zero_row = list(filter(lambda value: value > 0, m2[0]))

In [153]:
len(non_zero_row)

18

In [154]:
non_zero_row

[0.9895834,
 0.16666667,
 0.48958334,
 0.5260416,
 0.21875,
 0.4635417,
 0.47916666,
 0.203125,
 0.46875,
 0.40625,
 0.3541667,
 0.5260416,
 0.16666667,
 0.17708333,
 0.16666667,
 0.5364584,
 0.40104166,
 0.15104166]

In [159]:
# Total of the selected entries.
aggr = np.asarray(non_zero_row).sum()

In [160]:
aggr

6.89062506

In [161]:
# Express every selected entry as a share of their total (aggr).
norm = [value / aggr for value in non_zero_row]

In [163]:
# Min-max rescale the normalised entries onto [0, 1]: the smallest one maps to
# 0 and the largest to 1.
# NOTE(review): if all entries were equal the range would be zero and this
# division would be undefined — confirm that case cannot occur.
minnorm, maxnorm = min(norm), max(norm)
beta_scores = [(share - minnorm) / (maxnorm - minnorm) for share in norm]

In [164]:
beta_scores

[1.0,
 0.01863355066856898,
 0.4037266886678772,
 0.44720485828171175,
 0.08074534250376136,
 0.372670822563943,
 0.391304313605188,
 0.06211180376065718,
 0.37888196239342836,
 0.30434780742101164,
 0.24223605136221368,
 0.44720485828171175,
 0.01863355066856898,
 0.03105590188032862,
 0.01863355066856898,
 0.4596273764499785,
 0.29813661988966705,
 0.0]

In [165]:
# Weight every raw entry by its beta score (element-wise product).
final_scores = [raw * weight for raw, weight in zip(non_zero_row, beta_scores)]

In [172]:
# Show each weighted score next to the raw entry it was derived from.
for scored, raw in zip(final_scores, non_zero_row):
    print((scored, raw))

(0.9895834, 0.9895834)
(0.0031055918402066655, 0.16666667)
(0.19765786068515945, 0.48958334)
(0.2352483591782849, 0.5260416)
(0.017663043672697797, 0.21875)
(0.1727484666316885, 0.4635417)
(0.1874999809937905, 0.47916666)
(0.012616460138883491, 0.203125)
(0.17760091987191953, 0.46875)
(0.12364129676478598, 0.40625)
(0.08579194293198572, 0.3541667)
(0.2352483591782849, 0.5260416)
(0.0031055918402066655, 0.16666667)
(0.005499482521121854, 0.17708333)
(0.0031055918402066655, 0.16666667)
(0.24657096696655315, 0.5364584)
(0.1195652049473411, 0.40104166)
(0.0, 0.15104166)
