In [131]:
import os
import xml.etree.ElementTree as et
import re
import string
import pandas as pd
from collections import Counter
import math
from scipy.stats import rankdata, kendalltau

# Loading documents

## Preprocessing

In [132]:
def remove_html_tags(content):
    return ''.join(et.fromstring(content).itertext())

In [133]:
def preprocess(content):
    # replace whitespaces into single space and cut at edges
    content = re.sub(r'\s+', ' ', content).strip()
    # remove punctuation
    content = content.translate(str.maketrans('','', string.punctuation))
    # lowercase
    content = content.lower()
    return content

In [134]:
def from_text_to_counter(text):
    return Counter(text.split(' '))

## Reading from file

In [135]:
test_path = 'test'

In [136]:
# d for source document
d_dict = {}

test_dirs = [f for f in os.listdir(test_path)]
test_dirs.sort()
for test_dir in test_dirs:
    test_topics = os.listdir(os.path.join(test_path, test_dir))
    test_topics.sort()
    for test_topic in test_topics:
        test_files = os.listdir(os.path.join(test_path, test_dir, test_topic))
        test_files.sort()
        topic_word_base = ''
        for test_file in test_files:
            file_path = os.path.join(test_path, test_dir, test_topic, test_file)
            root = et.parse(file_path).getroot()
            text = root.find('TEXT')
            decoded_text = et.tostring(text, encoding='unicode')
            clean_text = remove_html_tags(decoded_text)
            preprocessed_text = preprocess(clean_text)
            topic_word_base = topic_word_base + ' ' + preprocessed_text
        d_dict[test_topic] = from_text_to_counter(topic_word_base.strip())

In [137]:
d_dict

{'D0801A-A': Counter({'the': 289,
          'a380': 61,
          'new': 19,
          'airbus': 64,
          'superjumbo': 8,
          'which': 17,
          'will': 39,
          'be': 32,
          'officially': 3,
          'unveiled': 4,
          'tuesday': 6,
          'is': 47,
          'product': 1,
          'of': 116,
          'a': 73,
          'decade': 1,
          'designing': 1,
          'drumming': 1,
          'up': 10,
          'advance': 1,
          'orders': 9,
          'and': 64,
          'dreams': 1,
          'knocking': 1,
          'boeings': 5,
          '747': 12,
          'off': 6,
          'its': 34,
          'perch': 1,
          'as': 23,
          'top': 1,
          'bird': 1,
          'in': 84,
          'passenger': 9,
          'transport': 1,
          'here': 3,
          'are': 15,
          'some': 3,
          'key': 3,
          'dates': 1,
          'development': 4,
          'early': 3,
          '1990s': 1,
          'begins':

In [138]:
d_dict.keys()

dict_keys(['D0801A-A', 'D0801A-B', 'D0802A-A', 'D0802A-B', 'D0803A-A', 'D0803A-B', 'D0804A-A', 'D0804A-B', 'D0805A-A', 'D0805A-B', 'D0806A-A', 'D0806A-B', 'D0807A-A', 'D0807A-B', 'D0808B-A', 'D0808B-B', 'D0809B-A', 'D0809B-B', 'D0810B-A', 'D0810B-B', 'D0811B-A', 'D0811B-B', 'D0812B-A', 'D0812B-B', 'D0813B-A', 'D0813B-B', 'D0814C-A', 'D0814C-B', 'D0815C-A', 'D0815C-B', 'D0816C-A', 'D0816C-B', 'D0817D-A', 'D0817D-B', 'D0818D-A', 'D0818D-B', 'D0819D-A', 'D0819D-B', 'D0820D-A', 'D0820D-B', 'D0821D-A', 'D0821D-B', 'D0822D-A', 'D0822D-B', 'D0823D-A', 'D0823D-B', 'D0824E-A', 'D0824E-B', 'D0825E-A', 'D0825E-B', 'D0826E-A', 'D0826E-B', 'D0827E-A', 'D0827E-B', 'D0828E-A', 'D0828E-B', 'D0829F-A', 'D0829F-B', 'D0830F-A', 'D0830F-B', 'D0831F-A', 'D0831F-B', 'D0832F-A', 'D0832F-B', 'D0833F-A', 'D0833F-B', 'D0834F-A', 'D0834F-B', 'D0835F-A', 'D0835F-B', 'D0836G-A', 'D0836G-B', 'D0837G-A', 'D0837G-B', 'D0838G-A', 'D0838G-B', 'D0839G-A', 'D0839G-B', 'D0840G-A', 'D0840G-B', 'D0841G-A', 'D0841G-B', 'D084

# Loading summaries

In [139]:
summary_path = 'manual_models'

In [140]:
s_dict = {}

sum_files = [f for f in os.listdir(summary_path)]
sum_files.sort()
for sum_file in sum_files:
    file_path = os.path.join(summary_path, sum_file)
    with open(file_path) as file:
        text = file.read()
        preprocessed_text = preprocess(text)
        key = sum_file
        s_dict[key] = from_text_to_counter(preprocessed_text)

In [141]:
s_dict

{'D0801-A.M.100.A.A': Counter({'the': 8,
          'european': 1,
          'airbus': 3,
          'a380': 4,
          'flew': 1,
          'its': 1,
          'maiden': 1,
          'test': 2,
          'flight': 1,
          'from': 1,
          'france': 1,
          '10': 1,
          'years': 1,
          'after': 1,
          'design': 1,
          'development': 1,
          'started': 1,
          'superjumbo': 1,
          'passenger': 1,
          'jet': 1,
          'surpasses': 1,
          'boeing': 1,
          '747': 1,
          'and': 4,
          'breaks': 1,
          'their': 1,
          'monopoly': 1,
          'airlines': 2,
          'worldwide': 1,
          'have': 2,
          'placed': 2,
          'orders': 2,
          'but': 1,
          'airports': 1,
          'may': 1,
          'need': 1,
          'modification': 1,
          'to': 3,
          'accommodate': 1,
          'weight': 2,
          'width': 1,
          'of': 2,
          'us': 1,
     

# Counting theta

## Generic

### Count words in all texts

In [142]:
# WHY DO WE HAVE EMPTY STRING HERE?
all_words = set()

for key in d_dict.keys():
    if key[-1] == 'A':
        all_words.update(list(d_dict[key].keys()))
    
n_words_k = len(all_words)
n_words_k

18359

In [143]:
def get_source_document_A(name):
    parts = name.split('-')
    sub_parts = parts[1].split('.')
    return parts[0] + sub_parts[-2] + '-A'

In [144]:
def is_summary_A(name):
    return name.split('-')[1][0] == 'A'

In [166]:
# count P(S) and di,ki
values_a_dict = {}

for s_key in s_dict.keys():
    if is_summary_A(s_key):
        values_a_dict[s_key] = {}

        n_words_s = sum(s_dict[s_key].values())
        d_key = get_source_document_A(s_key)
        n_words_d = sum(d_dict[d_key].values())
        for word in s_dict[s_key].keys():
            n_occurences_s = s_dict[s_key][word]
            n_occurences_d = d_dict[d_key][word]
            n_occurences_k = 1

            Ps = n_occurences_s / n_words_s
            Pd = n_occurences_d / n_words_d
            Pk = n_occurences_k / n_words_k

            values_a_dict[s_key][word] = [Ps, Pd/Pk]

In [146]:
theta_dict_a = {}

In [167]:
for s_key in s_dict.keys():
    if is_summary_A(s_key):
        C = sum([val[1] for val in values_a_dict[s_key].values()])
        theta_dict_a[s_key] = 0
        for word in values_a_dict[s_key].keys():
            Ps = values_a_dict[s_key][word][0]
            Pd_over_Pk = values_a_dict[s_key][word][1]
            Pdk = Pd_over_Pk / C
            if Pdk == 0:
                kl = 0
            else:
                kl = Ps * math.log(Ps/Pdk)
            theta_dict_a[s_key] += kl

In [168]:
theta_dict_a

{'D0801-A.M.100.A.A': 0.4267266116186901,
 'D0801-A.M.100.A.C': 0.9226282634585241,
 'D0801-A.M.100.A.E': 0.5573398298329283,
 'D0801-A.M.100.A.G': 0.4398782112841229,
 'D0802-A.M.100.A.A': 0.5868488427077094,
 'D0802-A.M.100.A.C': 0.7797574746399519,
 'D0802-A.M.100.A.E': 0.8570613672445627,
 'D0802-A.M.100.A.H': 0.7516575135479677,
 'D0803-A.M.100.A.A': 0.4679538730599133,
 'D0803-A.M.100.A.B': 0.5703028100088001,
 'D0803-A.M.100.A.D': 0.5125759139869935,
 'D0803-A.M.100.A.F': 0.4557990838526657,
 'D0804-A.M.100.A.A': 0.6146006566068786,
 'D0804-A.M.100.A.D': 0.5283682752337899,
 'D0804-A.M.100.A.G': 0.6047740047503677,
 'D0804-A.M.100.A.H': 0.5288136525263245,
 'D0805-A.M.100.A.A': 0.6527712191020466,
 'D0805-A.M.100.A.B': 0.47395076967050537,
 'D0805-A.M.100.A.C': 0.8426195731471574,
 'D0805-A.M.100.A.F': 0.5339009896583509,
 'D0806-A.M.100.A.A': 0.5150158892211231,
 'D0806-A.M.100.A.E': 0.5035342258910599,
 'D0806-A.M.100.A.G': 0.7435889211094162,
 'D0806-A.M.100.A.H': 0.433695868

## Count Kendall's τ coefficient

### Read from file

In [149]:
human_judges_a = {}

manual = pd.read_csv('manual.model', header=None, sep=' ')
for index, row in manual.iterrows():
    if row[0][-1] == 'A':
        key = row[0] + '-' + row[1]
        human_judges_a[key] = row[4]
    
human_judges_a

{'D0801-A-A': 0.619,
 'D0801-A-C': 0.4,
 'D0801-A-E': 0.636,
 'D0801-A-G': 0.619,
 'D0802-A-A': 0.579,
 'D0802-A-C': 0.579,
 'D0802-A-E': 0.348,
 'D0802-A-H': 0.5,
 'D0803-A-A': 0.826,
 'D0803-A-B': 0.44799999999999995,
 'D0803-A-D': 0.5770000000000001,
 'D0803-A-F': 0.36700000000000005,
 'D0804-A-A': 0.5529999999999999,
 'D0804-A-D': 0.7140000000000001,
 'D0804-A-G': 1.107,
 'D0804-A-H': 0.405,
 'D0805-A-A': 0.619,
 'D0805-A-B': 0.667,
 'D0805-A-C': 0.435,
 'D0805-A-F': 0.478,
 'D0806-A-A': 0.4,
 'D0806-A-E': 0.619,
 'D0806-A-G': 0.5710000000000001,
 'D0806-A-H': 0.619,
 'D0807-A-A': 1.065,
 'D0807-A-C': 0.7140000000000001,
 'D0807-A-F': 0.579,
 'D0807-A-H': 0.8,
 'D0808-A-B': 0.652,
 'D0808-A-E': 0.522,
 'D0808-A-G': 0.8,
 'D0808-A-H': 0.591,
 'D0809-A-A': 0.706,
 'D0809-A-B': 0.7809999999999999,
 'D0809-A-D': 0.765,
 'D0809-A-F': 0.735,
 'D0810-A-B': 0.706,
 'D0810-A-C': 0.2,
 'D0810-A-E': 0.938,
 'D0810-A-H': 0.667,
 'D0811-A-A': 0.6970000000000001,
 'D0811-A-B': 0.556,
 'D0811-A-F

In [173]:
human_judges_a.values()

dict_values([0.619, 0.4, 0.636, 0.619, 0.579, 0.579, 0.348, 0.5, 0.826, 0.44799999999999995, 0.5770000000000001, 0.36700000000000005, 0.5529999999999999, 0.7140000000000001, 1.107, 0.405, 0.619, 0.667, 0.435, 0.478, 0.4, 0.619, 0.5710000000000001, 0.619, 1.065, 0.7140000000000001, 0.579, 0.8, 0.652, 0.522, 0.8, 0.591, 0.706, 0.7809999999999999, 0.765, 0.735, 0.706, 0.2, 0.938, 0.667, 0.6970000000000001, 0.556, 0.667, 0.8059999999999999, 0.64, 0.68, 0.64, 0.792, 0.821, 0.7140000000000001, 0.556, 0.69, 0.8859999999999999, 0.6509999999999999, 0.6829999999999999, 0.306, 0.35, 0.35, 0.42100000000000004, 0.19, 0.6859999999999999, 0.867, 0.611, 0.7879999999999999, 0.524, 0.667, 0.684, 0.41700000000000004, 0.7609999999999999, 0.905, 0.976, 0.688, 0.512, 0.703, 0.861, 0.65, 0.655, 0.741, 0.7020000000000001, 0.98, 0.867, 0.8, 0.8220000000000001, 0.7390000000000001, 0.667, 0.615, 0.8059999999999999, 0.523, 0.48, 0.423, 0.6, 0.667, 0.682, 0.609, 1.0, 0.636, 0.667, 0.667, 0.56, 0.857, 0.455, 0.565,

In [185]:
# human_judges_a_rank = rankdata(list(human_judges_a.values()))
# human_judges_a_rank

In [186]:
# theta_a_rank = rankdata(list(theta_dict_a.values()))
# theta_a_rank

In [187]:
# kendalltau(theta_a_rank, human_judges_a_rank)

In [190]:
kendalltau(list(theta_dict_a.values()), list(human_judges_a.values()))

KendalltauResult(correlation=-0.003285537870924226, pvalue=0.9462556920328427)

## Update

In [153]:
def get_source_document_B(name):
    parts = name.split('-')
    sub_parts = parts[1].split('.')
    return parts[0] + sub_parts[-2] + '-B'

In [154]:
def is_summary_B(name):
    return name.split('-')[1][0] == 'B'

In [155]:
def transform_B_to_A(name):
    list_name = list(name)
    list_name[-1] = 'A'
    return ''.join(list_name)

### Prepare knowledge

In [156]:
# k_dict contains documents A, d_dict contains documents B

In [157]:
k_dict = {}
d_b_dict = {}

for d_key in d_dict.keys():
    if is_summary_A(d_key):
        k_dict[d_key] = d_dict[d_key]
    else:
        d_b_dict[d_key] = d_dict[d_key]

d_dict = d_b_dict

In [158]:
# count P(S) and di,ki
values_b_dict = {}

for s_key in s_dict.keys():
    if is_summary_B(s_key):
        values_b_dict[s_key] = {}

        n_words_s = sum(s_dict[s_key].values())
        d_key = get_source_document_B(s_key)
        n_words_d = sum(d_dict[d_key].values())
        k_key = transform_B_to_A(d_key)
        n_words_k = sum(k_dict[k_key].values())
        for word in s_dict[s_key].keys():
            n_occurences_s = s_dict[s_key][word]
            n_occurences_d = d_dict[d_key][word]
            n_occurences_k = k_dict[k_key][word]

            Ps = n_occurences_s / n_words_s
            Pd = n_occurences_d / n_words_d
            Pk = n_occurences_k / n_words_k
            
            if Pk != 0:
                Pd_over_Pk = Pd / Pk
            else:
                Pd_over_Pk = 0

            values_b_dict[s_key][word] = [Ps, Pd_over_Pk]

In [159]:
theta_dict_b = {}

In [160]:
for s_key in s_dict.keys():
    if is_summary_B(s_key):
        C = sum([val[1] for val in values_b_dict[s_key].values()])
        theta_dict_b[s_key] = 0
        for word in values_b_dict[s_key].keys():
            Ps = values_b_dict[s_key][word][0]
            Pd_over_Pk = values_b_dict[s_key][word][1]
            Pdk = Pd_over_Pk / C
            if Ps == 0 or Pdk == 0:
                kl = 0
            else:
                kl = Ps * math.log(Ps/Pdk)
            theta_dict_b[s_key] += kl

In [161]:
theta_dict_b

{'D0801-B.M.100.A.A': 0.28354649255852293,
 'D0801-B.M.100.A.C': 0.1680905089922059,
 'D0801-B.M.100.A.E': 0.17464558214178316,
 'D0801-B.M.100.A.G': 0.09462991355846057,
 'D0802-B.M.100.A.A': 0.18514552639993762,
 'D0802-B.M.100.A.C': 0.10023043749873621,
 'D0802-B.M.100.A.E': 0.2497952884878482,
 'D0802-B.M.100.A.H': 0.09040279606436158,
 'D0803-B.M.100.A.A': 0.0024965081388106996,
 'D0803-B.M.100.A.B': 0.061971145390351846,
 'D0803-B.M.100.A.D': 0.08030364037340504,
 'D0803-B.M.100.A.F': 0.04065040399023443,
 'D0804-B.M.100.A.A': -0.011650098625044219,
 'D0804-B.M.100.A.D': 0.09250909211121598,
 'D0804-B.M.100.A.G': 0.02294919327410863,
 'D0804-B.M.100.A.H': 0.07571011131657496,
 'D0805-B.M.100.A.A': -0.025227246113891776,
 'D0805-B.M.100.A.B': -0.0424003104841325,
 'D0805-B.M.100.A.C': 0.05245824134019889,
 'D0805-B.M.100.A.F': 0.31009725638316676,
 'D0806-B.M.100.A.A': 0.2582497603014948,
 'D0806-B.M.100.A.E': 0.3685483234133398,
 'D0806-B.M.100.A.G': 0.0216418537407598,
 'D0806-B

## Count Kendall's τ coefficient

### Read from file

In [162]:
human_judges_b = {}

manual = pd.read_csv('manual.model', header=None, sep=' ')
for index, row in manual.iterrows():
    if row[0][-1] == 'B':
        key = row[0] + '-' + row[1]
        human_judges_b[key] = row[4]
    
human_judges_b

{'D0801-B-A': 0.818,
 'D0801-B-C': 1.0,
 'D0801-B-E': 0.583,
 'D0801-B-G': 1.053,
 'D0802-B-A': 0.3,
 'D0802-B-C': 0.25,
 'D0802-B-E': 0.364,
 'D0802-B-H': 0.217,
 'D0803-B-A': 0.654,
 'D0803-B-B': 0.72,
 'D0803-B-D': 1.0,
 'D0803-B-F': 0.32299999999999995,
 'D0804-B-A': 0.48100000000000004,
 'D0804-B-D': 0.593,
 'D0804-B-G': 0.444,
 'D0804-B-H': 0.46399999999999997,
 'D0805-B-A': 0.47100000000000003,
 'D0805-B-B': 0.5329999999999999,
 'D0805-B-C': 0.05,
 'D0805-B-F': 0.41200000000000003,
 'D0806-B-A': 0.792,
 'D0806-B-E': 0.9129999999999999,
 'D0806-B-G': 0.519,
 'D0806-B-H': 0.593,
 'D0807-B-A': 0.44,
 'D0807-B-C': 0.619,
 'D0807-B-F': 0.591,
 'D0807-B-H': 0.40700000000000003,
 'D0808-B-B': 0.636,
 'D0808-B-E': 0.542,
 'D0808-B-G': 0.7140000000000001,
 'D0808-B-H': 0.8,
 'D0809-B-A': 0.6,
 'D0809-B-B': 0.5710000000000001,
 'D0809-B-D': 0.833,
 'D0809-B-F': 0.619,
 'D0810-B-B': 0.467,
 'D0810-B-C': 0.111,
 'D0810-B-E': 0.467,
 'D0810-B-H': 0.909,
 'D0811-B-A': 1.0,
 'D0811-B-B': 0.824

In [163]:
human_judges_b_rank = rankdata(list(human_judges_b.values()))
human_judges_b_rank

array([152. , 186.5,  84. , 190. ,  11. ,   9. ,  17.5,   6. , 110. ,
       134. , 186.5,  12. ,  42.5,  88.5,  31. ,  34. ,  38. ,  61. ,
         1. ,  21.5, 147. , 178.5,  54. ,  88.5,  30. ,  99. ,  87. ,
        20. , 105.5,  63. , 131. , 148.5,  91. ,  75.5, 154.5,  99. ,
        36. ,   2. ,  36. , 176. , 186.5, 153. ,  67.5,  57. ,  85.5,
        49. ,  85.5, 164.5,  45. ,  83. , 125. , 125. , 138. ,  49. ,
        65. , 117. ,  65. , 170. , 177. ,  56. ,  65. , 101. , 103. ,
       131. ,  75.5, 120.5, 186.5,   7. ,  69.5, 166.5, 183. , 166.5,
       116. ,  73. , 169. ,  15. , 123. ,  40. ,  75.5,  75.5, 111. ,
       105.5, 172.5,  71.5, 135. , 125. , 168. ,  53. , 128. ,  91. ,
        79. , 118. , 145. ,  16. , 154.5, 171. , 113.5, 178.5,   5. ,
       120.5,  13.5,  99. ,  55. , 119. , 131. , 150.5, 138. , 156.5,
        94. , 141. ,  10. , 150.5,  32. , 109. ,  69.5, 186.5,   4. ,
        81. ,  33. , 113.5,  49. ,  23. , 163. , 102. , 122. , 105.5,
        26. ,  19. ,

In [164]:
theta_b_rank = rankdata(list(theta_dict_b.values()))
theta_b_rank

array([160., 107., 112.,  63., 123.,  66., 150.,  57.,  16.,  41.,  51.,
        34.,  14.,  61.,  27.,  46.,  13.,   8.,  39., 167., 153., 184.,
        26., 148., 181., 155., 144., 192., 126.,  88.,  91., 111.,  75.,
        30.,  21., 103., 168., 141., 191.,  90., 102., 146., 186., 176.,
        37.,  50.,  53., 100., 139., 125.,  33.,  67.,  29., 121., 145.,
       142.,  49.,  87.,  86.,  80., 136.,  95., 180., 120., 152.,  78.,
       189., 147.,  85., 134., 138., 187., 162.,   7., 169., 179.,  68.,
       129.,  76.,  52.,  48., 114.,  81.,  17.,  70.,  62.,  72., 101.,
        25.,  15.,  45.,  79.,  93., 133., 117.,  97., 140., 183.,  55.,
       175., 166., 178., 161., 151.,  24.,   3.,   4.,  31.,   9.,  36.,
        32.,  11.,  47.,   1.,  22.,  12., 124.,  58.,  83.,  99., 185.,
       157., 132., 130.,  56.,  19., 164.,  43., 128., 163.,  54.,  89.,
       115., 104., 165., 116., 106.,  74., 105.,  35.,  44.,  64.,   2.,
         6.,  10., 135.,  65., 108.,  69., 113., 15

In [165]:
kendalltau(theta_b_rank, human_judges_b_rank)

KendalltauResult(correlation=-0.009742276088295504, pvalue=0.8415028248811119)