# Results for clustering and classification

This notebook shows the main results for clustering and classification. It accompanies [this study](https://github.com/luka5132/Agglomerative_Clustering_PSS/blob/main/DSS_Thesis_LukasBusch.pdf), which investigates the use of clustering for page stream segmentation (PSS).

In this notebook, pages have already been transformed into representation vectors. Please see the GitHub repository for information on how this was done.


## Loading in data

In [None]:
# Mount Google Drive (Colab only) so that the data under /content/gdrive can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# First two corpora

Other corpora, not used in the end for the thesis. Uses data from van Heusden et al. 
(https://irlab.science.uva.nl/wp-content/papercite-data/pdf/van-2022-wooir.pdf)

These corpora are used for testing and as a baseline to compare results against.

In [None]:
import pickle
from tqdm import tqdm
import os 
import numpy as np
import pandas as pd
import json

# Gold-standard files for the two corpora (one JSON file per corpus).
cp1_json_path = '/content/gdrive/MyDrive/master_thesis/corpus_1/Doclengths_of_the_individual_docs_TRAIN.json'
cp2_json_path = '/content/gdrive/MyDrive/master_thesis/corpus_2/Doclengths_of_the_individual_docs_TRAIN.json'

# The context manager closes each file when the block exits.
with open(cp1_json_path) as cp1_file:
  cp1_gs = json.load(cp1_file)

with open(cp2_json_path) as cp2_file:
  cp2_gs = json.load(cp2_file)

# Merge both gold standards; for a key present in both, the corpus 2 entry wins.
cp_combined_gs = {**cp1_gs, **cp2_gs}

# Cleaned page-level tables of both corpora, stacked into a single frame.
cp1_clean_path = '/content/gdrive/MyDrive/master_thesis/corpus_1/cp1_cleaned.csv'
cp2_clean_path = '/content/gdrive/MyDrive/master_thesis/corpus_2/cp2_cleaned.csv'

cp1_clean = pd.read_csv(cp1_clean_path)
cp2_clean = pd.read_csv(cp2_clean_path)

corpus_combined = pd.concat([cp1_clean, cp2_clean], ignore_index=True, sort=False)

# Previously used test file: '/content/gdrive/MyDrive/master_thesis/Old corpora/test.csv'
test_df = pd.read_csv('/content/gdrive/MyDrive/master_thesis/test_docs.csv')
test_vals = test_df['0'].values

# Every stream that is not listed as a test stream is a training stream.
is_test_stream = corpus_combined['stream'].isin(test_vals)
train_vals = corpus_combined.loc[~is_test_stream, 'stream'].unique()

In [None]:
# Number of training streams and number of test streams.
print(len(train_vals),len(test_vals))

140 25


In [None]:
# Gold standard of the test streams, converted by length_list_to_bin
# (presumably to one binary flag per page -- TODO confirm against its definition).
gold_std_test = {
    stream_id: length_list_to_bin(doc_lengths)
    for stream_id, doc_lengths in cp_combined_gs.items()
    if stream_id in test_vals
}

## Classification results

In [None]:
from tensorflow.keras.models import load_model, Model
image_model = load_model('/content/gdrive/MyDrive/master_thesis/exp2_single-page_repeat-00.hdf5')

#new: '/content/gdrive/MyDrive/master_thesis/corpus_1/all_new_models/new_model_test_19.hdf5'
#prev: exp2_single-page_repeat-00

# The page vectors are precomputed, so only the top of the network is needed:
# build a model that starts at the output of the 'dense' layer and ends at the
# original model output.
layer_name = 'dense'
dense_layer_output = image_model.get_layer(layer_name).output
from_vector_model = Model(inputs=dense_layer_output, outputs=image_model.output)
from_vector_model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 512)]             0         
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 512)               0         
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 257       
                                                                 
Total params: 131,585
Trainable params: 131,585
Non-trainab

In [None]:
# Raw classifier outputs per test stream, keyed by the stream id without the
# '__concatenated' suffix.
prediction_json_corp1 = {}

for doc_id in tqdm(list(test_vals)):
    repl_doc_id = doc_id.replace('__concatenated','')

    # Streams listed in the corpus 1 gold standard are read from corpus_1,
    # every other stream from corpus_2.
    corpus = 1 if repl_doc_id in cp1_gs else 2
    new_vectors = '/content/gdrive/MyDrive/master_thesis/corpus_{}/image_vectors2'.format(corpus)
    new_path = os.path.join(new_vectors,repl_doc_id + '.pickle')

    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the file, no explicit close() is needed.
    with open(new_path, 'rb') as f:
        im_page_vectors_new = pickle.load(f)

    preds_new = from_vector_model.predict(im_page_vectors_new)
    prediction_json_corp1[repl_doc_id] = preds_new


  0%|          | 0/25 [00:00<?, ?it/s]



  4%|▍         | 1/25 [00:00<00:02,  9.76it/s]



 16%|█▌        | 4/25 [00:00<00:01, 16.38it/s]



 24%|██▍       | 6/25 [00:00<00:01, 17.25it/s]



 36%|███▌      | 9/25 [00:00<00:00, 18.24it/s]



 44%|████▍     | 11/25 [00:00<00:00, 17.76it/s]



 52%|█████▏    | 13/25 [00:00<00:00, 16.95it/s]



 60%|██████    | 15/25 [00:00<00:00, 15.92it/s]



 68%|██████▊   | 17/25 [00:01<00:00, 15.43it/s]



 76%|███████▌  | 19/25 [00:01<00:00, 15.75it/s]



 84%|████████▍ | 21/25 [00:01<00:00, 15.83it/s]



 92%|█████████▏| 23/25 [00:01<00:00, 15.92it/s]



100%|██████████| 25/25 [00:01<00:00, 16.06it/s]


In [None]:
def set_first_value1(results):
  """Set the first entry of `results` to 1 and return `results`.

  The assignment is done in place: the object that is passed in is modified
  and the same object is returned.
  """
  results[0] = 1
  return results

In [None]:
# Get labels by simply rounding the prediction score (normal method)
prediction_normal_corp1 ={id: np.round(vals.flatten()) for id,vals in prediction_json_corp1.items()}
# Select N (number of docs) highest prediction scores (topN classification method)
# Also always make the first page 1 since that also happens for clustering automatically
prediction_topn_corp1 ={id: set_first_value1(vals.flatten()) for id,vals in prediction_json_corp1.items()}
prediction_topn_corp1 ={id: select_topn(vals,len(cp_combined_gs[id])) for id,vals in prediction_topn_corp1.items()}
# Get prediction scores
prediction_json_features = {id: list(np.round(vals.flatten(),3)) for id,vals in prediction_json_corp1.items()}

#Get binary gold standard
gold_std_test = {id: length_list_to_bin(vals) for id,vals in cp_combined_gs.items() if id in prediction_json_corp1 }

In [None]:
# With min2 enabled, only streams whose binary gold standard sums to more than
# one (i.e. more than one document) are kept.
min2 = True
if min2:
  prediction_json_corp1 = {stream : preds for stream,preds in prediction_json_corp1.items() if sum(gold_std_test[stream]) > 1}
  # The rounded predictions are scored against the filtered gold standard as
  # well, so they get the same filter as the other prediction dictionaries.
  prediction_normal_corp1 = {stream : preds for stream,preds in prediction_normal_corp1.items() if sum(gold_std_test[stream]) > 1}
  prediction_topn_corp1= {stream : preds for stream,preds in prediction_topn_corp1.items() if sum(gold_std_test[stream]) > 1}
  # Must come last: the filters above still need the unfiltered gold standard.
  gold_std_test = {stream : vals for stream,vals in gold_std_test.items() if stream in prediction_json_corp1 }

In [None]:
# Mean scores for both classification variants: rounded scores ("normal")
# and top-N selection.
normal_scores_corp1 = calculate_mean_scores(gold_std_test,prediction_normal_corp1)
topn_scores_corp1 = calculate_mean_scores(gold_std_test,prediction_topn_corp1)
# Display the scores of the rounding-based classification.
normal_scores_corp1

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.85,0.56,0.65,1118,0.83-0.87,0.53-0.59,0.62-0.68
Bcubed,0.66,0.97,0.71,1118,0.63-0.69,0.96-0.98,0.68-0.74
WindowDiff,0.62,0.62,0.62,1118,0.59-0.65,0.59-0.65,0.59-0.65
Block,0.52,0.41,0.45,1118,0.49-0.55,0.38-0.44,0.42-0.48
Weighted Block,0.61,0.45,0.5,1118,0.58-0.64,0.42-0.48,0.47-0.53


In [None]:
# Display the scores of the top-N classification.
topn_scores_corp1

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.73,0.73,0.73,1118,0.7-0.76,0.7-0.76,0.7-0.76
Bcubed,0.84,0.84,0.78,1118,0.82-0.86,0.82-0.86,0.76-0.8
WindowDiff,0.63,0.63,0.63,1118,0.6-0.66,0.6-0.66,0.6-0.66
Block,0.54,0.54,0.54,1118,0.51-0.57,0.51-0.57,0.51-0.57
Weighted Block,0.62,0.62,0.62,1118,0.59-0.65,0.59-0.65,0.59-0.65


## Clustering results

In [None]:
used_corpus = corpus_combined
used_standard = cp_combined_gs

# Predictions per stream for the three clustering variants.
prediction_json_im_noswitch_corp1 = {}
prediction_json_im_switch_corp1 = {}
prediction_json_im_switch_start_corp1 = {}

# Distance lists per stream for the same three variants.
dist_lists_noswitch_corp1 ={}
dist_lists_switch_corp1 ={}
dist_lists_switch_start_corp1 ={}


for doc_id, content in tqdm(used_corpus.groupby('stream')):
  # Only the held-out test streams are evaluated.
  if doc_id not in test_vals:
    continue

  repl_doc_id = doc_id.replace('__concatenated','')
  corpus = content['corpus'].iloc[0]
  gold_std = used_standard[repl_doc_id]
  # The gold standard is a list of document lengths: its length is the number
  # of documents and its sum the number of pages.
  n_docs = len(gold_std)
  n_pages = sum(gold_std)
  corpus_im_folder = '/content/gdrive/MyDrive/master_thesis/corpus_{}/image_vectors2'.format(corpus)
  im_page_vectors_path = os.path.join(corpus_im_folder,doc_id+'.pickle')

  # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
  with open(im_page_vectors_path, 'rb') as f:
    im_page_vectors = pickle.load(f)

  # A single page cannot be clustered.
  if n_pages <= 1:
    continue

  ## Using switch
  # NOTE(review): the third argument of cluster_with_switch presumably toggles
  # the "start" variant -- confirm against its definition.
  dist_list_switch, preds_switch = cluster_with_switch(gold_std,im_page_vectors, False)
  dist_list_switch_start, preds_switch_start = cluster_with_switch(gold_std,im_page_vectors)

  prediction_json_im_switch_corp1[repl_doc_id] = preds_switch
  dist_lists_switch_corp1[repl_doc_id] = dist_list_switch

  prediction_json_im_switch_start_corp1[repl_doc_id] = preds_switch_start
  dist_lists_switch_start_corp1[repl_doc_id] = dist_list_switch_start

  ## Not using switch
  # Cosine distance between every pair of consecutive pages.
  dist_list = np.array([
      distance.cosine(im_page_vectors[i], im_page_vectors[i + 1])
      for i in range(len(im_page_vectors) - 1)
  ])
  if len(dist_list) > 1:
    dist_min = np.min(dist_list)
    dist_range = np.max(dist_list) - dist_min
    if dist_range == 0:
      # All distances are equal: min-max scaling would divide by zero.
      dist_list_norm = np.zeros_like(dist_list)
    else:
      dist_list_norm = (dist_list - dist_min) / dist_range
  else:
    dist_list_norm = dist_list

  dist_lists_noswitch_corp1[repl_doc_id] = dist_list_norm

  # NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
  # releases; kept as-is because the installed version is not visible here.
  c_mat = page_con_matrix(n_pages)
  cluster = AgglomerativeClustering(n_clusters=n_docs, affinity='cosine', linkage='average',compute_distances = True, connectivity = c_mat)
  image_predictions = cluster.fit_predict(im_page_vectors)

  prediction_json_im_noswitch_corp1[repl_doc_id] = length_list_to_bin(groups_to_lengths(image_predictions))

100%|██████████| 165/165 [00:01<00:00, 117.31it/s]


In [None]:
# With min2 enabled, only streams with more than one gold-standard document
# are kept in the three clustering prediction dictionaries.
min2 = True
if min2:
  multi_doc_streams = {
      stream for stream, doc_lengths in cp_combined_gs.items() if len(doc_lengths) > 1
  }
  prediction_json_im_noswitch_corp1 = {
      stream: preds
      for stream, preds in prediction_json_im_noswitch_corp1.items()
      if stream in multi_doc_streams
  }
  prediction_json_im_switch_corp1 = {
      stream: preds
      for stream, preds in prediction_json_im_switch_corp1.items()
      if stream in multi_doc_streams
  }
  prediction_json_im_switch_start_corp1 = {
      stream: preds
      for stream, preds in prediction_json_im_switch_start_corp1.items()
      if stream in multi_doc_streams
  }
  # Gold standard restricted to the streams that still have a prediction.
  gold_std_test = {
      stream: length_list_to_bin(doc_lengths)
      for stream, doc_lengths in cp_combined_gs.items()
      if stream in prediction_json_im_noswitch_corp1
  }

In [None]:

# Mean scores for the three clustering variants: without switch, with switch,
# and with switch + start.
noswitch_scores_corp1 = calculate_mean_scores(gold_std_test,prediction_json_im_noswitch_corp1)
switch_scores_corp1 = calculate_mean_scores(gold_std_test,prediction_json_im_switch_corp1)
switch_start_scores_corp1 = calculate_mean_scores(gold_std_test,prediction_json_im_switch_start_corp1)
# Display the scores of clustering without switch.
noswitch_scores_corp1

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.51,0.51,0.51,1118,0.48-0.54,0.48-0.54,0.48-0.54
Bcubed,0.72,0.78,0.65,1118,0.69-0.75,0.76-0.8,0.62-0.68
WindowDiff,0.46,0.46,0.46,1118,0.43-0.49,0.43-0.49,0.43-0.49
Block,0.22,0.22,0.22,1118,0.2-0.24,0.2-0.24,0.2-0.24
Weighted Block,0.31,0.31,0.31,1118,0.28-0.34,0.28-0.34,0.28-0.34


In [None]:
# Display the scores of clustering with switch.
switch_scores_corp1

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.54,0.54,0.54,1118,0.51-0.57,0.51-0.57,0.51-0.57
Bcubed,0.78,0.77,0.7,1118,0.76-0.8,0.75-0.79,0.67-0.73
WindowDiff,0.52,0.52,0.52,1118,0.49-0.55,0.49-0.55,0.49-0.55
Block,0.31,0.31,0.31,1118,0.28-0.34,0.28-0.34,0.28-0.34
Weighted Block,0.41,0.41,0.41,1118,0.38-0.44,0.38-0.44,0.38-0.44


In [None]:
# Display the scores of clustering with switch + start.
switch_start_scores_corp1

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.58,0.58,0.58,1118,0.55-0.61,0.55-0.61,0.55-0.61
Bcubed,0.79,0.8,0.71,1118,0.77-0.81,0.78-0.82,0.68-0.74
WindowDiff,0.55,0.55,0.55,1118,0.52-0.58,0.52-0.58,0.52-0.58
Block,0.36,0.36,0.36,1118,0.33-0.39,0.33-0.39,0.33-0.39
Weighted Block,0.45,0.45,0.45,1118,0.42-0.48,0.42-0.48,0.42-0.48


### VGG Clustering Results

In [None]:
used_corpus = corpus_combined
used_standard = cp_combined_gs

# Predictions per stream for the three clustering variants, using VGG vectors.
prediction_json_im_noswitch_VGG_corp1 = {}
prediction_json_im_switch_VGG_corp1 = {}
prediction_json_im_switch_start_VGG_corp1 = {}

# Distance lists per stream for the same three variants.
dist_lists_noswitch_VGG_corp1 ={}
dist_lists_switch_VGG_corp1 ={}
dist_lists_switch_start_VGG_corp1 ={}

# All VGG vectors live in one folder. The original .format(corpus+1) call had
# no placeholder to fill, so the path never depended on the corpus.
corpus_im_folder = '/content/gdrive/MyDrive/master_thesis/Old corpora/VGG_vectors'

for doc_id, content in tqdm(used_corpus.groupby('stream')):
  # Only the held-out test streams are evaluated.
  if doc_id not in test_vals:
    continue

  repl_doc_id = doc_id.replace('__concatenated','')
  gold_std = used_standard[repl_doc_id]
  # The gold standard is a list of document lengths: its length is the number
  # of documents and its sum the number of pages.
  n_docs = len(gold_std)
  n_pages = sum(gold_std)

  im_page_vectors_path = os.path.join(corpus_im_folder,doc_id+'.npy')
  # Streams without precomputed VGG vectors are skipped.
  if not os.path.exists(im_page_vectors_path):
    continue
  im_page_vectors = np.load(im_page_vectors_path)

  # A single page cannot be clustered.
  if n_pages <= 1:
    continue

  ## Using switch
  # NOTE(review): the third argument of cluster_with_switch presumably toggles
  # the "start" variant -- confirm against its definition.
  dist_list_switch, preds_switch = cluster_with_switch(gold_std,im_page_vectors, False)
  dist_list_switch_start, preds_switch_start = cluster_with_switch(gold_std,im_page_vectors)

  prediction_json_im_switch_VGG_corp1[repl_doc_id] = preds_switch
  dist_lists_switch_VGG_corp1[repl_doc_id] = dist_list_switch

  prediction_json_im_switch_start_VGG_corp1[repl_doc_id] = preds_switch_start
  dist_lists_switch_start_VGG_corp1[repl_doc_id] = dist_list_switch_start

  ## Not using switch
  # Cosine distance between every pair of consecutive pages.
  dist_list = np.array([
      distance.cosine(im_page_vectors[i], im_page_vectors[i + 1])
      for i in range(len(im_page_vectors) - 1)
  ])
  if len(dist_list) > 1:
    dist_min = np.min(dist_list)
    dist_range = np.max(dist_list) - dist_min
    if dist_range == 0:
      # All distances are equal: min-max scaling would divide by zero.
      dist_list_norm = np.zeros_like(dist_list)
    else:
      dist_list_norm = (dist_list - dist_min) / dist_range
  else:
    dist_list_norm = dist_list

  dist_lists_noswitch_VGG_corp1[repl_doc_id] = dist_list_norm

  # NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
  # releases; kept as-is because the installed version is not visible here.
  c_mat = page_con_matrix(n_pages)
  cluster = AgglomerativeClustering(n_clusters=n_docs, affinity='cosine', linkage='average',compute_distances = True, connectivity = c_mat)
  image_predictions = cluster.fit_predict(im_page_vectors)

  prediction_json_im_noswitch_VGG_corp1[repl_doc_id] = length_list_to_bin(groups_to_lengths(image_predictions))

100%|██████████| 165/165 [00:02<00:00, 61.69it/s]


In [None]:
# With min2 enabled, only streams with more than one gold-standard document
# are kept in the three VGG clustering prediction dictionaries.
min2 = True
if min2:
  prediction_json_im_noswitch_VGG_corp1 = {stream : preds for stream,preds in prediction_json_im_noswitch_VGG_corp1.items() if len(cp_combined_gs[stream]) > 1}
  prediction_json_im_switch_VGG_corp1= {stream : preds for stream,preds in prediction_json_im_switch_VGG_corp1.items() if len(cp_combined_gs[stream]) > 1}
  prediction_json_im_switch_start_VGG_corp1= {stream : preds for stream,preds in prediction_json_im_switch_start_VGG_corp1.items() if len(cp_combined_gs[stream]) > 1}
  # The gold standard has to match the VGG predictions: streams without VGG
  # vectors were skipped during clustering and must not appear here either.
  gold_std_test = {stream : length_list_to_bin(vals) for stream,vals in cp_combined_gs.items() if stream in prediction_json_im_noswitch_VGG_corp1 }

In [None]:
# Mean scores for the three clustering variants on VGG vectors: without
# switch, with switch, and with switch + start.
noswitch_scores_VGG_corp1 = calculate_mean_scores(gold_std_test,prediction_json_im_noswitch_VGG_corp1)
switch_scores_VGG_corp1 = calculate_mean_scores(gold_std_test,prediction_json_im_switch_VGG_corp1)
switch_start_scores_VGG_corp1 = calculate_mean_scores(gold_std_test,prediction_json_im_switch_start_VGG_corp1)
# Display the scores of VGG clustering without switch.
noswitch_scores_VGG_corp1

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.48,0.48,0.48,1118,0.45-0.51,0.45-0.51,0.45-0.51
Bcubed,0.72,0.75,0.62,1118,0.69-0.75,0.72-0.78,0.59-0.65
WindowDiff,0.39,0.39,0.39,1118,0.36-0.42,0.36-0.42,0.36-0.42
Block,0.19,0.19,0.19,1118,0.17-0.21,0.17-0.21,0.17-0.21
Weighted Block,0.29,0.29,0.29,1118,0.26-0.32,0.26-0.32,0.26-0.32


In [None]:
# Display the scores of VGG clustering with switch.
switch_scores_VGG_corp1

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.45,0.45,0.45,1118,0.42-0.48,0.42-0.48,0.42-0.48
Bcubed,0.74,0.72,0.62,1118,0.71-0.77,0.69-0.75,0.59-0.65
WindowDiff,0.43,0.43,0.43,1118,0.4-0.46,0.4-0.46,0.4-0.46
Block,0.22,0.22,0.22,1118,0.2-0.24,0.2-0.24,0.2-0.24
Weighted Block,0.33,0.33,0.33,1118,0.3-0.36,0.3-0.36,0.3-0.36


In [None]:
# Display the scores of VGG clustering with switch + start.
switch_start_scores_VGG_corp1

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.44,0.44,0.44,1118,0.41-0.47,0.41-0.47,0.41-0.47
Bcubed,0.73,0.72,0.62,1118,0.7-0.76,0.69-0.75,0.59-0.65
WindowDiff,0.42,0.42,0.42,1118,0.39-0.45,0.39-0.45,0.39-0.45
Block,0.22,0.22,0.22,1118,0.2-0.24,0.2-0.24,0.2-0.24
Weighted Block,0.32,0.32,0.32,1118,0.29-0.35,0.29-0.35,0.29-0.35


## Looking into difference between classifier and clustering

In [None]:
def sum_metrics(stream_results, metric_list = ['Boundary','Bcubed','WindowDiff','Block','Weighted Block']):
  """Add up the F1 values of the selected metrics.

  Args:
    stream_results: mapping whose 'F1' entry maps metric names to F1 values
      (anything that offers `.items()`).
    metric_list: names of the metrics that count towards the total.

  Returns:
    The sum of the F1 values of all metrics named in `metric_list`;
    0 if none of them is present.
  """
  f1_by_metric = stream_results['F1']
  return sum(score for name, score in f1_by_metric.items() if name in metric_list)

In [None]:
from collections import Counter

METRIC_LIST= ['Boundary','Bcubed','WindowDiff','Block','Weighted Block']
## Total score per stream (sum of the F1 values of all metrics in METRIC_LIST)
## for clustering with switch + start.
gold_std_test = {stream_id: length_list_to_bin(vals) for stream_id,vals in cp_combined_gs.items() if stream_id in prediction_json_im_switch_start_corp1 }
corp1_results_individual = {}
# The loop variable is deliberately not called `id`: at module level that
# would rebind the builtin for the rest of the notebook.
for stream_id, switch_start_pred in prediction_json_im_switch_start_corp1.items():
  stream_gs = gold_std_test[stream_id]

  cluss_tot_metric = sum_metrics(calculate_metrics_one_stream(stream_gs,switch_start_pred),METRIC_LIST)

  corp1_results_individual[stream_id] = cluss_tot_metric

In [None]:
from collections import Counter

METRIC_LIST= ['Boundary','Bcubed','WindowDiff','Block','Weighted Block']
## Total score per stream (sum of the F1 values of all metrics in METRIC_LIST)
## for VGG clustering without switch.
corp1_results_individual_VGG = {}
# The loop variable is deliberately not called `id`: at module level that
# would rebind the builtin for the rest of the notebook.
for stream_id, noswitch_pred in prediction_json_im_noswitch_VGG_corp1.items():
  stream_gs = gold_std_test[stream_id]

  cluss_tot_metric = sum_metrics(calculate_metrics_one_stream(stream_gs,noswitch_pred),METRIC_LIST)

  corp1_results_individual_VGG[stream_id] = cluss_tot_metric

In [None]:
## Total score per stream for clustering on VGG vectors, listed from the
## highest to the lowest score.

print('Scores of clustering using VGG vectors')
print()
# A stable sort by descending score yields the same order as Counter.most_common().
ranked_vgg_scores = sorted(corp1_results_individual_VGG.items(), key=lambda item: item[1], reverse=True)
for stream_name, score in ranked_vgg_scores:
  stream_gold = gold_std_test[stream_name]
  print('{} has length {}, {} docs and score {}'.format(stream_name,len(stream_gold),sum(stream_gold),np.round(score,3)))

Scores of clustering using VGG vectors

967487_wob-verzoek.zip has length 2, 2.0 docs and score 5.0
965575_files.zip has length 117, 106.0 docs and score 4.082
Wob-besluit-inzake-bestemmingsplan-Gaanderen-2021-009749 has length 8, 7.0 docs and score 3.583
Wob-besluit-herindeling-Scherpenzeel-2e-tranche.html has length 198, 117.0 docs and score 2.586
963361 has length 8, 4.0 docs and score 2.387
967494_files.zip has length 105, 31.0 docs and score 2.329
969284_mails_incl_bijlagen_7.zip has length 347, 137.0 docs and score 1.732
Wob-besluit-bestemmingsplan-Drielanden-Small-Smart-Houses.html has length 20, 12.0 docs and score 1.714
Tweede-deelbesluit-Wob-verzoek-beslaglegging-Archeodienst-aanvulling.html has length 669, 257.0 docs and score 1.637
890726 has length 238, 21.0 docs and score 1.605
963262_files.zip has length 34, 11.0 docs and score 1.582
963574_files_1.zip has length 39, 7.0 docs and score 1.495
967351_files_3.zip has length 134, 15.0 docs and score 1.373
894727 has length 3

In [None]:
from collections import Counter

METRIC_LIST= ['Boundary','Bcubed','WindowDiff','Block','Weighted Block']
## See where the differences are between best performing clustering model and 
## best performing classification
diff_list= {stream_id : prediction_json_im_switch_start_corp1[stream_id] - prediction_topn_corp1[stream_id] for stream_id in prediction_json_im_switch_start_corp1}

# Find docs where clustering predicts 1 but classification predicts 0
diff_counts_cluster = Counter({stream_id : Counter(vals)[1] for stream_id,vals in diff_list.items()})

# Find docs where classification predicts 1 but clustering predicts 0
diff_counts_class = Counter({stream_id : Counter(vals)[-1] for stream_id,vals in diff_list.items()})

# Difference in total metric score per stream:
#   metric_diff: switch + start clustering minus top-N classification
#   switch_diff: switch + start clustering minus clustering without switch
# Both are filled in one pass so the clustering score is computed only once.
# The loop variable is deliberately not called `id`: at module level that
# would rebind the builtin for the rest of the notebook.
metric_diff = {}
switch_diff = {}
for stream_id, switch_start_pred in prediction_json_im_switch_start_corp1.items():
  stream_gs = gold_std_test[stream_id]
  class_preds = prediction_topn_corp1[stream_id]
  noswitch_preds = prediction_json_im_noswitch_corp1[stream_id]

  cluss_tot_metric = sum_metrics(calculate_metrics_one_stream(stream_gs,switch_start_pred),METRIC_LIST)
  class_tot_metric = sum_metrics(calculate_metrics_one_stream(stream_gs,class_preds),METRIC_LIST)
  noswitch_tot_metric = sum_metrics(calculate_metrics_one_stream(stream_gs,noswitch_preds),METRIC_LIST)

  metric_diff[stream_id] = cluss_tot_metric - class_tot_metric
  switch_diff[stream_id] = cluss_tot_metric - noswitch_tot_metric



In [None]:
## We see that using switch clustering can lead to some small improvements in 3 
## streams, but overall lowers the results. In some cases even drastically

print('Difference between switch clustering and classification')
print()
# A stable sort by descending score yields the same order as Counter.most_common().
ranked_metric_diff = sorted(metric_diff.items(), key=lambda item: item[1], reverse=True)
for stream_name, score in ranked_metric_diff:
  stream_length = len(gold_std_test[stream_name])
  print('{} has length {} and score {}'.format(stream_name,stream_length,np.round(score,3)))

Difference between switch clustering and classification

963574_files_1.zip has length 39 and score 0.258
902510 has length 211 and score 0.159
890726 has length 238 and score 0.064
963361 has length 8 and score 0.0
967487_wob-verzoek.zip has length 2 and score 0.0
967351_files_3.zip has length 134 and score -0.022
965575_files.zip has length 117 and score -0.16
Wob-besluit-inzake-bestemmingsplan-Gaanderen-2021-009749 has length 8 and score -0.307
963310_files.zip has length 483 and score -0.43
967351_files_1.zip has length 641 and score -0.454
967494_files.zip has length 105 and score -0.541
963262_files.zip has length 34 and score -0.742
Wob-besluit-Small-Smart-Houses-Horloseweg-2021-009836.html has length 575 and score -0.848
Tweede-deelbesluit-Wob-verzoek-beslaglegging-Archeodienst-aanvulling.html has length 669 and score -1.003
969284_mails_incl_bijlagen_7.zip has length 347 and score -1.272
Wob-besluit-herindeling-Scherpenzeel-2e-tranche.html has length 198 and score -1.322
96735

In [None]:
## We can see that using the switch increases the results in almost all scenarios
print('Difference between switch clustering and no switch clustering')
print()
# A stable sort by descending score yields the same order as Counter.most_common().
ranked_switch_diff = sorted(switch_diff.items(), key=lambda item: item[1], reverse=True)
for stream_name, score in ranked_switch_diff:
  stream_length = len(gold_std_test[stream_name])
  print('{} has length {} and score {}'.format(stream_name,stream_length,np.round(score,3)))

Difference between switch clustering and no switch clustering

963361 has length 8 and score 2.97
Tweede-deelbesluit-Wob-verzoek-beslaglegging-Archeodienst-aanvulling.html has length 669 and score 1.301
969284_mails_incl_bijlagen_7.zip has length 347 and score 1.239
Wob-besluit-bestemmingsplan-Drielanden-Small-Smart-Houses.html has length 20 and score 1.071
967494_files.zip has length 105 and score 0.815
963262_files.zip has length 34 and score 0.697
963310_files.zip has length 483 and score 0.67
965575_files.zip has length 117 and score 0.499
902510 has length 211 and score 0.434
Wob-besluit-herindeling-Scherpenzeel-2e-tranche.html has length 198 and score 0.433
967351_files_2.zip has length 493 and score 0.329
894727 has length 37 and score 0.16
890726 has length 238 and score 0.137
967351_files_1.zip has length 641 and score 0.097
967351_files_3.zip has length 134 and score 0.06
967487_wob-verzoek.zip has length 2 and score 0.0
Wob-besluit-Small-Smart-Houses-Horloseweg-2021-009836.h

In [None]:
## select documents to have a closer look into

OUT_FOLDER = '/content/gdrive/MyDrive/master_thesis/analysis_streams_corp1'
SAVE = True

# The export is switched off by default; set to True to build one table per
# selected stream (and write it to OUT_FOLDER when SAVE is set).
EXPORT_STREAM_ANALYSIS = False

if EXPORT_STREAM_ANALYSIS:
  # NOTE(review): prediction_json_features is reassigned for the new corpus
  # further down, so this cell must run before that one.
  difference_dfs = []
  for stream_name, score in Counter(metric_diff).most_common():
    ## select streams that differ and have a relatively short length to look into
    if score != 0 and len(gold_std_test[stream_name]) < 70:
      out_df = pd.DataFrame()
      out_df['switch_preds'] = prediction_json_im_switch_start_corp1[stream_name]
      # A 0 is appended to the distance list, presumably because it has one
      # entry less than there are pages -- TODO confirm.
      out_df['switch_dist'] = list(np.round(dist_lists_switch_start_corp1[stream_name],3)) + [0]
      out_df['CNN_pred_normal'] = prediction_normal_corp1[stream_name]
      out_df['CNN_pred_topn'] = prediction_topn_corp1[stream_name]
      out_df['CNN_perc'] = prediction_json_features[stream_name]
      out_df['gs'] = gold_std_test[stream_name]

      difference_dfs.append((stream_name,out_df))
      if SAVE:
        out_df_path = os.path.join(OUT_FOLDER,stream_name + '_{}.csv'.format(np.round(score,3)))
        out_df.to_csv(out_df_path,index = False)


# New Corpora

In [None]:
import json
import pandas as pd
import numpy as np

# Precomputed vectors and file locations of the new corpus.
test_text_finetuned_vectors = np.load('/content/gdrive/MyDrive/master_thesis/new_corpus/test/text/finetuned/vectors.npy')
new_image_vectors = np.load('/content/gdrive/MyDrive/master_thesis/new_corpus/vectors.npy')
vector_dict_path = '/content/gdrive/MyDrive/master_thesis/new_corpus/test/vector_dict.json'
test_data_path = '/content/gdrive/MyDrive/master_thesis/new_corpus/test/data_removed.csv'

self_corp1_image = '/content/gdrive/MyDrive/master_thesis/new_corpus/image_vectors'

# The context manager closes the file when the block exits.
with open(vector_dict_path) as vector_dict_file:
  vector_dict = json.load(vector_dict_file)

test_data = pd.read_csv(test_data_path)

In [None]:
# Gold standard per stream: the 'label' column in page order, keyed by the
# stream name without the '__concatenated' suffix.
gold_std_dict = {
    doc_id.replace('__concatenated',''): content.sort_values(by='page')['label'].values
    for doc_id, content in test_data.groupby('name')
}

## Classification result

In [None]:
from tensorflow.keras.models import load_model, Model
image_model = load_model('/content/gdrive/MyDrive/master_thesis/new_corpus/image')

# The page vectors are precomputed, so only the top of the network is needed:
# build a model that starts at the output of the 'dense' layer and ends at the
# original model output.
layer_name = 'dense'
dense_layer_output = image_model.get_layer(layer_name).output
from_vector_model = Model(inputs=dense_layer_output, outputs=image_model.output)
from_vector_model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 512)               0         
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 257       
                                                                 
Total params: 131,585
Trainable params: 131,585
Non-trainable

In [None]:
from tqdm import tqdm
import os
# Raw classifier outputs and gold standard per stream of the new corpus, keyed
# by the stream name without the '__concatenated' suffix.
prediction_json_corp2 = {}
gold_std_dict = {}

# Streams for which no vector file exists.
not_found = []
vector_path = '/content/gdrive/MyDrive/master_thesis/new_corpus/image_vectors'
for doc_id,content in tqdm(test_data.groupby('name')):
    repl_doc_id = doc_id.replace('__concatenated','')
    content = content.sort_values(by='page')
    # Here the gold standard is the per-page 'label' column in page order, so
    # its length is the number of pages (not the number of documents).
    gold_std = content['label'].values
    gold_std_dict[repl_doc_id] = gold_std
    stream_path = os.path.join(vector_path,repl_doc_id + '.npy')

    if os.path.exists(stream_path):
        stream_vectors = np.load(stream_path)
        preds_new = from_vector_model.predict(stream_vectors)
        prediction_json_corp2[repl_doc_id] = preds_new
    else:
        not_found.append(repl_doc_id)


  0%|          | 0/32 [00:00<?, ?it/s]



  3%|▎         | 1/32 [00:00<00:30,  1.03it/s]



  6%|▋         | 2/32 [00:01<00:22,  1.36it/s]



  9%|▉         | 3/32 [00:02<00:18,  1.60it/s]



 12%|█▎        | 4/32 [00:02<00:17,  1.63it/s]



 16%|█▌        | 5/32 [00:02<00:13,  1.93it/s]



 19%|█▉        | 6/32 [00:03<00:13,  1.93it/s]



 22%|██▏       | 7/32 [00:04<00:13,  1.86it/s]



 25%|██▌       | 8/32 [00:04<00:12,  1.87it/s]



 28%|██▊       | 9/32 [00:05<00:12,  1.87it/s]



 31%|███▏      | 10/32 [00:05<00:10,  2.05it/s]



 34%|███▍      | 11/32 [00:06<00:12,  1.66it/s]



 38%|███▊      | 12/32 [00:06<00:10,  1.83it/s]



 41%|████      | 13/32 [00:07<00:09,  2.07it/s]



 44%|████▍     | 14/32 [00:07<00:08,  2.12it/s]



 47%|████▋     | 15/32 [00:08<00:07,  2.19it/s]



 50%|█████     | 16/32 [00:08<00:08,  1.94it/s]



 53%|█████▎    | 17/32 [00:10<00:15,  1.06s/it]



 56%|█████▋    | 18/32 [00:11<00:11,  1.18it/s]



 59%|█████▉    | 19/32 [00:12<00:10,  1.25it/s]



 62%|██████▎   | 20/32 [00:12<00:08,  1.46it/s]



 66%|██████▌   | 21/32 [00:13<00:07,  1.46it/s]



 69%|██████▉   | 22/32 [00:14<00:08,  1.24it/s]



 72%|███████▏  | 23/32 [00:14<00:06,  1.40it/s]



 75%|███████▌  | 24/32 [00:15<00:05,  1.57it/s]



 78%|███████▊  | 25/32 [00:15<00:04,  1.67it/s]



 81%|████████▏ | 26/32 [00:16<00:03,  1.68it/s]



 84%|████████▍ | 27/32 [00:17<00:03,  1.49it/s]



 88%|████████▊ | 28/32 [00:18<00:03,  1.24it/s]



 91%|█████████ | 29/32 [00:19<00:02,  1.17it/s]



 94%|█████████▍| 30/32 [00:20<00:01,  1.06it/s]



 97%|█████████▋| 31/32 [00:21<00:00,  1.01it/s]



100%|██████████| 32/32 [00:22<00:00,  1.45it/s]


In [None]:
# NOTE(review): identical to the set_first_value1 defined earlier in this
# notebook; this definition silently replaces the earlier one.
def set_first_value1(results):
  """Set the first entry of `results` to 1 and return `results`.

  The assignment is done in place: the object that is passed in is modified
  and the same object is returned.
  """
  results[0] = 1
  return results

In [None]:
# Get labels by simply rounding the prediction score (normal method)
prediction_normal_corp2 ={id: np.round(vals.flatten()) for id,vals in prediction_json_corp2.items()}
#set to different threshold, other corpus showed that t-0.3 yields the highest results
prediction_normal_corp2_t ={id: round_threshold(vals.flatten(), 0.3) for id,vals in prediction_json_corp2.items()}

# Select N (number of docs) highest prediction scores (topN classification method)
# Also always make the first page 1 since that also happens for clustering automatically
prediction_topn_corp2 ={id: set_first_value1(vals.flatten()) for id,vals in prediction_json_corp2.items()}
prediction_topn_corp2 ={id: select_topn(vals,sum(gold_std_dict[id])) for id,vals in prediction_topn_corp2.items()}
# Get prediction scores
prediction_json_features = {id: list(np.round(vals.flatten(),3)) for id,vals in prediction_json_corp2.items()}

gold_std_dict = {id: vals for id,vals in gold_std_dict.items() if id in prediction_normal_corp2}

In [None]:
# With min2 enabled, only streams whose gold-standard labels sum to more than
# one are kept in the three prediction dictionaries.
min2 = True
if min2:
  multi_doc_streams = {
      stream for stream, labels in gold_std_dict.items() if sum(labels) > 1
  }
  prediction_normal_corp2 = {
      stream: preds
      for stream, preds in prediction_normal_corp2.items()
      if stream in multi_doc_streams
  }
  prediction_normal_corp2_t = {
      stream: preds
      for stream, preds in prediction_normal_corp2_t.items()
      if stream in multi_doc_streams
  }
  prediction_topn_corp2 = {
      stream: preds
      for stream, preds in prediction_topn_corp2.items()
      if stream in multi_doc_streams
  }
  # Gold standard restricted to the streams that still have a prediction.
  gold_std_dict = {
      stream: labels
      for stream, labels in gold_std_dict.items()
      if stream in prediction_normal_corp2
  }

In [None]:
from collections import Counter

METRIC_LIST= ['Boundary','Bcubed','WindowDiff','Block','Weighted Block']
## Per-stream total score of the top-N classification predictions, used to see
## where the best clustering model and the best classification model differ.
## The metrics of one stream are combined via sum_metrics over METRIC_LIST.
## A comprehension is used so the builtin `id` is not rebound at notebook level.
class_dif = {
    stream_id: sum_metrics(
        calculate_metrics_one_stream(gold_std_dict[stream_id], pred),
        METRIC_LIST)
    for stream_id, pred in prediction_topn_corp2.items()
}

In [None]:
## Rank the streams by their total classification score, highest first.
## Author's note (presumably about the clustering runs; not verifiable from
## this cell): using the switch increases the results in almost all scenarios.
print('Results from classification, best to worst')
print()
class_ranking = Counter(class_dif).most_common()
for stream_name, score in class_ranking:
  stream_gold = gold_std_dict[stream_name]
  print('{} has a total of {} pages, {} documents and a score of {}'.format(
      stream_name, len(stream_gold), sum(stream_gold), np.round(score,3)))

Results from classification, best to worst

8fa815c695a9811628e76705e839dcbc_deels-openbare-documenten has a total of 36 pages, 27 documents and a score of 5.0
dc034afbaede3d587451c7062fd857e7_bijlage-c1-openbaar-te-maken-documenten-eu has a total of 57 pages, 24 documents and a score of 5.0
ff9481e357a59c506fc16db2aab2411a has a total of 72 pages, 14 documents and a score of 5.0
stream_43 has a total of 7 pages, 3 documents and a score of 5.0
stream_45 has a total of 180 pages, 40 documents and a score of 5.0
stream_9 has a total of 72 pages, 14 documents and a score of 5.0
stream_37 has a total of 5187 pages, 696 documents and a score of 4.959
stream_44 has a total of 222 pages, 97 documents and a score of 4.894
stream_35 has a total of 1103 pages, 500 documents and a score of 4.766
stream_26 has a total of 464 pages, 212 documents and a score of 4.681
stream_2 has a total of 103 pages, 66 documents and a score of 4.671
stream_74 has a total of 1163 pages, 400 documents and a score o

In [None]:
# Mean scores for the rounded, thresholded (t = 0.3) and top-N labels, in that order.
normal_scores_corp2, normal_scores_corp2_t, topn_scores_corp2 = [
    calculate_mean_scores(gold_std_dict, predictions)
    for predictions in (prediction_normal_corp2,
                        prediction_normal_corp2_t,
                        prediction_topn_corp2)
]
# Show the table for the plain rounding method.
normal_scores_corp2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.91,0.84,0.86,6476,0.9-0.92,0.83-0.85,0.85-0.87
Bcubed,0.88,0.95,0.88,6476,0.87-0.89,0.94-0.96,0.87-0.89
WindowDiff,0.76,0.76,0.76,6476,0.75-0.77,0.75-0.77,0.75-0.77
Block,0.77,0.71,0.73,6476,0.76-0.78,0.7-0.72,0.72-0.74
Weighted Block,0.72,0.66,0.68,6476,0.71-0.73,0.65-0.67,0.67-0.69


In [None]:
# Mean scores for the labels obtained with the 0.3 threshold (prediction_normal_corp2_t).
normal_scores_corp2_t

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.9,0.86,0.87,6476,0.89-0.91,0.85-0.87,0.86-0.88
Bcubed,0.9,0.94,0.89,6476,0.89-0.91,0.93-0.95,0.88-0.9
WindowDiff,0.77,0.77,0.77,6476,0.76-0.78,0.76-0.78,0.76-0.78
Block,0.76,0.72,0.73,6476,0.75-0.77,0.71-0.73,0.72-0.74
Weighted Block,0.71,0.68,0.69,6476,0.7-0.72,0.67-0.69,0.68-0.7


In [None]:
# Mean scores for the top-N classification labels (prediction_topn_corp2).
topn_scores_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.92,0.92,0.92,6476,0.91-0.93,0.91-0.93,0.91-0.93
Bcubed,0.94,0.93,0.91,6476,0.93-0.95,0.92-0.94,0.9-0.92
WindowDiff,0.86,0.86,0.86,6476,0.85-0.87,0.85-0.87,0.85-0.87
Block,0.82,0.82,0.82,6476,0.81-0.83,0.81-0.83,0.81-0.83
Weighted Block,0.87,0.87,0.87,6476,0.86-0.88,0.86-0.88,0.86-0.88


#### Text Results
The text predictions were computed elsewhere because the language model is too large to run in this notebook; here the stored predictions are only loaded and evaluated.


In [None]:
from tqdm import tqdm
import os
# Stored text prediction scores per stream, loaded from one .npy file each.
prediction_json_corp2_text = {}

# Streams for which no prediction file exists.
not_found = []
vector_path = '/content/gdrive/MyDrive/master_thesis/new_corpus/text_predictions'
for doc_id,content in tqdm(test_data.groupby('name')):
    repl_doc_id = doc_id.replace('__concatenated','')
    content = content.sort_values(by='page')
    gold_std = content['label'].values
    # NOTE: gold_std_dict is updated in place, so it must already exist from an
    # earlier cell. It receives an entry for every stream, also for those
    # without a prediction file.
    gold_std_dict[repl_doc_id] = gold_std
    # One label per page; the label sum is the number of documents
    # (same convention as the clustering cells further down).
    n_pages = len(gold_std)
    n_docs = sum(gold_std)
    stream_path = os.path.join(vector_path,repl_doc_id + '.npy')

    if os.path.exists(stream_path):
      preds = np.load(stream_path)
      prediction_json_corp2_text[repl_doc_id] = preds
    else:
      not_found.append(repl_doc_id)


100%|██████████| 32/32 [00:10<00:00,  3.17it/s]


In [None]:
def set_first_value1(results):
  """Force the first entry of a prediction sequence to 1.

  The first page of a stream always starts a document, so its prediction is
  overwritten with 1.

  Args:
    results: mutable, indexable sequence of per-page predictions. It is
      modified in place.

  Returns:
    The same object that was passed in. An empty sequence is returned
    unchanged instead of raising IndexError.
  """
  if len(results) > 0:
    results[0] = 1
  return results

In [None]:
# "Normal" method: turn every text prediction score into a label by plain rounding.
prediction_normal_text_corp2 = {
    stream_id: np.round(scores.flatten())
    for stream_id, scores in prediction_json_corp2_text.items()
}
# Same scores with a custom threshold; the other corpus showed that t = 0.3
# yields the highest results.
prediction_normal_text_corp2_t = {
    stream_id: round_threshold(scores.flatten(), 0.3)
    for stream_id, scores in prediction_json_corp2_text.items()
}

# Top-N method: select the N (number of docs) highest prediction scores.
# The first page is always set to 1 first, since clustering does that
# automatically as well.
prediction_topn_text_corp2 = {
    stream_id: set_first_value1(scores.flatten())
    for stream_id, scores in prediction_json_corp2_text.items()
}
prediction_topn_text_corp2 = {
    stream_id: select_topn(scores, sum(gold_std_dict[stream_id]))
    for stream_id, scores in prediction_topn_text_corp2.items()
}
# Raw prediction scores, rounded to three decimals, as plain lists.
prediction_json_features_text = {
    stream_id: list(np.round(scores.flatten(), 3))
    for stream_id, scores in prediction_json_corp2_text.items()
}

# Restrict the gold standard to the streams that have text predictions.
gold_std_dict = {
    stream_id: gold_std_dict[stream_id]
    for stream_id in gold_std_dict
    if stream_id in prediction_json_corp2_text
}

In [None]:
min2 = True
if min2:
  # gold_std_dict is reassigned by several cells, so it can lack streams that
  # still have text predictions (looking them up directly raised KeyError).
  # Such streams cannot be scored: report them and leave them out.
  missing_gold = sorted(set(prediction_normal_text_corp2) - set(gold_std_dict))
  if missing_gold:
    print('Skipping {} stream(s) without gold standard: {}'.format(len(missing_gold), missing_gold))

  # Streams whose gold standard contains more than one document (label sum > 1).
  multi_doc_streams = {stream for stream, gold in gold_std_dict.items() if sum(gold) > 1}

  prediction_normal_text_corp2 = {stream : preds for stream,preds in prediction_normal_text_corp2.items() if stream in multi_doc_streams}
  prediction_normal_text_corp2_t = {stream : preds for stream,preds in prediction_normal_text_corp2_t.items() if stream in multi_doc_streams}
  prediction_topn_text_corp2 = {stream : preds for stream,preds in prediction_topn_text_corp2.items() if stream in multi_doc_streams}
  # Keep the gold standard aligned with the surviving predictions.
  gold_std_dict = {stream: gold for stream, gold in gold_std_dict.items() if stream in prediction_normal_text_corp2}

KeyError: ignored

In [None]:
# Mean scores for the rounded, thresholded (t = 0.3) and top-N text labels, in that order.
normal_scores_text_corp2, normal_scores_text_corp2_t, topn_scores_text_corp2 = [
    calculate_mean_scores(gold_std_dict, predictions)
    for predictions in (prediction_normal_text_corp2,
                        prediction_normal_text_corp2_t,
                        prediction_topn_text_corp2)
]
# Show the table for the plain rounding method.
normal_scores_text_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.85,0.81,0.83,6525,0.84-0.86,0.8-0.82,0.82-0.84
Bcubed,0.89,0.93,0.88,6525,0.88-0.9,0.92-0.94,0.87-0.89
WindowDiff,0.79,0.79,0.79,6525,0.78-0.8,0.78-0.8,0.78-0.8
Block,0.72,0.69,0.7,6525,0.71-0.73,0.68-0.7,0.69-0.71
Weighted Block,0.72,0.68,0.7,6525,0.71-0.73,0.67-0.69,0.69-0.71


In [None]:
# Mean scores for the text labels obtained with the 0.3 threshold (prediction_normal_text_corp2_t).
normal_scores_text_corp2_t

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.82,0.83,0.82,6525,0.81-0.83,0.82-0.84,0.81-0.83
Bcubed,0.91,0.89,0.86,6525,0.9-0.92,0.88-0.9,0.85-0.87
WindowDiff,0.76,0.76,0.76,6525,0.75-0.77,0.75-0.77,0.75-0.77
Block,0.69,0.69,0.69,6525,0.68-0.7,0.68-0.7,0.68-0.7
Weighted Block,0.7,0.7,0.7,6525,0.69-0.71,0.69-0.71,0.69-0.71


In [None]:
# Mean scores for the top-N text labels (prediction_topn_text_corp2).
topn_scores_text_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.86,0.86,0.86,6525,0.85-0.87,0.85-0.87,0.85-0.87
Bcubed,0.92,0.91,0.88,6525,0.91-0.93,0.9-0.92,0.87-0.89
WindowDiff,0.81,0.81,0.81,6525,0.8-0.82,0.8-0.82,0.8-0.82
Block,0.72,0.72,0.72,6525,0.71-0.73,0.71-0.73,0.71-0.73
Weighted Block,0.8,0.8,0.8,6525,0.79-0.81,0.79-0.81,0.79-0.81


## Clustering results

In [None]:
# Rebuild the gold standard from scratch: one label array per stream, with the
# pages in page order and the '__concatenated' suffix removed from the name.
gold_std_dict = dict()

for doc_id, content in test_data.groupby('name'):
    content = content.sort_values(by='page')
    gold_std = content['label'].values
    repl_doc_id = doc_id.replace('__concatenated', '')
    gold_std_dict.update({repl_doc_id: gold_std})

In [None]:
from tqdm import tqdm
used_corpus = test_data
# These three aliases are only referenced by the commented-out text-vector
# lookup inside the loop.
text_vectors_used = test_text_finetuned_vectors
image_vectors_used = new_image_vectors
vector_dict_used = vector_dict

# Predictions per stream, one dict per clustering variant.
prediction_json_im_noswitch_corp2 = {}
prediction_json_im_switch_corp2 = {}
prediction_json_im_switch_start_corp2 = {}

# Distance lists per stream for the same three variants.
dist_lists_noswitch_corp2 ={}
dist_lists_switch_corp2 ={}
dist_lists_switch_start_corp2 ={}

vector_path = '/content/gdrive/MyDrive/master_thesis/new_corpus/image_vectors'
for doc_id, content in tqdm(used_corpus.groupby('name')):
  repl_doc_id = doc_id.replace('__concatenated','')

  #text_vectors, labels = get_vectors_for_stream(used_corpus,doc_id,vector_dict_used,text_vectors_used)
  stream_vector_path = os.path.join(vector_path,repl_doc_id + '.npy')
  if not os.path.exists(stream_vector_path):
    # No stored image vectors for this stream.
    continue
  image_vectors = np.load(stream_vector_path)
  labels = gold_std_dict[repl_doc_id]

  # One label per page; the label sum is the number of documents.
  n_pages = len(labels)
  n_docs = sum(labels)

  if n_pages <= 1:
    # A single page cannot be clustered.
    continue

  ## Using switch
  dist_list_switch, preds_switch = cluster_with_switch(labels,image_vectors, False, True)
  dist_list_switch_start, preds_switch_start = cluster_with_switch(labels,image_vectors, labels_bin = True)

  prediction_json_im_switch_corp2[repl_doc_id] = preds_switch
  dist_lists_switch_corp2[repl_doc_id] = dist_list_switch

  prediction_json_im_switch_start_corp2[repl_doc_id] = preds_switch_start
  dist_lists_switch_start_corp2[repl_doc_id] = dist_list_switch_start


  ## Not using switch
  c_mat = page_con_matrix(n_pages)
  # Cosine distance between every pair of consecutive pages.
  dist_list = np.array([
      distance.cosine(image_vectors[i], image_vectors[i+1])
      for i in range(len(image_vectors)-1)
  ])
  if len(dist_list) >1:
    # Min-max normalise. When all distances are equal the range is 0 and the
    # division would only produce NaNs, so zeros are stored instead.
    dist_range = np.max(dist_list) - np.min(dist_list)
    if dist_range == 0:
      dist_list_norm = np.zeros_like(dist_list)
    else:
      dist_list_norm = (dist_list - np.min(dist_list)) / dist_range
  else:
    dist_list_norm = dist_list

  dist_lists_noswitch_corp2[repl_doc_id] = dist_list_norm

  # NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
  # releases; kept as-is to match the version this notebook was run with.
  cluster = AgglomerativeClustering(n_clusters=n_docs, affinity='cosine', linkage='average',compute_distances = True, connectivity = c_mat)
  image_predictions = cluster.fit_predict(image_vectors)

  prediction_json_im_noswitch_corp2[repl_doc_id] = length_list_to_bin(groups_to_lengths(image_predictions))

100%|██████████| 32/32 [01:08<00:00,  2.14s/it]


In [None]:
min2 = True
if min2:
  # Keep only streams whose gold standard contains more than one document
  # (label sum > 1), then shrink the gold standard to the surviving streams.
  prediction_json_im_noswitch_corp2 = {
      stream_id: labels
      for stream_id, labels in prediction_json_im_noswitch_corp2.items()
      if sum(gold_std_dict[stream_id]) > 1
  }
  prediction_json_im_switch_corp2 = {
      stream_id: labels
      for stream_id, labels in prediction_json_im_switch_corp2.items()
      if sum(gold_std_dict[stream_id]) > 1
  }
  prediction_json_im_switch_start_corp2 = {
      stream_id: labels
      for stream_id, labels in prediction_json_im_switch_start_corp2.items()
      if sum(gold_std_dict[stream_id]) > 1
  }
  gold_std_dict = {
      stream_id: gold_std_dict[stream_id]
      for stream_id in gold_std_dict
      if stream_id in prediction_json_im_noswitch_corp2
  }

In [None]:
# Make sure the gold standard only holds streams that have clustering predictions.
gold_std_dict = {
    stream_id: gold_std_dict[stream_id]
    for stream_id in gold_std_dict
    if stream_id in prediction_json_im_noswitch_corp2
}
# Mean scores for clustering without switch, with switch and with switch-start, in that order.
noswitch_scores_corp2, switch_scores_corp2, switch_start_scores_corp2 = [
    calculate_mean_scores(gold_std_dict, predictions)
    for predictions in (prediction_json_im_noswitch_corp2,
                        prediction_json_im_switch_corp2,
                        prediction_json_im_switch_start_corp2)
]
# Show the table for clustering without switch.
noswitch_scores_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.54,0.54,0.54,6476,0.53-0.55,0.53-0.55,0.53-0.55
Bcubed,0.71,0.78,0.66,6476,0.7-0.72,0.77-0.79,0.65-0.67
WindowDiff,0.41,0.41,0.41,6476,0.4-0.42,0.4-0.42,0.4-0.42
Block,0.07,0.07,0.07,6476,0.06-0.08,0.06-0.08,0.06-0.08
Weighted Block,0.25,0.25,0.25,6476,0.24-0.26,0.24-0.26,0.24-0.26


In [None]:
# Mean scores for the switch clustering predictions (prediction_json_im_switch_corp2).
switch_scores_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.58,0.58,0.58,6476,0.57-0.59,0.57-0.59,0.57-0.59
Bcubed,0.77,0.81,0.72,6476,0.76-0.78,0.8-0.82,0.71-0.73
WindowDiff,0.59,0.59,0.59,6476,0.58-0.6,0.58-0.6,0.58-0.6
Block,0.37,0.37,0.37,6476,0.36-0.38,0.36-0.38,0.36-0.38
Weighted Block,0.48,0.48,0.48,6476,0.47-0.49,0.47-0.49,0.47-0.49


In [None]:
# Mean scores for the switch-start clustering predictions (prediction_json_im_switch_start_corp2).
switch_start_scores_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.61,0.61,0.61,6476,0.6-0.62,0.6-0.62,0.6-0.62
Bcubed,0.78,0.82,0.74,6476,0.77-0.79,0.81-0.83,0.73-0.75
WindowDiff,0.61,0.61,0.61,6476,0.6-0.62,0.6-0.62,0.6-0.62
Block,0.41,0.41,0.41,6476,0.4-0.42,0.4-0.42,0.4-0.42
Weighted Block,0.52,0.52,0.52,6476,0.51-0.53,0.51-0.53,0.51-0.53


### VGG Clustering Results

In [None]:
# Rebuild the gold standard from scratch for the VGG runs: one label array per
# stream, with the pages in page order and the '__concatenated' suffix removed.
gold_std_dict = dict()

for doc_id, content in test_data.groupby('name'):
    content = content.sort_values(by='page')
    gold_std = content['label'].values
    repl_doc_id = doc_id.replace('__concatenated', '')
    gold_std_dict.update({repl_doc_id: gold_std})

In [None]:
from tqdm import tqdm
used_standard = gold_std_dict
used_corpus = test_data
# These three aliases are not referenced anywhere else in this cell.
text_vectors_used = test_text_finetuned_vectors
image_vectors_used = new_image_vectors
vector_dict_used = vector_dict


# Predictions per stream, one dict per clustering variant.
prediction_json_im_noswitch_VGG_corp2 = {}
prediction_json_im_switch_VGG_corp2 = {}
prediction_json_im_switch_start_VGG_corp2 = {}

# Distance lists per stream for the same three variants.
dist_lists_noswitch_VGG_corp2 ={}
dist_lists_switch_VGG_corp2 ={}
dist_lists_switch_start_VGG_corp2 ={}

# The folder is the same for every stream (the path has no placeholder).
corpus_im_folder = '/content/gdrive/MyDrive/master_thesis/new_corpus/VGG_vectors'

for doc_id, content in tqdm(used_corpus.groupby('name')):
    repl_doc_id = doc_id.replace('__concatenated','')
    gold_std = used_standard[repl_doc_id]
    # One label per page; the label sum is the number of documents.
    n_docs = sum(gold_std)
    n_pages = len(gold_std)

    # NOTE(review): the file name uses the raw doc_id here, whereas the
    # finetuned-vector cell uses repl_doc_id - confirm this matches how the
    # VGG vectors were saved.
    im_page_vectors_path = os.path.join(corpus_im_folder,doc_id+'.npy')
    if not os.path.exists(im_page_vectors_path):
        # No stored VGG vectors for this stream.
        continue

    im_page_vectors = np.load(im_page_vectors_path)
    if n_pages <= 1:
        # A single page cannot be clustered.
        continue

    ## Using switch
    dist_list_switch, preds_switch = cluster_with_switch(gold_std,im_page_vectors, False,True)
    dist_list_switch_start, preds_switch_start = cluster_with_switch(gold_std,im_page_vectors,True,True)

    prediction_json_im_switch_VGG_corp2[repl_doc_id] = preds_switch
    dist_lists_switch_VGG_corp2[repl_doc_id] = dist_list_switch

    prediction_json_im_switch_start_VGG_corp2[repl_doc_id] = preds_switch_start
    dist_lists_switch_start_VGG_corp2[repl_doc_id] = dist_list_switch_start


    ## Not using switch
    c_mat = page_con_matrix(n_pages)
    # Cosine distance between every pair of consecutive pages.
    dist_list = np.array([
        distance.cosine(im_page_vectors[i], im_page_vectors[i+1])
        for i in range(len(im_page_vectors)-1)
    ])
    if len(dist_list) >1:
        # Min-max normalise. When all distances are equal the range is 0 and
        # the division would only produce NaNs, so zeros are stored instead.
        dist_range = np.max(dist_list) - np.min(dist_list)
        if dist_range == 0:
            dist_list_norm = np.zeros_like(dist_list)
        else:
            dist_list_norm = (dist_list - np.min(dist_list)) / dist_range
    else:
        dist_list_norm = dist_list

    dist_lists_noswitch_VGG_corp2[repl_doc_id] = dist_list_norm

    # NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
    # releases; kept as-is to match the version this notebook was run with.
    cluster = AgglomerativeClustering(n_clusters=n_docs, affinity='cosine', linkage='average',compute_distances = True, connectivity = c_mat)
    image_predictions = cluster.fit_predict(im_page_vectors)

    prediction_json_im_noswitch_VGG_corp2[repl_doc_id] = length_list_to_bin(groups_to_lengths(image_predictions))

100%|██████████| 32/32 [00:51<00:00,  1.62s/it]


In [None]:
min2 = True
if min2:
  # Keep only streams whose gold standard contains more than one document
  # (label sum > 1), then shrink the gold standard to the surviving streams.
  prediction_json_im_noswitch_VGG_corp2 = {
      stream_id: labels
      for stream_id, labels in prediction_json_im_noswitch_VGG_corp2.items()
      if sum(gold_std_dict[stream_id]) > 1
  }
  prediction_json_im_switch_VGG_corp2 = {
      stream_id: labels
      for stream_id, labels in prediction_json_im_switch_VGG_corp2.items()
      if sum(gold_std_dict[stream_id]) > 1
  }
  prediction_json_im_switch_start_VGG_corp2 = {
      stream_id: labels
      for stream_id, labels in prediction_json_im_switch_start_VGG_corp2.items()
      if sum(gold_std_dict[stream_id]) > 1
  }
  gold_std_dict = {
      stream_id: gold_std_dict[stream_id]
      for stream_id in gold_std_dict
      if stream_id in prediction_json_im_noswitch_VGG_corp2
  }

In [None]:
# Make sure the gold standard only holds streams that have VGG clustering predictions.
gold_std_dict = {
    stream_id: gold_std_dict[stream_id]
    for stream_id in gold_std_dict
    if stream_id in prediction_json_im_noswitch_VGG_corp2
}
# Mean scores for VGG clustering without switch, with switch and with switch-start, in that order.
noswitch_scores_VGG_corp2, switch_scores_VGG_corp2, switch_start_scores_VGG_corp2 = [
    calculate_mean_scores(gold_std_dict, predictions)
    for predictions in (prediction_json_im_noswitch_VGG_corp2,
                        prediction_json_im_switch_VGG_corp2,
                        prediction_json_im_switch_start_VGG_corp2)
]
# Show the table for VGG clustering without switch.
noswitch_scores_VGG_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.49,0.49,0.49,6476,0.48-0.5,0.48-0.5,0.48-0.5
Bcubed,0.71,0.72,0.6,6476,0.7-0.72,0.71-0.73,0.59-0.61
WindowDiff,0.33,0.33,0.33,6476,0.32-0.34,0.32-0.34,0.32-0.34
Block,0.11,0.11,0.11,6476,0.1-0.12,0.1-0.12,0.1-0.12
Weighted Block,0.24,0.24,0.24,6476,0.23-0.25,0.23-0.25,0.23-0.25


In [None]:
# Mean scores for the VGG switch clustering predictions (prediction_json_im_switch_VGG_corp2).
switch_scores_VGG_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.42,0.42,0.42,6476,0.41-0.43,0.41-0.43,0.41-0.43
Bcubed,0.7,0.7,0.59,6476,0.69-0.71,0.69-0.71,0.58-0.6
WindowDiff,0.39,0.39,0.39,6476,0.38-0.4,0.38-0.4,0.38-0.4
Block,0.17,0.17,0.17,6476,0.16-0.18,0.16-0.18,0.16-0.18
Weighted Block,0.29,0.29,0.29,6476,0.28-0.3,0.28-0.3,0.28-0.3


In [None]:
# Mean scores for the VGG switch-start clustering predictions (prediction_json_im_switch_start_VGG_corp2).
switch_start_scores_VGG_corp2

Unnamed: 0,precision,recall,F1,support,CI Precision,CI Recall,CI F1
Boundary,0.43,0.43,0.43,6476,0.42-0.44,0.42-0.44,0.42-0.44
Bcubed,0.7,0.71,0.59,6476,0.69-0.71,0.7-0.72,0.58-0.6
WindowDiff,0.38,0.38,0.38,6476,0.37-0.39,0.37-0.39,0.37-0.39
Block,0.18,0.18,0.18,6476,0.17-0.19,0.17-0.19,0.17-0.19
Weighted Block,0.29,0.29,0.29,6476,0.28-0.3,0.28-0.3,0.28-0.3


In [None]:
from collections import Counter

METRIC_LIST= ['Boundary','Bcubed','WindowDiff','Block','Weighted Block']
## Per-stream total score of the VGG clustering without switch.
## The metrics of one stream are combined via sum_metrics over METRIC_LIST.
## A comprehension is used so the builtin `id` is not rebound at notebook level.
corp2_results_individual_VGG = {
    stream_id: sum_metrics(
        calculate_metrics_one_stream(gold_std_dict[stream_id], noswitch_pred),
        METRIC_LIST)
    for stream_id, noswitch_pred in prediction_json_im_noswitch_VGG_corp2.items()
}

In [None]:
from collections import Counter

METRIC_LIST= ['Boundary','Bcubed','WindowDiff','Block','Weighted Block']
## Per-stream total score of the finetuned-vector clustering with switch-start.
## The metrics of one stream are combined via sum_metrics over METRIC_LIST.
## A comprehension is used so the builtin `id` is not rebound at notebook level.
## NOTE(review): gold_std_dict was last narrowed to the VGG streams; this
## assumes the finetuned predictions cover the same streams - confirm.
corp2_results_individual = {
    stream_id: sum_metrics(
        calculate_metrics_one_stream(gold_std_dict[stream_id], switch_start_pred),
        METRIC_LIST)
    for stream_id, switch_start_pred in prediction_json_im_switch_start_corp2.items()
}

In [None]:
## Rank the streams by their total VGG clustering score, highest first.
## Author's note (not verifiable from this cell alone): switch clustering can
## lead to some small improvements in 3 streams, but overall lowers the
## results, in some cases even drastically.

print('Scores of clustering using VGG vectors')
print()
vgg_ranking = Counter(corp2_results_individual_VGG).most_common()
for stream_name, score in vgg_ranking:
  stream_gold = gold_std_dict[stream_name]
  print('{} has length {}, {} docs and score {}'.format(
      stream_name, len(stream_gold), sum(stream_gold), np.round(score,3)))

Scores of clustering using VGG vectors

d8d9c5015c9ceb952052f29e1a27ed1f_openbaar-te-maken-documenten-deel-1_3 has length 68, 27 docs and score 2.979
stream_2 has length 103, 66 docs and score 2.275
stream_45 has length 180, 40 docs and score 2.228
8fa815c695a9811628e76705e839dcbc_deels-openbare-documenten has length 36, 27 docs and score 2.204
stream_15 has length 367, 167 docs and score 2.122
ff9481e357a59c506fc16db2aab2411a has length 72, 14 docs and score 1.995
stream_35 has length 1103, 500 docs and score 1.941
dc034afbaede3d587451c7062fd857e7_bijlage-c1-openbaar-te-maken-documenten-eu has length 57, 24 docs and score 1.918
stream_16 has length 50, 2 docs and score 1.882
stream_28 has length 37, 25 docs and score 1.868
stream_68 has length 416, 216 docs and score 1.813
stream_74 has length 1163, 400 docs and score 1.8
stream_79 has length 2013, 500 docs and score 1.756
stream_21 has length 1995, 736 docs and score 1.751
stream_43 has length 7, 3 docs and score 1.742
stream_6 has l

In [None]:
## Rank the streams by their total finetuned-vector clustering score, highest first.
## Author's note (not verifiable from this cell alone): switch clustering can
## lead to some small improvements in 3 streams, but overall lowers the
## results, in some cases even drastically.

print('Scores of clustering using finetuned vectors')
print()
finetuned_ranking = Counter(corp2_results_individual).most_common()
for stream_name, score in finetuned_ranking:
  stream_gold = gold_std_dict[stream_name]
  print('{} has length {}, {} docs and score {}'.format(
      stream_name, len(stream_gold), sum(stream_gold), np.round(score,3)))

Scores of clustering using finetuned vectors

stream_28 has length 37, 25 docs and score 3.874
8fa815c695a9811628e76705e839dcbc_deels-openbare-documenten has length 36, 27 docs and score 3.837
ff9481e357a59c506fc16db2aab2411a has length 72, 14 docs and score 3.61
stream_21 has length 1995, 736 docs and score 3.59
stream_35 has length 1103, 500 docs and score 3.589
stream_48 has length 391, 142 docs and score 3.557
stream_43 has length 7, 3 docs and score 3.368
stream_37 has length 5187, 696 docs and score 3.33
stream_81 has length 2379, 500 docs and score 3.289
stream_15 has length 367, 167 docs and score 3.285
stream_75 has length 2487, 494 docs and score 3.27
stream_80 has length 2664, 485 docs and score 3.233
stream_69 has length 295, 115 docs and score 3.202
stream_9 has length 72, 14 docs and score 3.117
stream_68 has length 416, 216 docs and score 3.094
stream_79 has length 2013, 500 docs and score 3.082
stream_24 has length 639, 209 docs and score 3.072
stream_6 has length 533, 