                                                                                        Patricia Grau Francitorra

# Annotation comparison

Before starting, I downloaded the newspapers annotations and saved them in two folders depending on who was the annotator. The resulting folders are annotator1/ and annotator2/. I created another folder called results/ in which to save the results of the annotations. This folders contains two other folders: different_info_year/ and same_info_year/.

Some changes were made to some files:

- One file in annotator2/ was missing a hyphen.
- Some files in annotator1/ were lowercased.
- There are four files whose values are separated by commas. I changed it to tabs with the linux command tr ',' '\t' < inputfile > output file and erased the old file. The files are:
    - annotator1/WEXJЩBLADET-1824-08-21-0002-GDDHS.csv
    - annotator1/FREJA-1838-05-18-0002-GDDHS.csv
    - annotator1/GЩTEBORGSPOSTEN-1882-08-14-0004-GDDHS.csv
    - annotator1/UMEBLADET-1861-12-14-0004-GDDHS.csv
- One file (KARLSHAMNS-ALLEHANDA-1856-06-23-0001-) had some annotation in the LOW QUALITY INK line. I deleted it for both anntoator1 and annotator2.

In [1]:
import os, sys
from os import listdir

import pandas as pd
import numpy as np

### Checking for files in both directories

In [2]:
def get_files():
    """
    Checks the two directories with the annotations and returns
    information on the files with the same and different names
    in both directories.
    
    Returns:
      - there: a list of all files that are both annotated 
        by annotator1 and annotator2.
      - filenames: a dictionary of the files that are annotated 
        by both people, with the name of the newspaper as the 
        key and the name of the files as the values.
      - missing: a list of the files that are only annotated
        by one annotator.
    """
    dir1 = os.listdir('annotator1')
    dir2 = os.listdir('annotator2')

    ann1 = 'GDDHS.csv'
    ann2 = 'gddea.csv'
    
    unnamed1 = []
    unnamed2 = []

    for a in dir1:
        if a.endswith('.csv'):
            unnamed1.append(a[:-9]) # removes the name of the annotator from the filename

    for b in dir2:
        if b.endswith('.csv'):
            unnamed2.append(b[:-9])

    setu1 = set(unnamed1)
    setu2 = set(unnamed2)

    there = list(setu1.intersection(setu2))
    missing = list(setu1-setu2)    

    filenames = {}
    for file in there:

        split_file = file.split('-')
        newspaper_name = '-'.join(split_file[:-5])

        if newspaper_name not in filenames:
            filenames[newspaper_name] = [file]
        else:
            filenames[newspaper_name].append(file)
    
    return there, filenames, missing

In [3]:
file_list, filenames, missing = get_files()

In [4]:
missing
# These are the files that were annotated only by one person.

['JОMTLANDSPOSTEN-1925-01-13-0002-',
 'GЩTHEBORGS-ALLEHANDA-1833-08-20-0004-',
 'KARLSHAMNS-ALLEHANDA-1856-06-23-0001-GDDHS_annotator1',
 'DAGENS-NYHETER-1975-02-01-0004-',
 'DAGENS-NYHETER-1975-02-01-0002-',
 'SVENSKA-DAGBLADET-1938-08-24-0004-',
 'SVENSKA-DAGBLADET-1938-08-24-0002-',
 'GЩTHEBORGS-ALLEHANDA-1833-08-20-0002-',
 'GЩTEBORGS-HANDELS--OCH-SJЩFARTSTIDNING-1832-10-15-0001-']

In [None]:
"""
Saving number of files per newspaper name.

Saves the number of files of each newspaper 
in the folder results/ as a csv file.
"""
num_files = []
for k, v in filenames.items():
    t = (k, len(v))
    num_files.append(t)
    
newsp_cols = pd.DataFrame(num_files, columns =['Newspaper name', 'Number of files'])
newsp_cols.to_csv(path_or_buf='results/num_files.csv', sep=',')

In [None]:
"""
Saving number of files per year.

This cell saves in the folder results/ the number 
of files available per year in a csv file.
"""
num_files_per_year = {}
for file in file_list:
    split_file = file.split('-')
    newspaper_name = '-'.join(split_file[:-5])
    year = split_file[-5]
    
    filename = 'annotator1/' + file + 'GDDHS.csv'
    with open(filename,"r") as f:
        annotations = len(f.readlines()) - 6
        
    if year in num_files_per_year:
        if newspaper_name in num_files_per_year[year]:
            num_files_per_year[year][newspaper_name][0] += 1
            num_files_per_year[year][newspaper_name][1] += annotations
        else:
            num_files_per_year[year][newspaper_name] = [1, annotations]
    else:
        num_files_per_year[year] = {newspaper_name : [1, annotations]}

        
nice_excel_num_files_per_year = []
for year, v in num_files_per_year.items():
    for newsp, num in v.items():
        nice_excel_num_files_per_year.append([year, newsp, num[0], num[1]])
files_year = pd.DataFrame(nice_excel_num_files_per_year)
files_year.to_csv(path_or_buf='results/num_files_per_year.csv', sep=',')

### Checking if all files have the same number of columns 

In [None]:
# A file which is missing no columns and has the correct annotations
f1 = 'annotator1/AFTONBLADET-1827-07-02-0002-GDDHS.csv'
df1 = pd.read_csv(f1, delimiter = '\t').fillna('Ignore')
col1 = list(df1.columns)

# Let's look for files which differ from the "good" one
miss = []
n = 0
for news in filenames:
    for file in filenames[news]:
        f2 = 'annotator2/' + file + 'gddea.csv'
        df2 = pd.read_csv(f2, delimiter = '\t').fillna('Ignore')
        cols = list(df2.columns)
        for e in col1:
            if e not in cols:
                n+= 1
                miss.append(e)
print(n)
print(miss)

"""
10 texts from annotator2 are missing the column 'skew text/Y/N'. 
No other columns are missing. 
No texts are missing from annotator1 (checked by copying the cell
and looking in the directory annotator1/).
"""

### Saving the number of annotations in a file

In [None]:
num_annotations
num_ann_df = pd.DataFrame(num_annotations, columns =['Newspaper', 'Annotations'])
num_ann_df.to_csv('num_annotations.csv')

### Information per year

In [5]:
def info_year_ignore_newspaper():
    """
    Gets the shared information (same annotations)
    for the files which are in both directories 
    (annotator1/ and annotator2/).
    
    Returns:
      - info: a dictionary containing the segment-
        level information that the annotators 
        agreed upon.
      - info_gral: a dictionary containing the 
        document-level information that the 
        annotators agreed upon.
      - annotations: the supposed number of segment-
        level annotations that all files should have.
    """
    
    file_list, filenames, missing = get_files()

    # clm : columns, lqi : low quality ink, cont : content, clr : colour print, skew : skew text
    info = {}

    # clm : columns, wpc : White paper colour, im : Photographic images, lqi : low quality ink
    info_gral = {}

    equiv_gral = {1 : 'clm_gral', 2 : 'wpc_gral', 3 : 'im_gral', 4 : 'lqi_gral'}
    equiv = {2 : 'clm', 3 : 'lqi', 4 : 'cont', 5 : 'clr', 6 : 'skew'}
    
    annotations = 0
    
    for newsp in file_list:
        
        f1 = 'annotator1/' + newsp + 'GDDHS.csv'
        f2 = 'annotator2/' + newsp + 'gddea.csv'

        df1 = pd.read_csv(f1, delimiter = '\t')
        df2 = pd.read_csv(f2, delimiter = '\t')
        
        col1 = list(df1.columns)
        col2 = list(df2.columns)
        
        if len(col1) > len(col2):
            df1 = df1.iloc[:, :-1]
        
        year = newsp.split('-')[-5]
        annotations += df1.shape[0] - 5

        for i, (x, y) in enumerate(zip(df1.values, df2.values)):
            
            comp = list(zip(x, y))
            
            for n, tup in enumerate(comp[1:], start=1): # to ignore the first element of each line
                
                if tup[0] == tup[1]:
                    key = str(tup[0])
                    if i == 0: # article name
                        pass
                    elif i < 5:
                        topic = equiv_gral[i]
                        if year in info_gral:                            
                            if key in info_gral[year][topic]:
                                info_gral[year][topic][key] += 1
                            else:
                                info_gral[year][topic][key] = 1
                        else:
                            info_gral[year] = {'clm_gral' : {}, 'wpc_gral' : {}, 'im_gral' : {}, 'lqi_gral' : {}}
                            info_gral[year][topic][key] = 1
                    else:
                        if n != 0 and n!=1:
                            topic = equiv[n]
                            if year in info:                                
                                if key in info[year][topic]:
                                    info[year][topic][key] += 1
                                else:
                                    info[year][topic][key] = 1
                            else:
                                info[year] = {'clm' : {}, 'lqi' : {}, 'cont' : {}, 'clr' : {}, 'skew' : {}}
                                info[year][topic][key] = 1
    
    return info, info_gral, annotations

In [6]:
info_year, info_gral_year, annotations = info_year_ignore_newspaper()

In [8]:
def comparison_year_ignore_newspaper():
    """
    Gets the different information (different 
    annotations) for the files which are in both 
    directories (annotator1/ and annotator2/).
    
    Returns:
      - info: a dictionary containing the segment-
        level informations that the annotators 
        disagreed upon.
      - info_gral: a dictionary containing the 
        document-level information that the 
        annotators disagreed upon.
      - annotations: a dictionary containing the 
        supposed number of segment-level annotations 
        that all files should have per year.
    """
    
    file_list, filenames, missing = get_files()

    # clm : columns, lqi : low quality ink, cont : content, clr : colour print, skew : skew text
    newspaper = {}

    # clm : columns, wpc : White paper colour, im : Photographic images, lqi : low quality ink
    newspaper_gral = {}

    equiv_gral = {1 : 'clm_gral', 2 : 'wpc_gral', 3 : 'im_gral', 4 : 'lqi_gral'}
    equiv = {2 : 'clm', 3 : 'lqi', 4 : 'cont', 5 : 'clr', 6 : 'skew'}
    
    num_annotations = {}
    
    for newsp in file_list:
        
        f1 = 'annotator1/' + newsp + 'GDDHS.csv'
        f2 = 'annotator2/' + newsp + 'gddea.csv'

        df1 = pd.read_csv(f1, delimiter = '\t').fillna('Ignore') 
        df2 = pd.read_csv(f2, delimiter = '\t').fillna('Ignore')
        
        col1 = list(df1.columns)
        col2 = list(df2.columns)
        
        if len(col1) > len(col2):
            df1 = df1.iloc[:, :-1]
        
        year = newsp.split('-')[-5]
        
        if year in num_annotations:
            num_annotations[year] += (df1.shape[0] - 6)
        else:
            num_annotations[year] = (df1.shape[0] - 6)

        for i, (x, y) in enumerate(zip(df1.values, df2.values)):
            
            TF = x == y
            if False in TF:
                comp = list(zip(x, y))
                for n, tup in enumerate(comp):
                    if tup[0] != tup[1]:
                        key = str(tup)
                        if i == 0:
                            pass
                        elif i < 5:
                            topic = equiv_gral[i]
                            if year in newspaper_gral:
                                if key in newspaper_gral[year][topic]:
                                    newspaper_gral[year][topic][key] += 1
                                else:
                                    newspaper_gral[year][topic][key] = 1
                            else:
                                newspaper_gral[year] = {'clm_gral' : {}, 'wpc_gral' : {}, 'im_gral' : {}, 'lqi_gral' : {}}
                                newspaper_gral[year][topic][key] = 1
                        else:
                            if n != 0:
                                topic = equiv[n]
                                if year in newspaper:
                                    if key in newspaper[year][topic]:
                                        newspaper[year][topic][key] += 1
                                    else:
                                        newspaper[year][topic][key] = 1
                                else:
                                    newspaper[year] = {'clm' : {}, 'lqi' : {}, 'cont' : {}, 'clr' : {}, 'skew' : {}}
                                    newspaper[year][topic][key] = 1
    
    return newspaper, newspaper_gral, num_annotations

In [9]:
info_year_discrepancies, info_gral_year_discrepancies, num_ann_year = comparison_year_ignore_newspaper()

In [None]:
num_ann_df = pd.DataFrame(num_annotations, columns =['Newspaper', 'Annotations'])
num_ann_df.to_csv('num_annotations.csv')

### Saving data in files

In [None]:
def nice(final_news, col_name, tipus):
    """
    Saves the information from the previous
    functions into a csv file. Called in the
    next two functions.
    
    Arguments:
      - final_news: one of the first two outputs
        of the previous functions
      - col_name: the name of the type of infor-
        mation that we want to save (each column
        in the original output of the functions).
      - tipus: the information being saved, gathered
        from the files get_final_files and 
        get_final_gral_files.
    """
    final = []

    for k, v in final_news.items():
        l = [k]
        for tag in tipus:
            if tag in v[col_name]:
                l.append(v[col_name][tag])
            else:
                l.append(0)
        final.append(l)
    
    newsp_cols = pd.DataFrame(final, columns =['Year'] + tipus)

    # This has been commented/uncommented depending on the information that we want to save
    # Saving the information that the annotators agreed upon.
    newsp_cols.to_csv(path_or_buf='results/same_info_year/' + col_name + '.csv', sep=',')
    
    # Saving the information that the annotators disagreed upon
#     newsp_cols.to_csv(path_or_buf='results/different_info_year/' + col_name + '.csv', sep=',')

In [None]:
def get_final_files(final_news):
    """
    Saves the segment-level information in an 
    easy-to-read format to create a csv file 
    from a pandas dataframe.
    """
    
    cols = ['clm', 'lqi', 'cont', 'clr', 'skew']

    for thing in cols:
        l = []
            
        for k, v in final_news.items():
            if v[thing]:
                for val in v[thing].keys():
                    if val not in l:
                        l.append(val)

        nice(final_news, thing, l)

In [None]:
def get_final_gral_files(final_news):
    """
    Saves the document-level information in an 
    easy-to-read format to create a csv file 
    from a pandas dataframe.
    """
    
    cols = ['clm_gral', 'wpc_gral', 'im_gral', 'lqi_gral']

    for thing in cols:
        l = []
            
        for k, v in final_news.items():
            if v[thing]:
                for val in v[thing].keys():
                    if val not in l:
                        l.append(val)

        nice(final_news, thing, l)

In [None]:
get_final_files(info_year)
get_final_gral_files(info_gral_year)

In [None]:
# Remember to change the folder in which it is being saved!
get_final_files(info_year_discrepancies)
get_final_gral_files(info_gral_year_discrepancies)