## Preprocess ASSESS CT data
This notebook processes the original ASSESS CT .xlsx file as published by (Miñarro-Giménez et al., 2019). The data is filtered for German texts only and split up for export into .txt files to allow further use, e.g. for manual annotation.

In [74]:
import pandas as pd

# Read the ASSESS CT workbook. header=[0,1] tells pandas to treat the first two
# rows as a two-level column header, so df.columns starts out as (top, sub) pairs.
df = pd.read_excel('../data/ASSESS CT/pone.0209547.s002.xlsx', header=[0,1])

# Collapse the two header levels into single string labels: keep only the
# top-level name when the second level is an 'Unnamed:' placeholder, otherwise
# join both levels with a space (e.g. 'SCT_ONLY CODE').
df.columns = [
    top if sub.startswith('Unnamed:') else f'{top} {sub}'
    for top, sub in df.columns
]

# Quick look at the first rows to confirm the flattened column names.
print(df.head())


  Language Annotator_ID Snippet_ID  Sentence_ID     Token Chunk SCT_ONLY CODE  \
0       EN          EN1        NL0            0  Exercise     1           NaN   
1       EN          EN1        NL0            0  capacity     1           NaN   
2       EN          EN1        NL0            0        is     1           NaN   
3       EN          EN1        NL0            0     stage     1           NaN   
4       EN          EN1        NL0            0         2     1           NaN   

   SCT_ONLY CONCEPT COVERAGE SCT_ONLY TERM COVERAGE       UMLS_EXT CODE  \
0                        3.0                    NaN            C0948372   
1                        3.0                    NaN            C0948372   
2                        NaN                    NaN                 NaN   
3                        3.0                    NaN  C1882085, C1882086   
4                        3.0                    NaN            C1882085   

   UMLS_EXT CONCEPT COVERAGE UMLS_EXT TERM COVERAGE LOCAL CODE

In [75]:
# Filtering
# All rows whose Language column is 'DE', irrespective of annotator.
filtered_DE_df = df.loc[df['Language'] == 'DE']

# Per-annotator subsets for the annotator IDs 'DE1' and 'DE2'.
filtered_DE1_df = df.loc[df['Annotator_ID'] == 'DE1']
filtered_DE2_df = df.loc[df['Annotator_ID'] == 'DE2']

# Check possible duplicates / overlaps in the annotated datasets.
# NOTE(review): isin() on 'Token' compares token strings only, so these flags
# say whether one annotator's token vocabulary is contained in the other's,
# not whether both annotated the same snippets/sentences — confirm that this
# is a sufficient basis for dropping DE1.
de1_tokens_in_de2 = filtered_DE1_df['Token'].isin(filtered_DE2_df['Token']).all()
de2_tokens_in_de1 = filtered_DE2_df['Token'].isin(filtered_DE1_df['Token']).all()

print("All %d Tokens Annotated by DE1 are also part of DE2: %s" % (len(filtered_DE1_df), de1_tokens_in_de2))
print("All %0d Tokens Annotated by DE2 are also part of DE1:  %s" % (len(filtered_DE2_df), de2_tokens_in_de1))

# --> Proceeding with filtered_DE2_df only

All 1817 Tokens Annotated by DE1 are also part of DE2: True
All 5486 Tokens Annotated by DE2 are also part of DE1:  False


In [76]:
# Insights on sentence and token structure
# Per snippet: the highest Sentence_ID and the number of non-null tokens.
# Sentence_ID starts at 0, so max + 1 is used as the sentence count
# (assumes IDs are contiguous within a snippet — TODO confirm).
sentences_token_per_document = (
    filtered_DE2_df
    .groupby(['Snippet_ID'])
    .agg(Sentences=('Sentence_ID', 'max'), Tokens=('Token', 'count'))
    .assign(Sentences=lambda per_snippet: per_snippet['Sentences'] + 1)
)

# Summary statistics across all snippets (shown as the cell output).
sentences_token_per_document.describe()

Unnamed: 0,Sentences,Tokens
count,60.0,60.0
mean,9.55,91.416667
std,4.10632,16.451564
min,3.0,48.0
25%,7.0,84.0
50%,9.0,91.0
75%,12.0,99.25
max,25.0,138.0


In [77]:
# Export into .txt files
# Writes one file per snippet (<Snippet_ID>.txt) into output_dir. Each sentence
# becomes one line, with its tokens joined by single spaces; lines are separated
# by '\n' and there is no trailing newline after the last sentence.
import os

output_dir = './txt_output'

# exist_ok=True creates the directory if needed and makes the cell re-runnable
# without a separate existence check.
os.makedirs(output_dir, exist_ok=True)

# groupby(sort=False) yields snippets (and, below, sentences) in order of first
# appearance, so the source ordering is kept, while scanning the frame once
# instead of once per snippet and once per sentence.
for snippet_id, snippet_df in filtered_DE2_df.groupby('Snippet_ID', sort=False):
    sentences = [
        # astype(str) so that non-string tokens (e.g. numbers) can be joined.
        ' '.join(sentence_df['Token'].astype(str))
        for _sentence_id, sentence_df in snippet_df.groupby('Sentence_ID', sort=False)
    ]
    # Explicit UTF-8: the texts are German, and without an encoding argument
    # open() falls back to the platform's locale encoding, which can write or
    # reject umlauts/ß differently from machine to machine.
    snippet_path = os.path.join(output_dir, f"{snippet_id}.txt")
    with open(snippet_path, "w", encoding="utf-8") as f:
        f.write('\n'.join(sentences))