# Preprocessing notebook

In [56]:
import os
import json
from config import original_datasets, processed_datasets
from pathlib import Path
import json
import pandas as pd

## Import data

### Functions

In [2]:
# Adapted from Nath (2021)

def verify_file_exists(file_list,data_path):
    for file in file_list:
        if not Path(data_path+"/"+file).exists():
            print("Error! Could not find file: ",data_path+file)

def convertPAN_to_siamese_format(text, truth_data, problem_file):
    a1_list = []
    a2_list = []
    p1_list = []
    p2_list = []
    p_list = []
    paragraph_authors = truth_data['paragraph-authors']
    paragraphs = text.split("\n")
    non_zero_len_paragraphs = [para for para in paragraphs if len(para)>0] # problem-601 creates an empty paragraph causing the problem. so had to remove empty paragraphs
    if len(non_zero_len_paragraphs)!= len(paragraph_authors):
        print(f"Error! Number of paragraphs {len(non_zero_len_paragraphs)} does not equal paragraph authors {len(paragraph_authors)}! God help you now!")
        print(paragraph_authors)
        for para in non_zero_len_paragraphs:
            print(f"\n len: {len(para)} {para}") # For debugging
            return None, None, None, None
    else:
        for i in range(0, len(paragraph_authors)):
            a1 = paragraph_authors[i]
            if i+1<len(paragraph_authors) and (i+1)< len(paragraphs):
                a2 = paragraph_authors[i+1]
                p1 = non_zero_len_paragraphs[i]
                p2 = non_zero_len_paragraphs[i+1]
                a1_list.append(a1)
                p1_list.append(p1)
                a2_list.append(a2)
                p2_list.append(p2)
                p_list.append(problem_file)
    return p_list, a1_list, a2_list, p1_list, p2_list

def read_text_json(problem_path, truth_path):
    with open(problem_path) as f:
        text = f.read()
    print("len:", len(text))
    with open(truth_path) as json_file:
        data = json.load(json_file)
    return text, data

### Import files

In [4]:
# Code adapted from Nath (2021)

# We construct a loop for all four datasets (last year and this year) + task 2

dataset_keys = original_datasets.keys() # Get the keys

for dataset_key in dataset_keys:
    data_path = original_datasets[dataset_key]

    # Get and verify files

    print(data_path)
    os.listdir(data_path)
    base = [file.split(".")[0] for file in os.listdir(data_path) if "txt" in file]
    truth_files = ["truth-"+file+".json" for file in base]
    problem_files = [file+".txt" for file in base] 

    verify_file_exists(truth_files, data_path)
    verify_file_exists(problem_files, data_path)

    # Load and process files

    a1_list = []
    a2_list = []
    p1_list = []
    p2_list = []
    p_list = []

    for i in range(0,len(truth_files)): # E.g. truth-problem-1432.json
        truth_file = truth_files[i]
        problem_file = problem_files[i] # E.g. problem-1432.txt
        # print((truth_file, problem_file)) # For debugging
        text, data = read_text_json(data_path+"/"+problem_file, data_path+"/"+truth_file)
        p_list_temp, a1_list_temp, a2_list_temp, p1_list_temp, p2_list_temp = convertPAN_to_siamese_format(text,data, problem_file)
        if p_list_temp and a1_list_temp and a2_list_temp and p1_list_temp and p2_list_temp:
            p_list.extend(p_list_temp)
            a1_list.extend(a1_list_temp)
            a2_list.extend(a2_list_temp)
            p1_list.extend(p1_list_temp)
            p2_list.extend(p2_list_temp)
    data = {"problem":p_list, "author_1":a1_list, "author_2":a2_list, "para1_text": p1_list, "para2_text":p2_list}
    # print(len(p_list), len( a1_list),  len( a2_list),  len( p1_list), len( p2_list)) # For debugging
    
    # Put in dataframe and save as csv

    df = pd.DataFrame(data)
    df.to_csv(processed_datasets[dataset_key], index=False)

/Users/asmusharre/Documents/GitHub/project-2022-group-4/data/pan21/train
len: 2944
len: 1695
len: 2868
len: 1612
len: 2847
len: 1069
len: 1674
len: 3329
len: 2791
len: 1050
len: 1035
len: 1971
len: 1987
len: 1995
len: 1439
len: 1453
len: 1074
len: 1018
len: 1105
len: 2736
len: 1171
len: 1781
len: 2141
len: 1052
len: 1472
len: 1013
len: 1760
len: 1090
len: 2606
len: 4402
len: 1285
len: 1104
len: 3076
len: 2822
len: 1599
len: 1710
len: 1155
len: 1100
len: 1136
len: 1141
len: 1112
len: 2405
len: 2090
len: 1386
len: 1138
len: 1741
len: 1458
len: 1392
len: 1342
len: 1397
len: 1733
len: 2647
len: 1039
len: 1628
len: 1329
len: 1913
len: 1622
len: 1216
len: 1160
len: 2585
len: 2273
len: 1380
len: 2116
len: 2256
len: 2194
len: 3023
len: 1027
len: 1416
len: 2487
len: 1278
len: 2081
len: 2683
len: 1003
len: 1334
len: 1873
len: 1592
len: 2383
len: 1420
len: 1482
len: 1752
len: 1303
len: 1945
len: 1379
len: 1707
len: 1395
len: 1124
len: 1221
len: 1748
len: 2773
len: 1276
len: 2198
len: 2266
len: 23

### Check data

In [5]:
# Dimensions
# Last years data seems bigger than this years

dataset_keys = processed_datasets.keys() # Get the keys

for dataset_key in dataset_keys:
    training_data = pd.read_csv(processed_datasets[dataset_key])
    print(training_data.shape)

(66052, 5)
(14095, 5)
(9589, 5)
(2141, 5)
(45723, 5)
(9537, 5)


In [3]:
# Training 2022

pd.read_csv(processed_datasets['train_2022_task2']).head()

Unnamed: 0,problem,author_1,author_2,para1_text,para2_text
0,problem-1734.txt,1,2,Only if you notice sites breaking on you. It ...,I have always believed that passing any test i...
1,problem-1734.txt,2,3,I have always believed that passing any test i...,No. Acid3 is a goal or benchmark for browsers;...
2,problem-1734.txt,3,2,No. Acid3 is a goal or benchmark for browsers;...,"FWIW, my copy of FireFox scores 72/100 which i..."
3,problem-1734.txt,2,4,"FWIW, my copy of FireFox scores 72/100 which i...","No, your browser passing the Acid 3 test isn't..."
4,problem-3123.txt,1,2,You can simply force redirection on default we...,The old trick to just redirect the Default Web...


In [37]:
# Training 2022

pd.read_csv(processed_datasets['valid_2022']).head()

Unnamed: 0,problem,author_1,author_2,para1_text,para2_text
0,problem-136.txt,1,2,"I would go with Fiber, there is the possibilit...",Most of the key issues have been mentioned - F...
1,problem-136.txt,2,2,Most of the key issues have been mentioned - F...,One other key difference is that at the moment...
2,problem-122.txt,1,1,If you have the enterprise version of the soft...,If you don't have an enterprise version (you g...
3,problem-122.txt,1,2,If you don't have an enterprise version (you g...,"Sorry, you are ""done"" - MS does not support MS..."
4,problem-122.txt,2,2,"Sorry, you are ""done"" - MS does not support MS...",I sugest geting the (official) logon script wo...


### Descriptive statistics

In [63]:
training_data = pd.read_csv(processed_datasets['train_2022_task2'])
training = training_data.iloc[0:int(0.8*len(training_data))].copy()
validation = training_data.iloc[int(0.8*len(training_data)):].copy()
#Validation set is testing set for our purposes
testing = pd.read_csv(processed_datasets['valid_2022_task2'])

In [64]:
#Calculating proportion of times when author changes
training['same_author'] = training.apply(lambda x:  1 if x['author_1'] != x['author_2'] else 0, axis=1)
validation['same_author'] = validation.apply(lambda x:  1 if x['author_1'] != x['author_2'] else 0, axis=1)
testing['same_author'] = testing.apply(lambda x:  1 if x['author_1'] != x['author_2'] else 0, axis=1)

In [70]:
print("Training - proportion of para pairs which are a change",sum(training['same_author'])/training.shape[0])
print("Validation - proportion of para pairs which are a change",sum(validation['same_author'])/validation.shape[0])
print("Testing - proportion of para pairs which are a change",sum(testing['same_author'])/testing.shape[0])

Training - proportion of para pairs which are a change 0.6160533654109027
Validation - proportion of para pairs which are a change 0.6104975396391471
Testing - proportion of para pairs which are a change 0.6245150466603754


In [65]:
print("Number of paragraph pairs in training data:",training.shape[0])
print("Number of paragraph pairs in validation data:",validation.shape[0])
print("Number of paragraph pairs in testing data:",testing.shape[0])

Number of paragraph pairs in training data: 36578
Number of paragraph pairs in validation data: 9145
Number of paragraph pairs in testing data: 9537


In [72]:
#Calculating number of words
training['words_in_para1'] = training.para1_text.apply(lambda x: 1+len(x.strip())-len(x.strip().replace(" ","")))
training['words_in_para2'] = training.para2_text.apply(lambda x: 1+len(x.strip())-len(x.strip().replace(" ","")))
validation['words_in_para1'] = validation.para1_text.apply(lambda x: 1+len(x.strip())-len(x.strip().replace(" ","")))
validation['words_in_para2'] = validation.para2_text.apply(lambda x: 1+len(x.strip())-len(x.strip().replace(" ","")))
testing['words_in_para1'] = testing.para1_text.apply(lambda x: 1+len(x.strip())-len(x.strip().replace(" ","")))
testing['words_in_para2'] = testing.para2_text.apply(lambda x: 1+len(x.strip())-len(x.strip().replace(" ","")))

In [73]:
#Training and validation data
print('Training:')
print("mean no. of words",training['words_in_para1'].append(training['words_in_para2']).mean())
print("median no. of words",training['words_in_para1'].append(training['words_in_para2']).median())
print("min no. of words",training['words_in_para1'].append(training['words_in_para2']).min())
print("max no. of words",training['words_in_para1'].append(training['words_in_para2']).max())

print('\nValidation:')
print("mean no. of words",validation['words_in_para1'].append(validation['words_in_para2']).mean())
print("median no. of words",validation['words_in_para1'].append(validation['words_in_para2']).median())
print("min no. of words",validation['words_in_para1'].append(validation['words_in_para2']).min())
print("max no. of words",validation['words_in_para1'].append(validation['words_in_para2']).max())

print('\nTesting')
print("mean no. of words",testing['words_in_para1'].append(testing['words_in_para2']).mean())
print("median no. of words",testing['words_in_para1'].append(testing['words_in_para2']).median())
print("min no. of words",testing['words_in_para1'].append(testing['words_in_para2']).min())
print("max no. of words",testing['words_in_para1'].append(testing['words_in_para2']).max())

Training:
mean no. of words 43.79487670184264
median no. of words 37.0
min no. of words 1
max no. of words 819

Validation:
mean no. of words 43.917277200656095
median no. of words 37.0
min no. of words 1
max no. of words 251

Testing
mean no. of words 43.68653664674426
median no. of words 37.0
min no. of words 1
max no. of words 228


In [66]:
#Calculating average number of authors per document in training data
number_of_authors = []
for problem_id in list(set(training.problem)):
    temp_df = training[training.problem == problem_id]
    number_of_authors.append(len(list(set(temp_df.author_1.append(temp_df.author_2)))))
print("Mean number of authors in training docs:", sum(number_of_authors)/len(number_of_authors))

Mean number of authors in training docs: 3.000356760613628


In [67]:
#Calculating average number of authors per document in validation data
number_of_authors = []
for problem_id in list(set(validation.problem)):
    temp_df = validation[validation.problem == problem_id]
    number_of_authors.append(len(list(set(temp_df.author_1.append(temp_df.author_2)))))
print("Mean number of authors in validation docs:", sum(number_of_authors)/len(number_of_authors))

Mean number of authors in validation docs: 2.997132616487455


In [51]:
#Calculating average number of authors per document in testing data
number_of_authors = []
for problem_id in list(set(testing.problem)):
    temp_df = testing[testing.problem == problem_id]
    number_of_authors.append(len(list(set(temp_df.author_1.append(temp_df.author_2)))))
print("Mean number of authors in training docs:", sum(number_of_authors)/len(number_of_authors))

Mean number of authors in training docs: 3.0
