In [1]:
import pandas as pd


def adding_headers_and_add_IDs_to_sentences(input_filepath, output_filepath):
    """Prefix each token line of a CoNLL-U style file with a sentence ID and load it as a DataFrame.

    Lines starting with '#' are dropped. Blank lines separate sentences. Every
    remaining line is written to ``output_filepath`` as
    ``<sentence_id>\\t<original line>`` and also becomes one row of the returned
    DataFrame (split on tabs, all values kept as strings).

    Parameters
    ----------
    input_filepath : str
        Path of the tab-separated input file (read as UTF-8).
    output_filepath : str
        Path of the file that receives the ID-prefixed lines (overwritten).

    Returns
    -------
    pandas.DataFrame
        One row per token line. The first 12 columns are named (see
        ``named_columns`` below); any further columns are named ``col_12``,
        ``col_13``, ... Rows shorter than the widest row are padded with
        missing values. An input without token lines yields an empty frame
        that still carries the 12 named columns.
    """
    # sentence_id + the 11 columns described in the assignment guidelines
    named_columns = ['sentence_id', 'token_id', 'token', 'lemma', 'UPOS', 'POS', 'grammar', 'head_id',
                     'dependency_relation', 'head_dependency_relation', 'additional_info', 'PropBank_frames']
    n_named = len(named_columns)

    sentence_id = 1
    # Only advance the ID once the current sentence has produced a token line,
    # so leading or repeated blank lines do not leave gaps in the numbering.
    sentence_has_tokens = False
    data = []

    with open(input_filepath, 'r', encoding='utf-8') as infile, \
         open(output_filepath, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()  # Remove leading/trailing whitespace

            if not line:
                # A blank line marks the end of a sentence
                if sentence_has_tokens:
                    sentence_id += 1
                    sentence_has_tokens = False
                continue

            if line.startswith('#'):
                continue  # Comment/metadata line

            outfile.write(f'{sentence_id}\t{line}\n')
            # Keep the row in memory instead of re-reading the file just written;
            # the ID is stored as a string, exactly as it would be parsed back.
            data.append([str(sentence_id)] + line.split('\t'))
            sentence_has_tokens = True

    # default=0 keeps an input without token lines from raising ValueError
    max_columns = max((len(row) for row in data), default=0)
    print(f'max number of columns: {max_columns}')

    # Make sure every named column exists even when the file is narrower than
    # expected; this is a no-op for rows that already have 12+ fields.
    for row in data:
        if len(row) < n_named:
            row.extend([None] * (n_named - len(row)))

    column_names = named_columns + [f'col_{i}' for i in range(n_named, max_columns)]
    new_df = pd.DataFrame(data, columns=column_names)

    return new_df




# if __name__ == '__main__':
    
#     # Perform the preprocessing and save the result to a CSV file
#     df = adding_headers_and_add_IDs_to_sentences('../Data/en_ewt-up-train.conllu', '../Data/en_ewt-up-train_new.conllu')
#     # df.to_csv('../Data/train(Nur).tsv', index=False)
#     df.to_csv('../Data/train_header_added.tsv', sep='\t', index=False)

In [2]:
# Training split: original CoNLL-U file, the copy with sentence IDs prepended,
# and the TSV that receives the DataFrame with headers.
train_data = '../data/en_ewt-up-train.conllu'
train_data_new = '../data/train_senID_added.conllu'
train_tsv = '../data/train_header_added.tsv'

# Test split: same three files as for training.
test_data = '../data/en_ewt-up-test.conllu'
test_data_new = '../data/test_senID_added.conllu'
test_tsv = '../data/test_header_added.tsv'

# Build the DataFrame for each split and store it as a tab-separated file.
train_df = adding_headers_and_add_IDs_to_sentences(
    input_filepath=train_data,
    output_filepath=train_data_new,
)
train_df.to_csv(train_tsv, sep='\t', index=False)

test_df = adding_headers_and_add_IDs_to_sentences(
    input_filepath=test_data,
    output_filepath=test_data_new,
)
test_df.to_csv(test_tsv, sep='\t', index=False)


max number of columns: 47
max number of columns: 30


In [3]:
def duplicate_train_instances(input_path, output_path):
    """Reduce every row of a tab-separated file to exactly 13 columns.

    Each input line is stripped (which also removes trailing empty fields) and
    split on tabs:

    * rows with exactly 13 columns are copied unchanged;
    * rows with more than 13 columns are written as their first 12 columns
      followed by their LAST column;
    * rows with fewer than 13 columns are dropped.

    The output file is opened once and overwritten, so calling the function
    again produces the same file instead of appending a second copy.

    NOTE(review): despite the name, a wide row yields a single output row built
    from its last column only; the columns between index 12 and the last one
    are discarded. Confirm whether one copy per extra column was intended.

    Parameters
    ----------
    input_path : str
        Path of the tab-separated input file (UTF-8).
    output_path : str
        Path of the file to (over)write with the 13-column rows.
    """
    with open(input_path, "r", encoding="utf-8") as inputfile, \
         open(output_path, "w", encoding="utf-8") as output:
        for line in inputfile:
            columns = line.strip().split("\t")

            # condition1: rows with exactly one column after the first 12
            if len(columns) == 13:
                output.write("\t".join(columns) + "\n")

            # condition2: rows with several columns after the first 12;
            # no hard-coded upper bound, so arbitrarily wide rows are kept
            elif len(columns) > 13:
                output.write("\t".join(columns[0:12] + [columns[-1]]) + "\n")


In [4]:
# Destination of the 13-column training file derived from train_tsv.
duplicated_train = '../data/duplicated_train.tsv'

duplicate_train_instances(input_path=train_tsv, output_path=duplicated_train)

In [5]:
def duplicate_test_instances(input_path, output_path):
    """Reduce every row of a tab-separated file to exactly 13 columns.

    Each input line is stripped (which also removes trailing empty fields) and
    split on tabs:

    * rows with exactly 13 columns are copied unchanged;
    * rows with more than 13 columns are written as their first 12 columns
      followed by their LAST column;
    * rows with fewer than 13 columns are dropped.

    The output file is opened once and overwritten, so calling the function
    again produces the same file instead of appending a second copy.

    NOTE(review): despite the name, a wide row yields a single output row built
    from its last column only; the columns between index 12 and the last one
    are discarded. Confirm whether one copy per extra column was intended.

    Parameters
    ----------
    input_path : str
        Path of the tab-separated input file (UTF-8).
    output_path : str
        Path of the file to (over)write with the 13-column rows.
    """
    with open(input_path, "r", encoding="utf-8") as inputfile, \
         open(output_path, "w", encoding="utf-8") as output:
        for line in inputfile:
            columns = line.strip().split("\t")

            # condition1: rows with exactly one column after the first 12
            if len(columns) == 13:
                output.write("\t".join(columns) + "\n")

            # condition2: rows with several columns after the first 12;
            # no hard-coded upper bound, so arbitrarily wide rows are kept
            elif len(columns) > 13:
                output.write("\t".join(columns[0:12] + [columns[-1]]) + "\n")

In [6]:
# Destination of the 13-column test file derived from test_tsv.
duplicated_test = '../data/duplicated_test.tsv'

duplicate_test_instances(input_path=test_tsv, output_path=duplicated_test)

In [7]:
import csv

def add_true_lables_4arguments(input_file, output_file):
    """Append a label column to every 13-column row of a tab-separated file.

    For a row with exactly 13 fields, the value of the last field decides the
    label appended as field 14: "O" when it is "_" or "V", otherwise the value
    itself. Rows with any other number of fields are written back unchanged.

    Input:
    -input_file: filepath to the input file
    -output_file: filepath to the output file
    """
    non_argument_tags = ("_", "V")

    with open(input_file, 'r', newline='', encoding='utf-8') as src, \
            open(output_file, 'w', newline='', encoding='utf-8') as dst:
        tsv_out = csv.writer(dst, delimiter='\t')

        for fields in csv.reader(src, delimiter='\t'):
            if len(fields) == 13:
                annotation = fields[12]
                # "_" and "V" do not mark an argument, so they get the "O" label
                fields.append("O" if annotation in non_argument_tags else annotation)

            tsv_out.writerow(fields)

In [8]:
# Output locations for the labelled training and test data
preprocessed_train = '../data/preprocessed_train.tsv'
preprocessed_test = '../data/preprocessed_test.tsv'

# Append the label column to both 13-column files
add_true_lables_4arguments(input_file=duplicated_train, output_file=preprocessed_train)
add_true_lables_4arguments(input_file=duplicated_test, output_file=preprocessed_test)

In [9]:
def change_tsv_header(input_file_path, new_header, output_file_path):
    """Copy a tab-separated file, replacing its first row with ``new_header``.

    The whole input is read before the output is opened, so the two paths may
    refer to the same file. The first row is replaced unconditionally; it is
    not checked to be a header. An empty input produces an output that
    contains only the header row.

    Parameters
    ----------
    input_file_path : str
        Path of the tab-separated file to read (UTF-8).
    new_header : list of str
        Field names written as the first row of the output.
    output_file_path : str
        Path of the tab-separated file to (over)write.
    """
    # Read the TSV file and store the data
    with open(input_file_path, 'r', newline='', encoding='utf-8') as input_file:
        data = list(csv.reader(input_file, delimiter='\t'))

    # Change the header; an empty file has no first row to replace, so the
    # header becomes the only row instead of raising IndexError
    if data:
        data[0] = new_header
    else:
        data = [new_header]

    # Write to a new file with the updated header
    with open(output_file_path, 'w', newline='', encoding='utf-8') as output_file:
        csv.writer(output_file, delimiter='\t').writerows(data)

    print(f"Header changed. New Header: {new_header}")

In [10]:
# Final files: labelled data with the full 14-column header in the first row
preprocessed_train_with_header = '../data/preprocessed_train_with_header.tsv'
preprocessed_test_with_header = '../data/preprocessed_test_with_header.tsv'

new_header = [
    'sentence_id', 'token_id', 'token', 'lemma', 'UPOS', 'POS', 'grammar',
    'head_id', 'dependency_relation', 'head_dependency_relation',
    'additional_info', 'PropBank_frames', 'annotation', 'label',
]

change_tsv_header(
    input_file_path=preprocessed_train,
    new_header=new_header,
    output_file_path=preprocessed_train_with_header,
)
change_tsv_header(
    input_file_path=preprocessed_test,
    new_header=new_header,
    output_file_path=preprocessed_test_with_header,
)

Header changed. New Header: ['sentence_id', 'token_id', 'token', 'lemma', 'UPOS', 'POS', 'grammar', 'head_id', 'dependency_relation', 'head_dependency_relation', 'additional_info', 'PropBank_frames', 'annotation', 'label']
Header changed. New Header: ['sentence_id', 'token_id', 'token', 'lemma', 'UPOS', 'POS', 'grammar', 'head_id', 'dependency_relation', 'head_dependency_relation', 'additional_info', 'PropBank_frames', 'annotation', 'label']
