In [1]:
import benepar, spacy, nltk
import pandas as pd
from nltk.corpus import brown

In [2]:
def _shift_within_sentence(df, column, offset):
    """Shift ``df[column]`` by ``offset`` rows inside each sentence.

    Rows are grouped by ``unique_identifier`` so a shift never crosses a
    sentence boundary. Positions with no neighbour inside the sentence are
    filled with '' so the result can be concatenated straight into an n-gram.
    """
    shifted = df.groupby('unique_identifier', sort=False)[column].shift(offset)
    return shifted.fillna('')


def generate_ngrams(df):
    """Add token and PoS n-gram columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``word``, ``PoS`` and ``unique_identifier``
        (the sentence id). It is mutated: six columns are added/overwritten.

    Returns
    -------
    None
        The frame is modified in place.

    Added columns
    -------------
    forward_bigram, forward_trigram
        The token followed by the next one / two tokens of the same sentence.
    backward_bigram, backward_trigram
        The token preceded by the previous one / two tokens of the same sentence.
    pos_forward_trigram, pos_backward_trigram
        Same as the token trigrams, built from ``PoS``.

    Notes
    -----
    Missing neighbours are replaced by '' while the single-space separators
    are kept, so n-grams at sentence edges carry padding spaces (for example
    the forward bigram of a sentence-final token ``w`` is ``'w '``).

    The n-grams are computed with index-aligned, grouped shifts rather than
    by looping over identifiers 1..max and concatenating lists. This keeps
    the result correct when identifiers do not start at 1, are not
    contiguous, or when rows of one sentence are not stored together, and it
    avoids re-filtering the whole frame once per sentence.
    """
    added_columns = (
        'forward_bigram',
        'backward_bigram',
        'forward_trigram',
        'backward_trigram',
        'pos_forward_trigram',
        'pos_backward_trigram',
    )

    # Nothing to compute for a frame without rows; still create the columns
    # so downstream code can rely on them being present.
    if len(df) == 0:
        for name in added_columns:
            df[name] = []
        return

    # Compute every shifted view before writing any new column.
    word = df['word']
    next_word = _shift_within_sentence(df, 'word', -1)
    next_word2 = _shift_within_sentence(df, 'word', -2)
    prev_word = _shift_within_sentence(df, 'word', 1)
    prev_word2 = _shift_within_sentence(df, 'word', 2)

    pos = df['PoS']
    next_pos = _shift_within_sentence(df, 'PoS', -1)
    next_pos2 = _shift_within_sentence(df, 'PoS', -2)
    prev_pos = _shift_within_sentence(df, 'PoS', 1)
    prev_pos2 = _shift_within_sentence(df, 'PoS', 2)

    # Token n-grams
    df['forward_bigram'] = word + ' ' + next_word
    df['backward_bigram'] = prev_word + ' ' + word
    df['forward_trigram'] = word + ' ' + next_word + ' ' + next_word2
    df['backward_trigram'] = prev_word2 + ' ' + prev_word + ' ' + word

    # PoS n-grams (only trigrams are stored, matching the existing columns)
    df['pos_forward_trigram'] = pos + ' ' + next_pos + ' ' + next_pos2
    df['pos_backward_trigram'] = prev_pos2 + ' ' + prev_pos + ' ' + pos


#### Loading data and basic preprocessing

In [4]:
import pandas as pd

def load_data(datafile, encoding='utf-8'):
    """Read a tab-separated, one-token-per-line file into a DataFrame.

    Parameters
    ----------
    datafile : str or path-like
        Path of the file to read.
    encoding : str, default 'utf-8'
        Text encoding used to open the file (previously the platform default).

    Returns
    -------
    pandas.DataFrame
        One row per input line with the string columns ``file``,
        ``sentence_num``, ``token_num``, ``word``, ``lemma``, ``PoS``,
        ``parse_tree_info``, ``neg_cue``, ``scope`` and ``neg_event``, plus
        ``unique_identifier``: a running sentence counter that increases by
        one on every row whose ``token_num`` is ``'0'``.

    Notes
    -----
    * Only the first ten tab-separated fields of a line are kept; shorter
      lines are padded with ``None``. Files whose widest line has fewer than
      ten fields, and empty files, therefore load instead of raising.
    * Each line is stripped of surrounding whitespace before splitting, so
      blank lines still yield a row (``file == ''``, everything else missing).
      Such a row does not start a new sentence and keeps the identifier of
      the sentence before it.
    """
    column_names = ['file', 'sentence_num', 'token_num', 'word', 'lemma',
                    'PoS', 'parse_tree_info', 'neg_cue', 'scope', 'neg_event']
    width = len(column_names)

    rows = []
    with open(datafile, 'r', encoding=encoding) as file:
        for line in file:
            # Keep only the named fields and pad short lines so every row
            # has exactly `width` entries.
            fields = line.strip().split('\t')[:width]
            fields.extend([None] * (width - len(fields)))
            rows.append(fields)

    data_frame = pd.DataFrame(rows, columns=column_names)

    # token_num is still a string here; '0' marks the first token of a sentence.
    is_sentence_start = data_frame['token_num'] == '0'
    data_frame['unique_identifier'] = is_sentence_start.cumsum()

    return data_frame

# Load the corpus file (the development split, judging by its file name); the
# path is relative to the notebook's working directory.
df = load_data('data/SEM-2012-SharedTask-CD-SCO-dev-09032012.txt')
# Rich-display the loaded frame to inspect the parsed columns and the
# unique_identifier sentence counter.
display(df)

Unnamed: 0,file,sentence_num,token_num,word,lemma,PoS,parse_tree_info,neg_cue,scope,neg_event,unique_identifier
0,wisteria01,0,0,1.,1.,CD,(NP(NP*,***,,,1
1,wisteria01,0,1,The,The,NNP,*,***,,,1
2,wisteria01,0,2,Singular,Singular,NNP,*,***,,,1
3,wisteria01,0,3,Experience,Experience,NN,*),***,,,1
4,wisteria01,0,4,of,of,IN,(PP*,***,,,1
...,...,...,...,...,...,...,...,...,...,...,...
14348,wisteria02,439,9,orthodox,orthodox,JJ,*),***,,,787
14349,wisteria02,439,10,in,in,IN,(PP*,***,,,787
14350,wisteria02,439,11,his,his,PRP$,(NP*,***,,,787
14351,wisteria02,439,12,ritual,ritual,NN,*)))))),***,,,787


In [5]:
# Adds the n-gram columns to df in place; the function returns None, so this
# cell produces no output.
generate_ngrams(df)

In [6]:
# Preview the first rows to check the newly added n-gram columns.
df.head()

Unnamed: 0,file,sentence_num,token_num,word,lemma,PoS,parse_tree_info,neg_cue,scope,neg_event,unique_identifier,forward_bigram,backward_bigram,forward_trigram,backward_trigram,pos_forward_trigram,pos_backward_trigram
0,wisteria01,0,0,1.,1.,CD,(NP(NP*,***,,,1,1. The,1.,1. The Singular,1.,CD NNP NNP,CD
1,wisteria01,0,1,The,The,NNP,*,***,,,1,The Singular,1. The,The Singular Experience,1. The,NNP NNP NN,CD NNP
2,wisteria01,0,2,Singular,Singular,NNP,*,***,,,1,Singular Experience,The Singular,Singular Experience of,1. The Singular,NNP NN IN,CD NNP NNP
3,wisteria01,0,3,Experience,Experience,NN,*),***,,,1,Experience of,Singular Experience,Experience of Mr.,The Singular Experience,NN IN NNP,NNP NNP NN
4,wisteria01,0,4,of,of,IN,(PP*,***,,,1,of Mr.,Experience of,of Mr. John,Singular Experience of,IN NNP NNP,NNP NN IN
