In [18]:
import pandas as pd
from nltk.tokenize import word_tokenize


def neel_sentences(gs_file, tsv_file):
    gs_col_names=['tweet_id','start','end','uri', 'confidence', 'entity']
    tsv_col_names=['tweet_id','text']
    tweets_dict = {}
    data_dict = {}
    seen_ids = set()
    sent = []
    entity = []
    unknown_indicies = set()
    
    gs_df = pd.read_table(gs_file, sep = '\t', header=None, names=gs_col_names)
    # fixes entity label at index 4805 that is incorrect
    if len(gs_df['entity']) > 4805 and gs_df['entity'][4805] == 'Organization373937812812615000':
        gs_df.at[4805, 'entity'] = 'Organization'
    
    tsv_df = pd.read_table(tsv_file, sep = ',', header=None, names=tsv_col_names)
    # strip '|' character from the edges of tsv_df column values
    tsv_df['tweet_id'] = tsv_df['tweet_id'].apply(lambda x: str(x).strip('|'))
    tsv_df['text'] = tsv_df['text'].apply(lambda x: str(x).strip('|'))

    for index, row in tsv_df.iterrows():
        tweets_dict[row['tweet_id']] = row['text']
    
    for index, row in gs_df.iterrows():
        tweet_id = str(row['tweet_id'])
        start = row['start']
        end = row['end']
        old_ent = row['entity']
        
        # Rename entity values as PER, LOC, ORG, MISC, O
        if old_ent in ('Character', 'Person'):
            new_ent = 'PER'
        elif old_ent == 'Location':
            new_ent = 'LOC'
        elif old_ent == 'Organization':
            new_ent = 'ORG'
        else:
            new_ent = 'MISC'
        
        try:
            text = tweets_dict[tweet_id]
            if tweet_id not in seen_ids:
                seen_ids.add(tweet_id)
                words = word_tokenize(text)
                labels = ['O']*len(words)
            else:
                words = data_dict[tweet_id]['words']
                labels = data_dict[tweet_id]['labels']
            assert(len(words)==len(labels))
            ent_words = word_tokenize(text[start:end])
            for e in ent_words:
                for i in range(len(words)):
                    if e == words[i]:
                        labels[i] = new_ent
            data_dict[tweet_id] = {'words': words, 'labels': labels}
        except KeyError:
            unknown_indicies.add(tweet_id)
    
    for key in data_dict:
        sent.append(data_dict[key]['words'])
        entity.append(data_dict[key]['labels'])
    
    return sent, entity, unknown_indicies


def neel_words(gs_file, tsv_file):
    """NEEL2006 words from gs and tsv files
    
    Seperating NEEL data into individual words with corresponding tags
    
    arguments: gs_file, tsv_file
    returns: words, entities, unknown tweet IDs
    """
    
    all_words = []
    all_entities = []
    all_errors = set()
    
    sent, entity, errors = neel_sentences(gs_file, tsv_file)

    for se in sent:
        for w in se:
            all_words.append(w)
    for en in entity:
        for e in en:
            all_entities.append(e)
    for er in errors:
        all_errors.add(er)
            
    return all_words, all_entities, all_errors


def conll_sentences(conll_file):
    sent = []
    pos = []
    chunk = []
    entity = []
    temp_sent = []
    temp_pos = []
    temp_chunk = []
    temp_entity = []
    with open(conll_file,'r',encoding='utf-8') as f:
        conll_raw_data = f.readlines()
    conll_raw_data = [x.strip() for x in conll_raw_data]
    print(conll_raw_data[:5])
    for line in conll_raw_data:
        if line != '':
            split_line = line.split()
            if len(split_line) == 4:
                temp_sent.append(split_line[0])
                temp_pos.append(split_line[1])
                temp_chunk.append(split_line[2])
                old_ent = split_line[3]
                if old_ent in ('I-ORG', 'B-ORG'):
                    new_ent = 'ORG'
                elif old_ent in ('I-LOC', 'B-LOC'):
                    new_ent = 'LOC'
                elif old_ent in ('I-MISC', 'B-MISC'):
                    new_ent = 'MISC'
                elif old_ent in ('I-PER', 'B-PER'):
                    new_ent = 'PER'
                else:
                    new_ent = 'O'
                temp_entity.append(new_ent)
    
    return temp_sent, temp_pos, temp_chunk, temp_entity


def conll_words(conll_file):
    all_words = []
    all_pos = []
    all_chunk = []
    all_entities = []
    
    sent, pos, chunk, entity = conll_sentences(conll_file)

    for se in sent:
        for w in se:
            all_words.append(w)
    for po in pos:
        for p in po:
            all_pos.append(p)
    for ch in chunk:
        for c in ch:
            all_chunk.append(c)
    for en in entity:
        for e in en:
            all_entities.append(e)
            
    return all_words, all_pos, all_chunk, all_entities

In [19]:
train_words, _, _, train_entities = conll_words('train.txt')

['apollo N B-NP O', 'tổ_chức V B-VP O', 'cuộc N B-NP O', 'thi V B-VP O', 'tìm_hiểu V B-VP O']


In [20]:
train_words

['a',
 'p',
 'o',
 'l',
 'l',
 'o',
 't',
 'ổ',
 '_',
 'c',
 'h',
 'ứ',
 'c',
 'c',
 'u',
 'ộ',
 'c',
 't',
 'h',
 'i',
 't',
 'ì',
 'm',
 '_',
 'h',
 'i',
 'ể',
 'u',
 'g',
 'i',
 'á',
 'o',
 '_',
 'd',
 'ụ',
 'c',
 's',
 'i',
 'n',
 'g',
 'a',
 'p',
 'o',
 'r',
 'e',
 '(',
 'n',
 'l',
 'đ',
 ')',
 '-',
 't',
 'ổ',
 '_',
 'c',
 'h',
 'ứ',
 'c',
 'g',
 'i',
 'á',
 'o',
 '_',
 'd',
 'ụ',
 'c',
 'v',
 'à',
 'đ',
 'à',
 'o',
 '_',
 't',
 'ạ',
 'o',
 'a',
 'p',
 'o',
 'l',
 'l',
 'o',
 'v',
 'i',
 'ệ',
 't',
 'n',
 'a',
 'm',
 'v',
 'ừ',
 'a',
 't',
 'h',
 'ô',
 'n',
 'g',
 '_',
 'b',
 'á',
 'o',
 't',
 'ổ',
 '_',
 'c',
 'h',
 'ứ',
 'c',
 'c',
 'u',
 'ộ',
 'c',
 't',
 'h',
 'i',
 '“',
 't',
 'ì',
 'm',
 '_',
 'h',
 'i',
 'ể',
 'u',
 'g',
 'i',
 'á',
 'o',
 '_',
 'd',
 'ụ',
 'c',
 's',
 'i',
 'n',
 'g',
 'a',
 'p',
 'o',
 'r',
 'e',
 '”',
 'd',
 'à',
 'n',
 'h',
 'c',
 'h',
 'o',
 'h',
 's',
 '-',
 's',
 'v',
 't',
 'ừ',
 '1',
 '5',
 'đ',
 'ế',
 'n',
 '2',
 '3',
 't',
 'u',
 'ổ',
 'i',
 ','