In [1]:
import os
import nltk
import pandas as pd
from unidecode import unidecode

In [2]:
# Root folder holding the train/test/dev splits. Built with os.path.join so the
# separator matches the host OS (on Windows this is still ".\\data").
dataPath = os.path.join(".", "data")

In [3]:
def find_files(search_path):
    """Collect the path of every file found beneath ``search_path``.

    The directory tree is traversed top-down with ``os.walk``. Each entry of
    the returned list is the containing directory joined with the file name,
    in traversal order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(search_path)
        for name in names
    ]

In [4]:
# One file list per split. os.path.join picks the separator of the host OS
# instead of a hard-coded backslash (the result is unchanged on Windows).
trainFiles = find_files(os.path.join(dataPath, "train"))
testFiles = find_files(os.path.join(dataPath, "test"))
devFiles = find_files(os.path.join(dataPath, "dev"))

In [5]:
def loadData(paths):
    """Read tab-separated token/label files into a single DataFrame.

    Parameters
    ----------
    paths : iterable of str
        Files to read. Each one is parsed as a header-less, tab-separated
        file whose two columns are named ``tokens`` and ``label``.

    Returns
    -------
    pandas.DataFrame
        The rows of every readable file, in input order, with a fresh
        0..n-1 index. Columns are ``tokens``, ``label`` and ``file`` (the
        file name without directory and extension). When nothing could be
        read, an empty frame with only ``tokens`` and ``label`` is returned.

    Notes
    -----
    A file that cannot be parsed is reported with ``print`` and skipped, so
    one bad file does not abort the whole load.
    """
    columns = ["tokens", "label"]
    frames = []
    for path in paths:
        try:
            doc = pd.read_csv(path, sep="\t", names=columns, header=None)
        except Exception as e:
            print(path, e)
            continue
        # Take the document id from the path itself. Fixed character offsets
        # only fit one directory prefix length, so they produced different
        # (partly truncated) ids for the train, test and dev folders.
        doc["file"] = os.path.splitext(os.path.basename(path))[0]
        frames.append(doc)

    if not frames:
        return pd.DataFrame(columns=columns)
    # A single concat at the end avoids re-copying the accumulated rows for
    # every file.
    return pd.concat(frames, ignore_index=True)

In [6]:
# Load the three splits, then report each frame's (rows, columns) shape.
train, test, dev = (loadData(paths) for paths in (trainFiles, testFiles, devFiles))
print("Training Size before Cleaning")
for caption, frame in (
    ("Training Size:\t", train),
    ("Test Size:\t", test),
    ("Dev Size:\t", dev),
):
    print(caption, frame.shape)

Training Size before Cleaning
Training Size:	 (21549, 3)
Test Size:	 (2781, 3)
Dev Size:	 (2414, 3)


In [15]:
# Characters deleted from every value: punctuation/symbols, tab, newline and
# the space. The backslash is written as "\\" because "\|" is not a valid
# escape sequence and triggers a warning on current Python versions.
_SPECIAL_CHARS = "!@#$%^&*()[]{};:,./<>?\\|`-~=_+\t\n "
_SPECIAL_CHARS_TABLE = str.maketrans("", "", _SPECIAL_CHARS)


def stripSpaces(x):
    """Normalise one string value.

    The text is transliterated with ``unidecode``, every character listed in
    ``_SPECIAL_CHARS`` is removed, and the remainder is lower-cased and
    stripped of any surrounding whitespace that is left.

    Parameters
    ----------
    x : str
        Raw value.

    Returns
    -------
    str
        The normalised value; it is empty when ``x`` consisted only of
        removed characters.
    """
    # A single translate() pass replaces one str.replace() call per character.
    return unidecode(x).translate(_SPECIAL_CHARS_TABLE).lower().strip()

def _fill_missing_labels(dataset, verbose):
    """Repair rows whose label is missing because token and label share one field."""
    # Object dtype so string labels can be written even if the column was all-NaN.
    dataset["label"] = dataset["label"].astype(object)
    for idx in dataset.index[dataset["label"].isna()]:
        # split() without arguments treats a run of whitespace as one separator.
        parts = dataset.at[idx, "tokens"].split()
        if parts:
            dataset.at[idx, "tokens"] = parts[0]
        if len(parts) > 1:
            dataset.at[idx, "label"] = parts[1]
        else:
            if verbose:
                print(dataset.loc[idx, :])
            # No label can be recovered from the field: default to outside ('O').
            dataset.at[idx, "label"] = 'O'
            if verbose:
                print("Manual Corrected:", dataset.at[idx, "tokens"])
    return dataset


def _drop_special_tokens(dataset):
    """Remove non-alphanumeric tokens while keeping entity starts; expects a 0..n-1 index."""
    is_special = ~dataset["tokens"].str.isalnum().astype(bool)
    last = len(dataset) - 1

    # A special token labelled 'b' hands its 'b' to the following row.
    for pos in dataset.index[is_special & (dataset["label"] == 'b')]:
        if pos < last:
            dataset.at[pos + 1, "label"] = 'b'

    # Special tokens labelled 'o' or 'b' carry no information any more.
    droppable = is_special & dataset["label"].isin(['o', 'b'])
    dataset = dataset[~droppable].reset_index(drop=True)

    # Remaining special tokens: drop them when the previous surviving row is
    # inside an entity ('b' or 'i'), otherwise they start one themselves.
    # Labels are read live, so a row relabelled 'b' here counts for the next.
    is_special = ~dataset["tokens"].str.isalnum().astype(bool)
    redundant = []
    for pos in dataset.index[is_special]:
        preceding = dataset.at[pos - 1, "label"] if pos > 0 else None
        if preceding in ('b', 'i'):
            redundant.append(pos)
        else:
            dataset.at[pos, "label"] = 'b'
    return dataset.drop(index=redundant)


def _repair_orphan_inside_labels(dataset, verbose):
    """Turn an 'i' that directly follows an 'o' into 'b'; expects a 0..n-1 index."""
    # Decisions are taken on a snapshot so earlier repairs do not affect later ones.
    snapshot = dataset[["tokens", "label"]].copy()
    follows_outside = snapshot["label"].shift(1) == 'o'
    orphans = snapshot.index[(snapshot["label"] == 'i') & follows_outside]
    last = len(snapshot) - 1

    def describe(pos):
        return snapshot.at[pos, "tokens"] + "(" + snapshot.at[pos, "label"] + ")"

    for pos in orphans:
        has_next = pos < last
        # The end of the data is treated like a following 'o'.
        following = snapshot.at[pos + 1, "label"] if has_next else 'o'
        if verbose:
            context = [describe(pos - 1) + "\t\t", describe(pos) + "\t\t"]
            if has_next:
                context.append(describe(pos + 1))
            print(*context)
        # o-i-o, o-i-i and o-i-b all become o-b-...; any other following
        # label leaves the row untouched.
        if following in ('o', 'i', 'b'):
            dataset.at[pos, "label"] = 'b'
        # o-i-b additionally becomes o-b-i.
        if has_next and following == 'b':
            dataset.at[pos + 1, "label"] = 'i'
    return dataset


def cleaning(dataset, verbose=True):
    """Clean a token/label frame and repair its b/i/o label sequence.

    Steps, in order:

    1. Drop rows whose token is null.
    2. For rows without a label, split the token field on whitespace and use
       the second part as the label, or 'O' when there is none.
    3. Apply ``stripSpaces`` to every value of every column.
    4. Map the labels 'ii' -> 'i' and '0' -> 'o'.
    5. Remove tokens that are not purely alphanumeric, moving a 'b' label to
       the next row where necessary.
    6. Relabel an 'i' that directly follows an 'o' as 'b'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with at least the string columns ``tokens`` and ``label``.
        It is not modified; all work happens on a copy.
    verbose : bool, default True
        Print progress information, including the rows that were repaired.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a fresh 0..n-1 index.
    """
    dataset = dataset[dataset["tokens"].notna()].copy()
    if verbose:
        print("Size after Dropping Null Tokens", dataset.shape)
        print("Tokens Without labels:")
    if dataset.empty:
        return dataset.reset_index(drop=True)

    dataset = _fill_missing_labels(dataset, verbose)

    # Element-wise normalisation of all columns (Series.map avoids the
    # deprecated DataFrame.applymap).
    for column in dataset.columns:
        dataset[column] = dataset[column].map(stripSpaces)

    # Only the label cell is rewritten; assigning to the whole row would also
    # overwrite the token and every other column.
    dataset.loc[dataset["label"] == 'ii', "label"] = 'i'
    dataset.loc[dataset["label"] == '0', "label"] = 'o'

    if verbose:
        print("Removing special characters")
    # The index is renumbered first so that "previous"/"next" row means
    # position +/- 1 even after rows were dropped.
    dataset = _drop_special_tokens(dataset.reset_index(drop=True))

    if verbose:
        print(dataset.value_counts().head(30))
        # NOTE: this function does not remove any stopwords; the message is
        # kept unchanged so the verbose log stays the same.
        print("Removing Stopwords based on above listed most frequent words")
    if verbose:
        print("Label order correction:")
    return _repair_orphan_inside_labels(dataset.reset_index(drop=True), verbose)

In [13]:
# Clean the training split without progress output; `train` is rebound to the
# DataFrame returned by cleaning().
train = cleaning(train, verbose=False)

In [14]:
# Frequent function words to inspect; each word is listed once (the original
# list contained "to" twice).
stopwords = [
    "the", "this", "that", "has", "have", "can", "be", "in", "on", "at",
    "to", "as", "is", "are", "a", "an", "with", "our", "we", "from",
    "which", "when", "also", "and", "or", "not", "it", "its",
    "than", "use", "into", "how", "but", "for", "their", "there", "all",
]

In [None]:
# "of" and "the" are often used in between keyword tokens.
# "and" separates the keywords; it may be good not to remove it (index [:]).
# "Arabic NER": only "NER" is the keyword.

#change to o and next one to b
# 887, 12087, 13296 

#state of the art
# 2077, 2145, 2364, 5457, 5683, 6826, 6879,9720, 11570 12038 12454 12869 14087 14143 14204 14952 

#remove all is, bs and is
#  11819 11832 12006 12125 12597 

#correct it
# 11855  11878 12521 12578 12664 12665 12682 13072 13082 13166 13224 13301 15732 18273 

#check later
# 7943 11620 11695 12024 13990 

#name of a person
# 14237 14288 

# Print every stopword token whose label is not 'o', together with up to four
# rows before and three rows after it, as "file( i ):token(label) token(label) ...".
# NOTE(review): `temp` is not assigned at notebook scope in the cells shown here
# (inside cleaning() it is only a local variable) - confirm which frame is meant.
# Assumes `temp` has a default 0..n-1 index, as the original i-4 ... i+3 lookups did.
flagged = temp.index[temp["tokens"].isin(stopwords) & (temp["label"] != 'o')]
last_row = len(temp) - 1
for i in flagged:
    # Clamp the window so rows near either end of the frame do not raise KeyError.
    window = range(max(i - 4, 0), min(i + 3, last_row) + 1)
    context = "".join(
        temp.loc[j, 'tokens'] + "(" + temp.loc[j, 'label'] + ") " for j in window
    )
    print(temp.loc[i, 'file'] + "(", i, "):" + context)