In [1]:
import os
import nltk
import pandas as pd
from unidecode import unidecode

In [2]:
# Root folder that holds the train/test/dev sub-folders. Built with
# os.path.join so the separator matches the host OS (on Windows this is
# still ".\\data").
dataPath = os.path.join(".", "data")

In [3]:
def find_files(search_path):
    """Return the path of every file found under ``search_path``.

    The tree is walked top-down with ``os.walk``; each entry of the result is
    the containing folder joined with the file name. A folder without files
    (or a path that cannot be walked) contributes nothing.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(search_path)
        for name in names
    ]

In [4]:
# Collect the files of each split. The sub-folder is appended with
# os.path.join rather than a hard-coded "\\" so the paths are valid on any OS
# (on Windows the resulting strings are unchanged).
trainFiles = find_files(os.path.join(dataPath, "train"))
testFiles = find_files(os.path.join(dataPath, "test"))
devFiles = find_files(os.path.join(dataPath, "dev"))

In [13]:
def loadData(paths):
    """Read tab-separated token/label files into one DataFrame.

    Parameters
    ----------
    paths : iterable of str
        Paths of the files to read. Each file is parsed as header-less TSV
        whose first two columns are named "tokens" and "label".

    Returns
    -------
    pandas.DataFrame
        All successfully read files stacked in input order with a fresh
        0..n-1 index and columns "tokens", "label", "file". When nothing
        could be read, an empty frame with columns "tokens" and "label".

    Files that fail to parse are reported with ``print`` and skipped.
    """
    frames = []
    for path in paths:
        try:
            doc = pd.read_csv(path, sep="\t", names=["tokens", "label"], header=None)
            # Fixed-position slice: drops the first 12 and the last 5
            # characters of the path. NOTE(review): 12 is the length of
            # ".\\data\\train"; the test/dev prefixes are shorter, so for
            # those splits the cut presumably lands inside the file name.
            doc['file'] = path[12:-5]
            frames.append(doc)
        except Exception as e:
            # Best effort: report the offending file and keep going.
            print(path, e)
    if not frames:
        return pd.DataFrame(columns=["tokens", "label"])
    # Concatenate once at the end; concatenating inside the loop re-copies
    # everything accumulated so far on every iteration.
    return pd.concat(frames, ignore_index=True)

In [14]:
# Load the three splits (train, test, dev - in that order) and report the
# raw shapes before any cleaning is applied.
train, test, dev = map(loadData, (trainFiles, testFiles, devFiles))
print("Training Size before Cleaning")
print("Training Size:\t", train.shape)
print("Test Size:\t", test.shape)
print("Dev Size:\t", dev.shape)

Training Size before Cleaning
Training Size:	 (21549, 3)
Test Size:	 (2781, 3)
Dev Size:	 (2414, 3)


In [15]:
def stripSpaces(x):
    """Normalise one cell: transliterate, strip punctuation/spaces, lowercase.

    Parameters
    ----------
    x : str
        Raw cell text.

    Returns
    -------
    str
        ``unidecode(x)`` with every character of ``specialchar`` and every
        space removed, lower-cased, and with any remaining leading/trailing
        whitespace stripped. May be the empty string.
    """
    # The backslash is written as "\\" because "\|" is not a valid escape
    # sequence (SyntaxWarning on recent Python versions); the resulting
    # string still contains a single backslash followed by "|".
    specialchar = "!@#$%^&*()[]{};:,./<>?\\|`-~=_+\t\n"
    x = unidecode(x)
    # Delete all unwanted characters (plus the plain space) in one pass.
    x = x.translate(str.maketrans("", "", specialchar + " "))
    return x.lower().strip()

def cleaning(dataset):
    """Normalise a token/label frame and repair inconsistent b/i/o labels.

    Steps, in order:

    1. Drop rows whose token is null.
    2. For rows with a null label, split the token on whitespace: the first
       piece becomes the token and the second piece (if any) the label;
       when there is no second piece the label defaults to 'O'.
    3. Pass every cell of every column through ``stripSpaces``.
    4. Rewrite label 'ii' to 'i' and label '0' to 'o' (label column only).
    5. Treat tokens that are not purely alphanumeric after step 3 as
       "special" tokens: a special token labelled 'b' hands its 'b' to the
       following row; special tokens labelled 'o' or 'b' are dropped; any
       remaining special token is dropped when the row before it is labelled
       'b' or 'i', otherwise it is relabelled 'b'.
    6. Label-order correction: an 'i' that directly follows an 'o' becomes
       'b' when the next label is 'o' or 'i' (or when there is no next row);
       when the next label is 'b' the pair is swapped to 'b', 'i'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with at least "tokens" and "label" columns. All cells must be
        strings by the time step 3 runs.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. The frame passed in is left
        untouched.

    Progress information is written with ``print``.
    """
    # Work on a private copy so the caller's frame is never half-modified.
    dataset = dataset.dropna(subset=["tokens"]).copy()
    print("Size after Dropping Null Tokens",dataset.shape)
    print("Tokens Without labels:")
    for row in dataset.index[dataset["label"].isna().to_numpy()]:
        # str.split() without arguments collapses runs of whitespace.
        pieces = dataset.loc[row, "tokens"].split()
        dataset.loc[row, "tokens"] = pieces[0]
        if len(pieces) > 1:
            dataset.loc[row, "label"] = pieces[1]
        else:
            # No label could be recovered from the token text.
            print(dataset.loc[row, :])
            dataset.loc[row, "label"] = 'O'
            print("Manual Corrected:", dataset.loc[row, "tokens"])

    # Element-wise normalisation, column by column.
    for column in dataset.columns:
        dataset[column] = dataset[column].map(stripSpaces)
    # From here on row labels are positions, so "previous"/"next" row
    # arithmetic is safe even though rows were dropped above.
    dataset = dataset.reset_index(drop=True)

    # Only the label column is rewritten; tokens and file stay as they are.
    dataset.loc[dataset["label"] == 'ii', "label"] = 'i'
    dataset.loc[dataset["label"] == '0', "label"] = 'o'

    print("Removing special characters")
    is_special = ~dataset["tokens"].str.isalnum()
    # A special token labelled 'b' passes its 'b' on to the following row.
    special_b = (is_special & (dataset["label"] == 'b')).to_numpy()
    next_rows = [pos + 1 for pos in dataset.index[special_b] if pos + 1 < len(dataset)]
    if next_rows:
        dataset.loc[next_rows, "label"] = 'b'
    removable = is_special & dataset["label"].isin(['o', 'b'])
    dataset = dataset[~removable].reset_index(drop=True)

    # Remaining special tokens: drop when the row before is 'b' or 'i',
    # otherwise relabel as 'b'. Relabelling is applied as the scan advances,
    # so a later special token sees the updated label of the one before it.
    labels = dataset["label"].tolist()
    toDrop = []
    for pos in dataset.index[(~dataset["tokens"].str.isalnum()).to_numpy()]:
        if pos > 0 and labels[pos - 1] in ('b', 'i'):
            toDrop.append(pos)
        else:
            labels[pos] = 'b'
    dataset["label"] = labels
    dataset = dataset.drop(index=toDrop).reset_index(drop=True)

    print(dataset.value_counts().head(30))
    # Only the message is emitted here; no stopword rows are removed by this
    # function.
    print("Removing Stopwords based on above listed most frequent words")

    print("Label order correction:")
    tokens = dataset["tokens"].tolist()
    snapshot = dataset["label"].tolist()   # labels as they were before fixing
    fixed = list(snapshot)
    for pos in range(1, len(snapshot)):
        if snapshot[pos] != 'i' or snapshot[pos - 1] != 'o':
            continue
        before = tokens[pos - 1]+"("+snapshot[pos - 1]+")\t\t"
        current = tokens[pos]+"("+snapshot[pos]+")\t\t"
        if pos + 1 >= len(snapshot):
            # 'i' on the very last row: there is no following row to inspect.
            print(before, current)
            fixed[pos] = 'b'
            continue
        print(before, current, tokens[pos + 1]+"("+snapshot[pos + 1]+")")
        following = snapshot[pos + 1]
        # oio or oii
        if following == 'o' or following == 'i':
            fixed[pos] = 'b'
        # oib
        if following == 'b':
            fixed[pos] = 'b'
            fixed[pos + 1] = 'i'
    dataset["label"] = fixed
    return dataset

In [16]:
# Clean the training split. The result is bound back to `train`, so running
# this cell again feeds already-cleaned data through cleaning().
train = cleaning(train)

Size after Dropping Null Tokens (21487, 3)
Tokens Without labels:
tokens                    .
label                   NaN
file      \2009-35-1-29-46.
Name: 107, dtype: object
Manual Corrected: .
tokens                    very
label                      NaN
file      \2013-39-3-511–554 .
Name: 7108, dtype: object
Manual Corrected: very
tokens               research
label                     NaN
file      \2013-39-4-847–884.
Name: 7557, dtype: object
Manual Corrected: research
Removing special characters
tokens  label  file         
the     o      2015411120       23
               2016421121161    22
               2011374657688    21
               2016422245275    21
               20154112140      20
of      o      2016421121161    20
the     o      2009351328       20
               2012383527574    20
               2013391121160    20
of      o      2011374657688    19
the     o      2016424661701    19
               2020461152       19
               2015412185214    18
        

In [18]:
# Frequent function words picked from the most common tokens listed above.
# Only used for membership tests, so the repeated "to" is harmless.
stopwords = [
    "the", "this", "that", "has", "have", "can", "be",
    "in", "on", "at", "to", "as", "is", "are",
    "a", "an", "with", "our", "we", "from", "which",
    "when", "also", "and", "or", "not", "it", "its",
    "than", "use", "into", "how", "but", "to", "for",
    "their", "there", "all",
]

In [None]:
# Count the stopword rows whose label is anything other than 'o'.
train.loc[train["tokens"].isin(stopwords) & train["label"].ne('o')].value_counts()

In [None]:
# Vocabulary of the cleaned training split.
# NOTE(review): the message below mentions "Train and dev", but only the
# tokens of `train` are collected here.
tokens = train["tokens"].tolist()
tokenset = set(tokens)
print("Unique Tokens in Train and dev set: ",len(tokenset))
tokenset
#[x for x in tokenset if len(x)<=1]

In [19]:
# Leave out stopwords labelled 'o' so that the stopwords labelled b or i
# can be inspected; the remaining rows are renumbered from 0.
temp = (
    train.loc[~(train["tokens"].isin(stopwords) & train["label"].eq('o'))]
    .reset_index(drop=True)
)

In [20]:
# Frequency of each (token, label, file) combination among the stopwords
# that are still present in temp.
temp.loc[temp["tokens"].isin(stopwords)].value_counts()
# temp[(temp["tokens"].isin(stopwords))]["tokens"].value_counts().index

tokens  label  file         
to      i      2019452267292    5
the     i      2017434683722    4
               2013394847884    2
               2018443403446    2
and     i      2018443447482    2
to      i      20093512946      2
and     i      2017431181200    2
               2016424661701    2
to      i      2021472309332    2
               2021472445476    2
a       i      2017434781835    2
the     i      2017434781835    2
               2017431125179    1
               2016421121161    1
               2017431181200    1
               201743171123     1
               2017433521565    1
a       i      2017434683722    1
the     i      2017434723760    1
               2018442349374    1
               20133912355      1
their   i      2017431181200    1
to      i      2014402349401    1
               2020461152       1
with    i      2017433465520    1
the     i      2019451163197    1
               2010363303339    1
               2013391195227    1
are     i      2017

In [27]:
# temp[(temp["tokens"].isin(stopwords))].value_counts()
# "of", "and", "the" are often used in between labelled tokens.
# "Arabic NER": only "NER" is the keyword.

# For every stopword still present in temp, print its file followed by a
# context window of up to two rows before and three rows after it, each shown
# as token(label). The window is clipped at the first/last row of temp so
# stopwords near either end do not raise KeyError. Relies on temp having the
# 0..n-1 index produced by reset_index above.
last_row = len(temp) - 1
for i in temp.index[temp["tokens"].isin(stopwords)]:
    window = range(max(i - 2, 0), min(i + 3, last_row) + 1)
    print(temp.loc[i, 'file'] +":"+
          "".join(temp.loc[j, 'tokens']+"(" + temp.loc[j, 'label'] +") " for j in window))

20093512946:both(o) text(b) to(i) text(i) concept(b) to(i) 
20093512946:text(i) concept(b) to(i) text(i) generation(i) systems(i) 
2009353313343:of(o) example(o) in(b) domain(i) training(i) sentences(i) 
201036171109:of(o) extrinsic(b) and(i) intrinsic(i) measures(i) outcome(o) 
2010362203227:state(b) of(i) the(i) art(i) classification(b) method(i) 
2010362247277:state(b) of(i) the(i) art(i) syntax(b) mt(i) 
2010363303339:state(b) of(i) the(i) art(i) alignment(b) quality(i) 
2013391195227:state(b) of(i) the(i) art(i) system(o) based(o) 
20133912355:state(b) of(i) the(i) art(i) data(b) driven(i) 
2013394847884:state(b) of(i) the(i) art(i) such(o) linking(b) 
2013394847884:state(b) of(i) the(i) art(i) level(o) participated(o) 
2014401171202:called(o) arabic(b) on(i) line(i) commentary(i) data(b) 
201440185120:especially(o) sparse(b) and(i) polysemous(i) words(i) frame(b) 
2014402349401:languages(b) tree(b) to(i) tree(i) translation(b) evidence(o) 
2016421121161:state(b) of(i) the(i) art(