In [35]:
import os
import nltk
import pandas as pd
import numpy as np
from sklearn import preprocessing, pipeline, svm, linear_model, neighbors, metrics, ensemble
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torchvision import transforms, datasets
import copy
from unidecode import unidecode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [36]:
dataPath = ".\\data"

In [37]:
def find_files(search_path):
    result = []
    # Wlaking top-down from the root
    for root, dir, files in os.walk(search_path):
        for file in files:       
                result.append(os.path.join(root, file))
    return result

In [38]:
trainFiles = find_files(dataPath + "\\train")
testFiles = find_files(dataPath + "\\test")
devFiles = find_files(dataPath + "\\dev")

In [39]:
def loadData(paths):
    data = pd.DataFrame(columns=["tokens", "label"])
    for i in paths:
        try:
            doc = pd.read_csv(i, sep="\t", names=["tokens", "label"], header=None)
            doc['file'] = i[12:-5]
            data = pd.concat([data, doc], ignore_index=True)
        except Exception as e: 
            print(i, e)
    return data

In [40]:
train = loadData(trainFiles)
test  = loadData(testFiles)
dev = loadData(devFiles)
print("Training Size before Cleaning")
print("Training Size:\t", train.shape)
print("Test Size:\t", test.shape)
print("Dev Size:\t", dev.shape)

Training Size before Cleaning
Training Size:	 (21548, 3)
Test Size:	 (2781, 3)
Dev Size:	 (2414, 3)


In [41]:
stopwords = ["about", "all", "also", "among", "at", "available", "be", "because", "been", "both", "but", "by", "can", "each", "first", "has", "have", "here", "how",
             "however", "into", "it", "its", "large", "learn", "many", "may", "more", "most", "much", "new", "not", "often", "only", "or", "other", "over", "recent", "related", "same",
             "several", "shown", "some", "studies", "such", "than", "their", "them", "then", "there", "these", "they", "those", "through", "use", "used", "we", "well", "what",
             "when", "where", "which"]
def stripSpaces(x):
    x = unidecode(x)
    specialchar = "!@#$%^&*()[]{};:,./<>?\|`-~=_+\t\n"
    for tag in specialchar:
        x = x.replace(tag, '')
    x = x.replace(" ", "")
    x = x.lower()
    x = x.strip()
    return x

def cleaning(_dataset, verbose=True):
    dataset = _dataset.copy()
    dataset.drop(dataset[dataset["tokens"].isna()].index, inplace=True)
    if verbose: 
        print("Size after Dropping Null Tokens",dataset.shape)
        print("Tokens Without labels:")
    for indexWithNullLabel in dataset[dataset["label"].isna()].index:
        token = dataset["tokens"][indexWithNullLabel]
        #split with ' ' doesnt consider multiple spaces as one
        tokenslist = token.split()
        dataset["tokens"][indexWithNullLabel] = tokenslist[0]

        if (len(tokenslist) > 1):
            dataset["label"][indexWithNullLabel] = tokenslist[1]
        else:
            if verbose:
                print(dataset.loc[indexWithNullLabel, :])
            #Manual Correction for 5467 and 5858 (very & research)
            dataset["label"][indexWithNullLabel] = 'O'
            if verbose:
                print("Manual Corrected:", dataset["tokens"][indexWithNullLabel])
    dataset = dataset.applymap(stripSpaces)
    #label to handel 0, i*, b*, o*, 0*
    dataset[dataset["label"] == 'ii'] = 'i'
    dataset[dataset["label"] == '0'] = 'o'
    if verbose:
        print("Removing special characters")
    specialCharTokens = dataset[~(dataset["tokens"].str.isalnum())]["tokens"].unique()
    #for sprecialChar with label B, moving label to next row and droping rows  
    specialCharWithB = dataset[dataset["tokens"].isin(specialCharTokens) & (dataset["label"] == 'b')].index
    for i in specialCharWithB:
        dataset.loc[i+1, "label"] = 'b'
    dataset.drop(dataset[dataset["tokens"].isin(specialCharTokens) & ((dataset["label"] == 'o') | (dataset["label"] == 'b') )].index, inplace=True)
    #Drop i where there is i and b before it
    toDrop = []
    for i in dataset[dataset["tokens"].isin(specialCharTokens)].index:
        if(dataset["label"][i-1] == 'b' or dataset["label"][i-1] == 'i' ):
            toDrop.append(i)
        else:
            dataset["label"][i] = 'b'
    dataset.drop(toDrop, axis=0, inplace=True)
    if verbose:
        print(dataset.value_counts()[:30])
        print("Removing Stopwords based on above listed most frequent words")
    stopwords = ["the","this","that","has","have","can","be","in","on","at","to","as","is","are","a","an","with","our","we","from","which","when","also","and","or","not","it","its",
                 "than","use","into","how","but","to","for","their","there","all"]
    if verbose:
        print("Label order correction:")
    dataset.reset_index(drop=True, inplace=True)
    temp = dataset.copy()
    temp["before"] = temp["label"].shift(1)
    temp["after"] = temp["label"].shift(-1)
    for i in temp[(temp["label"] == 'i') & (temp["before"] == 'o') ].index:
            # oio or oii
            if verbose:
                print(temp.loc[i-1, "tokens"]+"("+temp.loc[i-1, "label"]+")\t\t", temp.loc[i, "tokens"]+"("+temp.loc[i, "label"]+")\t\t", temp.loc[i+1, "tokens"]+"("+temp.loc[i+1, "label"]+")")
            if(temp.loc[i+1, "label"] == 'o' or temp.loc[i+1, "label"] == 'i'):
                dataset.loc[i, "label"] = 'b'
            # oib
            if(temp.loc[i+1, "label"] == 'b'):
                dataset.loc[i, "label"] = 'b'
                dataset.loc[i+1, "label"] = 'i'
    del temp
    if verbose:
        print(dataset[(dataset["tokens"].isin(stopwords))]["tokens"].value_counts().index)
    dataset.drop(dataset[dataset["tokens"].isin(dataset)].index, inplace=True)
    dataset.reset_index(drop=True, inplace=True)
    return dataset

In [42]:
train = cleaning(train, verbose=False)
test = cleaning(test, verbose=False)
dev = cleaning(dev, verbose=False)

In [43]:
# of and the is often used in between 
# And seprates the Keywords may be good to not remove index [:]
# Arabic NER only NER is Keyword
# languages, multiple languages, as keyword
# linguistic phenomena across languages
#
#
#
#
# for i in temp[(temp["tokens"].isin(temp[(temp["tokens"].isin(stopwords))]["tokens"].value_counts().index)) & (temp["label"] != 'o')].index[:]:
#     print(temp.loc[i, 'file'] +"(",i,"):"+
#           temp.loc[i-4, 'tokens']+"(" + temp.loc[i-4, 'label'] +") "+ temp.loc[i-3, 'tokens']+"(" + temp.loc[i-3, 'label'] +") "+
#           temp.loc[i-2, 'tokens']+"(" + temp.loc[i-2, 'label'] +") "+ temp.loc[i-1, 'tokens']+"(" + temp.loc[i-1, 'label'] +") "+
#           temp.loc[i, 'tokens']+"(" + temp.loc[i, 'label'] +") "+ temp.loc[i+1, 'tokens']+"(" + temp.loc[i+1, 'label'] +") "+
#           temp.loc[i+2, 'tokens']+"(" + temp.loc[i+2, 'label'] +") "+ temp.loc[i+3, 'tokens']+"(" + temp.loc[i+3, 'label'] +") ")
#
#
#
#Removing these from stopwords as 
# there may be contained in a key word
#and, of, on, all, for
# leading tokens for keywords
#the, a, and, for, in, to, on, that, as, with, an, our, from, this, is, are 

In [44]:
def featurePreparation(_dataset):
    dataset = _dataset.copy()
    dataset["t1"] = dataset["tokens"].shift(3)
    dataset["t2"] = dataset["tokens"].shift(2)
    dataset["t3"] = dataset["tokens"].shift()
    dataset["t4"] = dataset["tokens"] #term to classify
    dataset["t5"] = dataset["tokens"].shift(-1)
    dataset["t6"] = dataset["tokens"].shift(-2)
    dataset["t7"] = dataset["tokens"].shift(-3)
    dataset["file2"] = dataset["file"].shift()
    dataset.loc[dataset[dataset["file"] != dataset["file2"]].index, ["t1", "t2", "t3"]] = [float('nan'), float('nan'), float('nan')]
    dataset = dataset.drop(["tokens", "file2", "file"], axis=1)
    return dataset

In [45]:
train = featurePreparation(train)
test = featurePreparation(test)
dev = featurePreparation(dev)

In [46]:
# #investigating small tokens
# print("len(Tokens) <= 1 and is term")
# train[(train["t4"].str.len() == 1) & (train["label"] != 'o')]["t4"].unique()
# print("len(Tokens) = 1")
# train[(train["t4"].str.len() == 1)]["t4"].unique()
# print("len(Tokens) = 2")
# train[(train["t4"].str.len() == 2) & (train["label"] != 'o')]["t4"].unique()
# print("len(Tokens) = 2 and is term")
# train[(train["t4"].str.len() == 2)]["t4"].unique()
# print("len(Tokens) = 1")
# train[(train["t4"].str.len() == 3) & (train["label"] != 'o')]["t4"].unique()
# print("len(Tokens) = 1 and is term")
# train[(train["t4"].str.len() == 3)]["t4"].unique()

In [47]:
tokens = train.t4.unique()
tokens = np.append(tokens, '_nan')
tokens = np.append(tokens, 'new_token')
decoderDic = dict(enumerate(tokens, 1))
encoderDic = {v: k for k, v in decoderDic.items()}
labelencodeing = {'b': 0, 'i': 1, 'o': 2}
labeldecodeing = {v: k for k, v in labelencodeing.items()}

In [48]:
def encode(token):
    try:
        return encoderDic[token]
    except:
        if(token == float('nan')):
            return encoderDic["_nan"]
        else:
            return encoderDic["new_token"]
def decode(embedding):
    try:
        return decoderDic[embedding]
    except:
        if(embedding == encoderDic["_nan"]):
            return decoderDic["_nan"]
        elif(embedding == encoderDic["new_token"]):
            return decoderDic["new_token"]

scaler = preprocessing.StandardScaler()
        
def encodeDataset(_dataset, train=False):
    dataset = _dataset.copy()
    dataset["t1"] = dataset["t1"].map(encode)
    dataset["t2"] = dataset["t2"].map(encode) 
    dataset["t3"] = dataset["t3"].map(encode) 
    dataset["t4"] = dataset["t4"].map(encode) 
    dataset["t5"] = dataset["t5"].map(encode) 
    dataset["t6"] = dataset["t6"].map(encode) 
    dataset["t7"] = dataset["t7"].map(encode)
    dataset["label"] = dataset["label"].map(lambda x: labelencodeing[x] )
    return dataset["label"], dataset[["t1","t2","t3","t4","t5","t6","t7"]]

In [49]:
trainY, trainX = encodeDataset(train)
devY, devX = encodeDataset(dev)
testY, testX = encodeDataset(test)

In [50]:
pipe = pipeline.make_pipeline(preprocessing.StandardScaler(),  ensemble.RandomForestClassifier())
pipe.fit(trainX, trainY)
y_hat = pipe.predict(testX) 

In [34]:
print(metrics.classification_report(testY, y_hat, target_names=['B', 'I', 'O']))

              precision    recall  f1-score   support

           B       0.49      0.22      0.30       322
           I       0.54      0.24      0.33       315
           O       0.79      0.95      0.86      1823

    accuracy                           0.76      2460
   macro avg       0.61      0.47      0.50      2460
weighted avg       0.72      0.76      0.72      2460



In [51]:
metrics.confusion_matrix(testY, y_hat)

array([[  62,   28,  232],
       [  20,   75,  220],
       [  55,   41, 1727]], dtype=int64)