In [2]:
import os
import nltk
import pandas as pd
import numpy as np
from sklearn import preprocessing, pipeline, svm, linear_model, neighbors, metrics, ensemble
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torchvision import transforms, datasets
from nltk.stem import WordNetLemmatizer
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag
from nltk.corpus import stopwords as sw, wordnet as wn
import re
import string 
import copy
from unidecode import unidecode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
dataPath = ".\\data"

In [3]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

In [4]:
# print("Training Size before Cleaning")
# print("Training Size:\t", train.shape)
# print("Test Size:\t", test.shape)
# print("Dev Size:\t", dev.shape)

In [67]:
stopwords = ["about", "all", "also", "among", "at", "available", "be", "because", "been", "both", "but", "by", "can", "each", "first", "has", "have", "here", "how",
             "however", "into", "it", "its", "large", "learn", "many", "may", "more", "most", "much", "new", "not", "often", "only", "or", "other", "over", "recent", "related", "same",
             "several", "shown", "some", "studies", "such", "than", "their", "them", "then", "there", "these", "they", "those", "through", "use", "used", "we", "well", "what",
             "when", "where", "which"]
def find_files(search_path):
    result = []
    # Walking top-down from the root
    for root, dir, files in os.walk(search_path):
        for file in files:       
                result.append(os.path.join(root, file))
    return result

def loadData(paths):
    data = pd.DataFrame(columns=["tokens", "label"])
    for i in paths:
        try:
            doc = pd.read_csv(i, sep="\t", names=["tokens", "label"], header=None)
            doc['file'] = i[12:-5]
            data = pd.concat([data, doc], ignore_index=True)
        except Exception as e: 
            print(i, e)
    return data

def stripSpaces(x):
    x = unidecode(x)
    specialchar = "!@#$%^&*()[]{};:,./<>?\|`-~=_+\t\n"
    for tag in specialchar:
        x = x.replace(tag, '')
    x = x.replace(" ", "")
    x = x.lower()
    x = x.strip()
    return x

def cleaning(_dataset, lemma=True, pos=False, verbose=True):
    dataset = _dataset.copy()
    dataset.drop(dataset[dataset["tokens"].isna()].index, inplace=True)
    if verbose: 
        print("Size after Dropping Null Tokens",dataset.shape)
        print("Tokens Without labels:")
    for indexWithNullLabel in dataset[dataset["label"].isna()].index:
        token = dataset["tokens"][indexWithNullLabel]
        #split with ' ' doesnt consider multiple spaces as one
        tokenslist = token.split()
        dataset["tokens"][indexWithNullLabel] = tokenslist[0]

        if (len(tokenslist) > 1):
            dataset["label"][indexWithNullLabel] = tokenslist[1]
        else:
            if verbose:
                print(dataset.loc[indexWithNullLabel, :])
            #Manual Correction for 5467 and 5858 (very & research)
            dataset["label"][indexWithNullLabel] = 'O'
            if verbose:
                print("Manual Corrected:", dataset["tokens"][indexWithNullLabel])
    dataset = dataset.applymap(stripSpaces)
    #label to handel 0, i*, b*, o*, 0*
    dataset[dataset["label"] == 'ii'] = 'i'
    dataset[dataset["label"] == '0'] = 'o'
    if verbose:
        print("Removing special characters")
    specialCharTokens = dataset[~(dataset["tokens"].str.isalnum())]["tokens"].unique()
    #for sprecialChar with label B, moving label to next row and droping rows  
    specialCharWithB = dataset[dataset["tokens"].isin(specialCharTokens) & (dataset["label"] == 'b')].index
    for i in specialCharWithB:
        dataset.loc[i+1, "label"] = 'b'
    dataset.drop(dataset[dataset["tokens"].isin(specialCharTokens) & ((dataset["label"] == 'o') | (dataset["label"] == 'b') )].index, inplace=True)
    #Drop i where there is i and b before it
    toDrop = []
    for i in dataset[dataset["tokens"].isin(specialCharTokens)].index:
        if(dataset["label"][i-1] == 'b' or dataset["label"][i-1] == 'i' ):
            toDrop.append(i)
        else:
            dataset["label"][i] = 'b'
    dataset.drop(toDrop, axis=0, inplace=True)
    if verbose:
        print(dataset.value_counts()[:30])
        print("Removing Stopwords based on above listed most frequent words")
    stopwords = ["the","this","that","has","have","can","be","in","on","at","to","as","is","are","a","an","with","our","we","from","which","when","also","and","or","not","it","its",
                 "than","use","into","how","but","to","for","their","there","all"]
    if verbose:
        print("Label order correction:")
    dataset.reset_index(drop=True, inplace=True)
    temp = dataset.copy()
    temp["before"] = temp["label"].shift(1)
    temp["after"] = temp["label"].shift(-1)
    for i in temp[(temp["label"] == 'i') & (temp["before"] == 'o') ].index:
            # oio or oii
            if verbose:
                print(temp.loc[i-1, "tokens"]+"("+temp.loc[i-1, "label"]+")\t\t", temp.loc[i, "tokens"]+"("+temp.loc[i, "label"]+")\t\t", temp.loc[i+1, "tokens"]+"("+temp.loc[i+1, "label"]+")")
            if(temp.loc[i+1, "label"] == 'o' or temp.loc[i+1, "label"] == 'i'):
                dataset.loc[i, "label"] = 'b'
            # oib
            if(temp.loc[i+1, "label"] == 'b'):
                dataset.loc[i, "label"] = 'b'
                dataset.loc[i+1, "label"] = 'i'
    del temp
    if verbose:
        print(dataset[(dataset["tokens"].isin(stopwords))]["tokens"].value_counts().index)
    dataset.drop(dataset[dataset["tokens"].isin(dataset)].index, inplace=True)
    if pos:
        dataset["POS"] = [x[1] for x in nltk.pos_tag(dataset["tokens"])]
    if lemma:
        lemmatizer = WordNetLemmatizer()
        dataset["tokens"] = dataset["tokens"].apply(lambda x: lemmatizer.lemmatize(x))
    dataset.reset_index(drop=True, inplace=True)
    return dataset

def featurePreparation(_dataset, ref=1):
    dataset = _dataset.copy()
    if  ref == 0:
        dataset["text"] = dataset["tokens"]
    elif ref == 1:
        dataset["text"] = dataset["tokens"].shift(fill_value= "") + " " + dataset["tokens"] \
                            + " " + dataset["tokens"].shift(-1, fill_value= "")
    elif ref == 2:
        dataset["text"] = dataset["tokens"].shift(2, fill_value= "") + " " + dataset["tokens"].shift(fill_value= "") \
                            + " " + dataset["tokens"] + " " + dataset["tokens"].shift(-1, fill_value= "") \
                            + " " + dataset["tokens"].shift(-2, fill_value= "")
    elif ref == 3:
        dataset["text"] = dataset["tokens"].shift(3, fill_value= "") + " " + dataset["tokens"].shift(2, fill_value= "") \
                            + " " + dataset["tokens"].shift(fill_value= "") + " " + dataset["tokens"] \
                            + " " + dataset["tokens"].shift(-1, fill_value= "") \
                            + " " + dataset["tokens"].shift(-2, fill_value= "") + " " + dataset["tokens"].shift(-3, fill_value= "")
    dataset["text"] = dataset["tokens"]
    dataset = dataset.drop(["tokens", "file"], axis=1)
    return dataset

def pre_pipeline(ref=1, pos=False,lemma=True):
    trainFiles = find_files(dataPath + "\\train")
    testFiles = find_files(dataPath + "\\test")
    devFiles = find_files(dataPath + "\\dev")
    train = loadData(trainFiles)
    test  = loadData(testFiles)
    dev = loadData(devFiles)
    train = cleaning(train, lemma=lemma, pos=pos, verbose=False)
    test = cleaning(test, lemma=lemma, pos=pos, verbose=False)
    dev = cleaning(dev, lemma=lemma, pos=pos, verbose=False)
    train = featurePreparation(train, ref=ref)
    test = featurePreparation(test, ref=ref)
    dev = featurePreparation(dev, ref=ref)
    return train, test, dev

In [89]:
train, test, dev = pre_pipeline(pos=True,lemma=False, ref=3)

enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc = enc.fit(train["POS"].values.reshape(-1,1))
POS_hot = enc.transform(train["POS"].values.reshape(-1,1))
vectorizer = CountVectorizer()
trainX = vectorizer.fit_transform(train["text"])
print("Vobac Size", len(vectorizer.get_feature_names_out()))

POS_hotTest = enc.transform(test["POS"].values.reshape(-1,1))
POS_hotDev = enc.transform(dev["POS"].values.reshape(-1,1))
devX = vectorizer.transform(dev["text"])
testX = vectorizer.transform(test["text"])

trainX = np.c_[trainX.toarray(), POS_hot.toarray()]
testX = np.c_[testX.toarray(), POS_hotTest.toarray()]
devX = np.c_[devX.toarray(), POS_hotDev.toarray()]

le = preprocessing.LabelEncoder()
le.fit(["b", "i", "o"])
trainY = le.transform(train["label"])


devY = le.transform(dev["label"])

testY = le.transform(test["label"])

pipe = pipeline.make_pipeline(ensemble.RandomForestClassifier())
pipe.fit(trainX, trainY)
y_hat = pipe.predict(testX) 
print(metrics.classification_report(le.inverse_transform(testY), le.inverse_transform(y_hat)))
metrics.confusion_matrix(le.inverse_transform(testY), le.inverse_transform(y_hat))

              precision    recall  f1-score   support

           b       0.54      0.50      0.52       322
           i       0.53      0.38      0.44       315
           o       0.88      0.93      0.90      1823

    accuracy                           0.80      2460
   macro avg       0.65      0.60      0.62      2460
weighted avg       0.79      0.80      0.79      2460



array([[ 160,   51,  111],
       [  66,  120,  129],
       [  69,   56, 1698]], dtype=int64)

In [None]:
train, test, dev = pre_pipeline(pos=True,lemma=False, ref=2)

enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc = enc.fit(train["POS"].values.reshape(-1,1))
POS_hot = enc.transform(train["POS"].values.reshape(-1,1))
vectorizer = CountVectorizer()
trainX = vectorizer.fit_transform(train["text"])
print("Vobac Size", len(vectorizer.get_feature_names_out()))

POS_hotTest = enc.transform(test["POS"].values.reshape(-1,1))
POS_hotDev = enc.transform(dev["POS"].values.reshape(-1,1))
devX = vectorizer.transform(dev["text"])
testX = vectorizer.transform(test["text"])

trainX = np.c_[trainX.toarray(), POS_hot.toarray()]
testX = np.c_[testX.toarray(), POS_hotTest.toarray()]
devX = np.c_[devX.toarray(), POS_hotDev.toarray()]

le = preprocessing.LabelEncoder()
le.fit(["b", "i", "o"])
trainY = le.transform(train["label"])


devY = le.transform(dev["label"])

testY = le.transform(test["label"])

pipe = pipeline.make_pipeline(ensemble.RandomForestClassifier())
pipe.fit(trainX, trainY)
y_hat = pipe.predict(testX) 
print(metrics.classification_report(le.inverse_transform(testY), le.inverse_transform(y_hat)))
metrics.confusion_matrix(le.inverse_transform(testY), le.inverse_transform(y_hat))

Vobac Size 2980


In [None]:
train, test, dev = pre_pipeline(pos=True,lemma=False, ref=1)

enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc = enc.fit(train["POS"].values.reshape(-1,1))
POS_hot = enc.transform(train["POS"].values.reshape(-1,1))
vectorizer = CountVectorizer()
trainX = vectorizer.fit_transform(train["text"])
print("Vobac Size", len(vectorizer.get_feature_names_out()))

POS_hotTest = enc.transform(test["POS"].values.reshape(-1,1))
POS_hotDev = enc.transform(dev["POS"].values.reshape(-1,1))
devX = vectorizer.transform(dev["text"])
testX = vectorizer.transform(test["text"])

trainX = np.c_[trainX.toarray(), POS_hot.toarray()]
testX = np.c_[testX.toarray(), POS_hotTest.toarray()]
devX = np.c_[devX.toarray(), POS_hotDev.toarray()]

le = preprocessing.LabelEncoder()
le.fit(["b", "i", "o"])
trainY = le.transform(train["label"])


devY = le.transform(dev["label"])

testY = le.transform(test["label"])

pipe = pipeline.make_pipeline(ensemble.RandomForestClassifier())
pipe.fit(trainX, trainY)
y_hat = pipe.predict(testX) 
print(metrics.classification_report(le.inverse_transform(testY), le.inverse_transform(y_hat)))
metrics.confusion_matrix(le.inverse_transform(testY), le.inverse_transform(y_hat))

In [None]:
train, test, dev = pre_pipeline(pos=True,lemma=False, ref=0)

enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc = enc.fit(train["POS"].values.reshape(-1,1))
POS_hot = enc.transform(train["POS"].values.reshape(-1,1))
vectorizer = CountVectorizer()
trainX = vectorizer.fit_transform(train["text"])
print("Vobac Size", len(vectorizer.get_feature_names_out()))

POS_hotTest = enc.transform(test["POS"].values.reshape(-1,1))
POS_hotDev = enc.transform(dev["POS"].values.reshape(-1,1))
devX = vectorizer.transform(dev["text"])
testX = vectorizer.transform(test["text"])

trainX = np.c_[trainX.toarray(), POS_hot.toarray()]
testX = np.c_[testX.toarray(), POS_hotTest.toarray()]
devX = np.c_[devX.toarray(), POS_hotDev.toarray()]

le = preprocessing.LabelEncoder()
le.fit(["b", "i", "o"])
trainY = le.transform(train["label"])


devY = le.transform(dev["label"])

testY = le.transform(test["label"])

pipe = pipeline.make_pipeline(ensemble.RandomForestClassifier())
pipe.fit(trainX, trainY)
y_hat = pipe.predict(testX) 
print(metrics.classification_report(le.inverse_transform(testY), le.inverse_transform(y_hat)))
metrics.confusion_matrix(le.inverse_transform(testY), le.inverse_transform(y_hat))