In [2]:
# COMP 551 - Final Project
# Track 1, Paper 11

# Sebastian Andrade - 260513637
# Patrick Beland    - 260688796
# Bogdan Dumitru    - 260690446

In [17]:
#Common Imports
import csv
import os
import operator
from random import shuffle
from math import log2
import math, sys, random, string, re, csv
from glob import glob
import warnings
warnings.filterwarnings("ignore")

#Data Processing
import numpy as np
import pandas as pd

#Quality of life module showing time to completion of cell
from tqdm import tqdm

#NLTK imports
import nltk
from nltk.tbl.template import Template
from nltk.tag import RegexpTagger, BrillTaggerTrainer
from nltk.tag.brill import Pos, Word
from nltk.tag import UnigramTagger
from nltk.tag import tnt
from nltk import ngrams
from nltk import word_tokenize
nltk.download('punkt') #no need to run this line every time

#SciPy imports
from scipy.sparse import csr_matrix
from scipy.stats import entropy

#Scikit learn imports
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestCentroid
import sklearn.metrics 
from sklearn.metrics.pairwise import distance_metrics 
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

#Useful base directory path for importing files
base_dir = os.path.dirname(os.path.realpath('_file_'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bogdan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Baseline Implementation for Task 1 of Paper 11 - Language Identification

## Reads all text files in given directory
def process_folder(directory):
    data = []
    chars = []
    filename_order = []
    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename), encoding='utf-8') as f:
                par = []
                car = []
                content = f.read()
                for word in list(content):
                    par.append(ord(word))
                    car.append(word)
                filename_order.append(filename)
                chars.append(car)
                data.append(par)
    return data, filename_order, chars
        
def write_to_file(file_name, data):
    with open(file_name, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=' ')
        for row in data:
            writer.writerow(row)
    return 

## Represents text as tokens of n characters
def get_tokens(dataset, n):
    total_tokens = {}
    for paragraph in tqdm(dataset):
        str1 = ' '.join(str(e) for e in paragraph)
        tokens = word_tokenize(str1)
        grams = ngrams(tokens, n)
        for gram in grams:
            if gram in total_tokens:
                total_tokens[gram] = total_tokens.get(gram) + 1
            else:
                total_tokens[gram] = 1
    return total_tokens       

## Generates the frequency bag of words of N-grams
def make_tokens(stuff):
    total_tokens1 = get_tokens(stuff, 1)
    total_tokens2 = get_tokens(stuff, 2)
    total_tokens3 = get_tokens(stuff, 3)
    total_tokens4 = get_tokens(stuff, 4)
    total_tokens5 = get_tokens(stuff, 5)
    
    sorted_tokens1 = sorted(total_tokens1.items(), key=operator.itemgetter(1), reverse=True)
    sorted_tokens2 = sorted(total_tokens2.items(), key=operator.itemgetter(1), reverse=True)
    sorted_tokens3 = sorted(total_tokens3.items(), key=operator.itemgetter(1), reverse=True)
    sorted_tokens4 = sorted(total_tokens4.items(), key=operator.itemgetter(1), reverse=True)
    sorted_tokens5 = sorted(total_tokens5.items(), key=operator.itemgetter(1), reverse=True)

    sorted_tokens1 = sorted_tokens1[:1000]
    sorted_tokens2 = sorted_tokens2[:1000]
    sorted_tokens3 = sorted_tokens3[:1000]
    sorted_tokens4 = sorted_tokens4[:1000]
    sorted_tokens5 = sorted_tokens5[:1000]
    return [sorted_tokens1, sorted_tokens2, sorted_tokens3, sorted_tokens4, sorted_tokens5]

## Converts vocabuly into a dictionary for convenience
def make_dict(vocabs):
    list_dix = []
    for entry in vocabs:
        dick = {}
        for index, tup in enumerate(entry):
            dick[tup[0]] = index
        list_dix.append(dick)
    return list_dix

def load_vocab(file):
    vocab = []
    with open(file) as f:
        for line in f:
            cur_line = line.split()
            vocab.append((cur_line[0], cur_line[1], cur_line[2]))

## Creates 5000 features representing each text for the whole dataset
def featurize(data):
    counter = 0
    converted_data = []
    for paragraph in tqdm(data):
        str1 = ' '.join(str(e) for e in paragraph)
        if counter % 1000 == 0:
            print(str(int(counter / 1000) + 1) + " out of " + str(len(data)/1000))
        tokens = word_tokenize(str1)
        cur_paragraph = np.zeros(5000)
        for n in range(5):
            grams = ngrams(tokens, n+1)
            for gram in grams:
                if gram in total_vocab[n]:
                    cur_index = n * 1000 + total_vocab[n][gram]
                    cur_paragraph[cur_index] += 1
        counter += 1
        # normalize(cur_paragraph[:,np.newaxis], axis=0).ravel()
        #print(cur_paragraph)
        #print("norm: ")
        result =  normalize(cur_paragraph[:,np.newaxis], norm='l1', axis=0).ravel()
        
        #norm = np.linalg.norm(cur_paragraph)
        #print(norm2)
        #result = cur_paragraph/ norm2
        converted_data.append(result )
        #print("result: ")
        #print(result)
    return 

## Maps string labels to integer
def label_to_int(str_labels):
    lbls = [] 
    for lbl in str_labels:
        lbls.append(label_list.index(lbl))
    return np.array(lbls)


## Calculates the skewdivergence of given inputs
# alpha is the interpolation parameter
#epsilon ensures that there wont be divisions by zero
def skewDivergence(all_x, all_y=None, alpha=0.99, epsilon = 0.0000000000000000000001):
    result_labels = []
    for x in tqdm(all_x):
        min_dist = 1000000000
        
        for label, y in enumerate(all_y):
            y =  alpha * y + (1-alpha) * x + epsilon
            x = x + epsilon
            dist = 0
            for idx, yi in enumerate(y):
                dist += (x[idx] *(log2(x[idx]) - log2(yi)))
            if(dist < min_dist ):
                min_dist = dist
                lbl = label
        result_labels.append(lbl)
    return result_labels

In [11]:
#Language Identification Model Implementation

#Wikipedia dataset, included in submission
directory_wikipedia =  base_dir + '/naacl2010-langid/Wikipedia/'
filename_meta_wiki =  base_dir + '/naacl2010-langid/Wikipedia.meta'

data_wiki, order_filename, chars_wiki = process_folder(directory_wikipedia)

wiki_label = {}
with open(filename_meta_wiki, encoding='utf-8') as f:
    for line in f:
        a = line.split('\t')
        wiki_label[a[0]]=a[2]
        

trn = make_tokens(data_wiki)
dic_trn = make_dict(trn)
total_vocab = dic_trn
  
wiki_ds_features = featurize(data_wiki)

labels = []
for fn in order_filename:
    labels.append(wiki_label[fn])

label_list = (list(set(labels)))
    
    
labeled_data = []
for i in range(len(wiki_ds_features)):
    labeled_data.append( (wiki_ds_features[i], labels[i]) )
    
shuffle(labeled_data)

factor = 0.8
factor_ = 0.1

lmt = int(factor*len(labeled_data))
lmt_ = int( (factor+factor_) *len(labeled_data))


trn_x = []
tst_x = []
trn_y = []
tst_y = []
val_x = []
val_y = []

for idx, entry in enumerate(labeled_data):
    if(idx < lmt):
        #trn
        trn_x.append(entry[0])
        trn_y.append(entry[1])
    else:
        if(idx< lmt_):
            tst_x.append(entry[0])
            tst_y.append(entry[1])
        else:
            val_x.append(entry[0])
            val_y.append(entry[1])
        
trn_x_not_sparse = trn_x
tst_x_not_sparse = tst_x
trn_x = csr_matrix(trn_x)
tst_x = csr_matrix(tst_x)

tst_y = label_to_int(tst_y)
trn_y = label_to_int(trn_y)

distances = distance_metrics()

dictlist = []
for key, value in distances.items():
    dictlist.append(key)

dictlist.remove('manhattan')
dictlist.remove('precomputed')

clf = NearestCentroid()
clf.fit(trn_x_not_sparse, trn_y)
centroids = clf.centroids_

pred_y= skewDivergence(tst_x_not_sparse, centroids)

100%|██████████| 4963/4963 [00:45<00:00, 109.15it/s]
100%|██████████| 4963/4963 [00:47<00:00, 104.08it/s]
100%|██████████| 4963/4963 [01:11<00:00, 69.41it/s]
100%|██████████| 4963/4963 [00:50<00:00, 98.32it/s]
100%|██████████| 4963/4963 [00:54<00:00, 91.75it/s]
  0%|          | 0/4963 [00:00<?, ?it/s]

1 out of 4.963


 20%|██        | 998/4963 [00:31<02:04, 31.81it/s]

2 out of 4.963


 40%|████      | 1999/4963 [00:54<01:21, 36.36it/s]

3 out of 4.963


 60%|██████    | 2994/4963 [01:14<00:48, 40.21it/s]

4 out of 4.963


 80%|████████  | 3993/4963 [01:27<00:21, 45.75it/s]

5 out of 4.963


100%|██████████| 4963/4963 [01:39<00:00, 49.71it/s]


NameError: name 'distance_metrics' is not defined

In [18]:
#Language Identification Test Results

score = f1_score(tst_y, pred_y, average='macro')  
acc = accuracy_score(tst_y, pred_y)
print("Metric: SkewDivergence\tF1 Score: " + str(score) + "\tAccuracy: " + str(acc) )


clf_met = []
for idx, i in enumerate(dictlist):
    clf_met.append(NearestCentroid(metric=i))
    clf_met[idx].fit(trn_x, trn_y)
    y_pred= clf_met[idx].predict(tst_x)
    score = f1_score(tst_y, y_pred, average='macro')  
    acc = accuracy_score(tst_y, y_pred)
    print("Metric: " + i + "      \tF1 Score: " + str(score) + "\tAccuracy: " + str(acc) )  

Metric: SkewDivergence	F1 Score: 0.558739332650623	Accuracy: 0.8844221105527639
Metric: cityblock      	F1 Score: 0.4694705419201302	Accuracy: 0.7788944723618091
Metric: cosine      	F1 Score: 0.535226725369209	Accuracy: 0.7889447236180904
Metric: euclidean      	F1 Score: 0.5345437065378414	Accuracy: 0.7889447236180904
Metric: l2      	F1 Score: 0.5345437065378414	Accuracy: 0.7889447236180904
Metric: l1      	F1 Score: 0.4694705419201302	Accuracy: 0.7788944723618091


In [19]:
def file_len(file_name):
    with open(file_name, encoding='utf8') as f:
        for i, n in enumerate(f):
            pass
    return i+1

def generate_data(list_of_files, percent_files_used=1):
    tagged_sentences = []
    for name in list_of_files:
        abs_path = name
        print(abs_path)
        cur_line = []
        num_files = int(file_len(abs_path) * percent_files_used)
        with open(abs_path, encoding='utf8') as f:
            for index, line in enumerate(f):
                if index < num_files:
                    line = line.split()
                    if not line or line[0] == "#":
                        if cur_line:
                            tagged_sentences.append(cur_line)
                            cur_line = []
                    else:
                        cur_line.append((line[1], line[3]))
    return tagged_sentences
def give_me_the_money():
    minscores = [2]
    maxrules = [500]
    minacc = [0.99]
    cutoffs = [0.1]
    for minscore in minscores:
        for rule in maxrules:
            for acc in minacc:
                for cutoff_p in cutoffs:
                    cutoff = int(len(tagged_sentences_train) * cutoff_p )
                    Template._cleartemplates()
                    baseline_tagger = UnigramTagger(train=tagged_sentences_train[:cutoff], backoff=backoff)
                    templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
                    tt = BrillTaggerTrainer(baseline_tagger, templates=templates, trace=3)
                    tagger1 = tt.train(tagged_sentences_train[cutoff:], max_rules=rule, min_acc=acc, min_score=minscore)

                    print(
                    'params are: max_rules = ' + str(rule) +
                    " ,min_acc = " + str(acc) +
                    " ,min score = " + str(minscore) +
                    " ,cutoff = " + cutoff_p +
                    " ,number of files trained = " + str(len(tagged_sentences_train)) +
                    " ,number of files tested = " + str(len(tagged_sentences_valid))
                          )

                    print(tagger1.evaluate(tagged_sentences_valid))


In [None]:
# Baseline Implementation for Task 1 of Paper 11 - Part-of-Speech(POS) Tagging

#Define file locations
list_of_files_train = glob(base_dir + '/ud-treebanks-v1.1/Training_Data/*')
list_of_files_test = glob(base_dir + '/ud-treebanks-v1.1/Test_Data/*')

tagged_sentences_train = generate_data(list_of_files_train, percent_files_used=1)
tagged_sentences_test = generate_data(list_of_files_test)



N = [200]
for n in N:
    model = tnt.TnT(N = n)
    model.train(tagged_sentences_train)
    test = model.evaluate(tagged_sentences_test)
    sp_un = s.unknown / (s.known + s.unknown)
    sp_kn = s.known / (s.known + s.unknown)
    print("N = ", n)
    print("Accuracy: ", test)
    print('Percentage known:', sp_kn)
    print('Percentage unknown:', sp_un)
    print('Accuracy over known words:', (sacc / sp_kn))


D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\bg-ud-train.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\cs-ud-train-c.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\cs-ud-train-l.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\cs-ud-train-m.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\cs-ud-train-v.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\da-ud-train.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\de-ud-train.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\el-ud-train.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_Project/ud-treebanks-v1.1/Training_Data\en-ud-train.conllu
D:\My Doc\Homework\2018 Winter\COMP 551\Final_P

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
give_me_the_money()