In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

In [2]:
import ucto
import pickle
from tqdm import tqdm, trange

In [3]:
# Select the experiments to execute by listing their run names in `_run`. Every experiment cell further down in the
# notebook is guarded by its own run name. Listing "all" enables every experiment and overrules all other run names.
# Be warned: running all of them takes a seriously large amount of time.
#
# Example: _run = ["all"]
_run = []

def do_run(name):
    """Return True when `name` or the catch-all "all" is listed in `_run`, False otherwise."""
    for wanted in (name, "all"):
        if wanted in _run:
            return True
    return False

# Best result seen so far per run name, stored as [score, settings].
_run_max = {}

def update_run(name, value, settings):
    """Record and report `value` when it beats the best score stored for `name`.

    Parameters
    ----------
    name : run name used as key in `_run_max`.
    value : score obtained with `settings`.
    settings : description of the settings that produced `value`.

    A run without a stored result starts from a best score of 0, so only
    strictly positive scores are ever recorded. Each improvement is stored
    as ``[value, settings]`` and printed as a tab-separated line.
    """
    # Compare against the stored *score* (element 0); comparing the whole
    # [score, settings] list with a number raises TypeError on Python 3.
    best_so_far = _run_max.get(name, [0, ""])[0]
    if best_so_far < value:
        _run_max[name] = [value, settings]
        print("[%s]\t%s\t%s" % (name, value, settings))

In [4]:
def file_len(file):
    """Return the number of lines in the text file at path `file`.

    An empty file yields 0. A final line without a trailing newline still
    counts as a line.
    """
    with open(file) as f:
        # Counting with a generator avoids the unbound loop variable that a
        # manual enumerate loop leaves behind when the file is empty.
        return sum(1 for _ in f)

In [5]:
# Name of the ucto configuration handed to the tokeniser.
ucto_config = "tokconfig-nld"

# Shared tokeniser instance: the per-line-input flag is switched on, sentence and paragraph detection are switched off.
tokeniser = ucto.Tokenizer(
    ucto_config,
    sentenceperlineinput=True,
    sentencedetection=False,
    paragraphdetection=False,
)

# We read the file with ucto and tokenise it according to its default Dutch tokenisation scheme, which is rule-based
# and definitely better than a plain whitespace tokeniser from sklearn. Afterwards we concatenate the tokens back to a
# whitespace separated line, which can then be normally processed with sklearn's tokenisers.
def read_data(file):
    """Read a tab-separated corpus file and tokenise its sentences.

    Every non-blank line of `file` must hold a sentence and a language label
    separated by a tab. The label is whatever follows the *last* tab, so a
    sentence that itself contains tabs is still parsed. Blank lines are
    skipped. A non-blank line without any tab raises ValueError.

    Each sentence is fed to the module-level `tokeniser`; the resulting
    tokens are joined with single spaces, and a new output sentence is
    started whenever a token reports `isendofsentence()`.

    Returns a dict mapping each language label to the list of its tokenised
    sentences (strings), in file order.
    """
    text = {}
    with open(file) as f:
        for line in tqdm(f):
            line = line.strip()
            if not line:
                # e.g. a trailing empty line at the end of the file
                continue

            sentence, language = line.rsplit("\t", 1)
            tokeniser.process(sentence)

            sentences = text.setdefault(language, [])

            current_line = []
            for token in tokeniser:
                current_line.append(str(token))
                if token.isendofsentence():
                    sentences.append(" ".join(current_line))
                    current_line = []
    return text

In [6]:
# If this is the first run, then we have to tokenise the text. In other cases we probably have saved a pickled version
# somewhere. If not, we will tokenise the text anyway. No worries.

def _load_or_tokenise(source, description):
    """Return the tokenised corpus for `source`, using `source + '.pickle'` as a cache.

    When the cache file can be opened it is unpickled and returned. Otherwise
    (IOError) `source` is tokenised with `read_data` and the result is written
    to the cache. `description` only appears in the progress messages.
    """
    cache = source + '.pickle'
    try:
        # NOTE: unpickling can execute arbitrary code; only load cache files that this notebook wrote itself.
        with open(cache, 'rb') as f:
            text = pickle.load(f)
        print("Done reading %s set from pickle." % description)
    except IOError:
        text = read_data(source)
        print("Done tokenising %s set." % description)
        with open(cache, 'wb') as f:
            pickle.dump(text, f, pickle.HIGHEST_PROTOCOL)
        print("Done writing %s set to pickle." % description)
    return text


def _print_corpus_stats(title, text):
    """Print, per language, the number of sentences and the average number of whitespace-separated tokens."""
    print(title)
    print("\tLAN\tsize\tavg length")
    for language, sentences in text.items():
        n_tokens = sum(len(x.split()) for x in sentences)
        # A language without any sentence would otherwise divide by zero.
        average = n_tokens / len(sentences) if sentences else 0.0
        print("\t", language, "\t", len(sentences), "\t", average)


# First the development set
_l_dev_text = _load_or_tokenise('data/dev.txt', "development")
_print_corpus_stats("development set", _l_dev_text)

# And then the training set. This takes a bit more time...
_l_trn_text = _load_or_tokenise('data/train.txt', "training")
_print_corpus_stats("training set", _l_trn_text)
    

Done reading development set from pickle.
development set
	 BEL 	 250 	 40.456
	 DUT 	 250 	 40.088


Done reading training set from pickle.
training set
	 BEL 	 150000 	 40.273626666666665
	 DUT 	 150000 	 40.37152


In [9]:

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn.svm import SVC

In [10]:
# Flatten the per-language dictionaries into parallel numpy arrays: X_* holds the sentences and y_* holds, at the same
# position, the language label under which that sentence was stored.

X_training = np.array([s for l in _l_trn_text for s in _l_trn_text[l]])
y_training = np.array([l for l in _l_trn_text for _ in _l_trn_text[l]])

X_dev = np.array([s for l in _l_dev_text for s in _l_dev_text[l]])
y_dev = np.array([l for l in _l_dev_text for _ in _l_dev_text[l]])

In [11]:
# Sometimes, for testing whether some code works, you might want to use a subset. Use this one. Or another one. I don't
# care.

import random

# Number of training sentences in the subset (e.g. 100000 for a larger one).
use_size = 2500

# Sample from every valid index of X_training. A hard-coded range silently excludes the first and last sentence and
# breaks as soon as the training set has a different size. The sample size is capped at the number of sentences.
# NOTE(review): the sample is not seeded, so the subset (and the scores obtained on it) differ between runs.
use = random.sample(range(len(X_training)), min(use_size, len(X_training)))

In [12]:
# svc2: character and word n-grams, with count feature vectors, used in a linear support vector classifier setting

from sklearn.feature_extraction.text import CountVectorizer

#svc2_max = [0.578, '1, 2, 3, 5']
if do_run("svc2"):

    # Exhaustive search over all (min, max) character n-gram ranges within 1..7 and word n-gram ranges within 1..5.
    for min_cn in trange(1,8, desc="min char ngram"):
        for max_cn in trange(min_cn, 8, desc="max char ngram"):
            for min_n in trange(1,6, desc="min word ngram"):
                for max_n in trange(min_n,6, desc="max word ngram"):

                    char_counts = CountVectorizer(analyzer='char', ngram_range=(min_cn, max_cn))
                    word_counts = CountVectorizer(analyzer='word', ngram_range=(min_n, max_n),
                                                  token_pattern=u"(?u)\\b\\w+\\b")

                    pipeline = Pipeline([
                        ('union', FeatureUnion([('char', char_counts), ('words', word_counts)])),
                        ('svc', SVC(kernel='linear')),
                    ])

                    # Unlike the other experiments, this one is fitted on the complete training set (no `use` subset).
                    pipeline.fit(X_training, y_training)
                    score = pipeline.score(X_dev, y_dev)

                    update_run("svc2", score, "%s, %s, %s, %s" % (min_cn, max_cn, min_n, max_n))

In [13]:
# svc1: character and word n-grams, with tf-idf feature vectors, used in a linear support vector classifier setting

from sklearn.feature_extraction.text import TfidfVectorizer

svc1_max = [0.586, '1, 2, 4, 5']
#svc1_max = [0, ""]

if do_run("svc1"):

    # Exhaustive search over all (min, max) character n-gram ranges within 1..7 and word n-gram ranges within 1..5.
    for min_cn in trange(1,8, desc="min char ngram"):
        for max_cn in trange(min_cn, 8, desc="max char ngram"):
            for min_n in trange(1,6, desc="min word ngram"):
                for max_n in trange(min_n,6, desc="max word ngram"):

                    char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(min_cn, max_cn))
                    word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(min_n, max_n),
                                                 token_pattern=u"(?u)\\b\\w+\\b")

                    pipeline = Pipeline([
                        ('union', FeatureUnion([('char', char_tfidf), ('words', word_tfidf)])),
                        ('svc', SVC(kernel='linear')),
                    ])

                    # Fit on the sampled training subset, evaluate on the development set.
                    pipeline.fit(X_training[use], y_training[use])
                    score = pipeline.score(X_dev, y_dev)

                    update_run("svc1", score, "%s, %s, %s, %s" % (min_cn, max_cn, min_n, max_n))


In [14]:
# mnb1: character and word n-grams, with tf-idf feature vectors, used in a multinomial naive bayes setting

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

mnb1_max = [0.55, '4, 5, 5, 5']
#mnb1_max = [0, ""]

if do_run("mnb1"):

    # Exhaustive search over all (min, max) character n-gram ranges within 1..7 and word n-gram ranges within 1..5.
    for min_cn in trange(1,8, desc="min char ngram"):
        for max_cn in trange(min_cn, 8, desc="max char ngram"):

            for min_n in trange(1,6, desc="min word ngram"):
                for max_n in trange(min_n,6, desc="max word ngram"):

                    steps = [('char', TfidfVectorizer(analyzer='char', ngram_range=(min_cn,max_cn))),
                             ('words', TfidfVectorizer(analyzer='word', ngram_range=(min_n,max_n),token_pattern=u"(?u)\\b\\w+\\b"))
                            ]

                    union = FeatureUnion(steps)

                    pipeline = Pipeline([
                        ('union', union),
                        ('mnb', MultinomialNB()),
                    ])

                    # Fit on the sampled training subset, evaluate on the development set.
                    prediction = pipeline.fit(X_training[use], y_training[use])
                    score = prediction.score(X_dev, y_dev)

                    # The bookkeeping helper is called update_run; update_score does not exist in this notebook.
                    update_run("mnb1", score, "%s, %s, %s, %s" % (min_cn, max_cn, min_n, max_n))


In [15]:
# knn1: character and word n-grams, with tf-idf feature vectors, used in a k nearest neighbours setting

from sklearn.neighbors import KNeighborsClassifier

knn1_max = [0.562, '4, 3, 7, 5, 5']

if do_run("knn1"):

    # Search over the number of neighbours (1..6) on top of the usual character (1..7) and word (1..5) n-gram ranges.
    for neighbours in trange(1,7, desc="neighbours"):

        for min_cn in trange(1,8, desc="min char ngram"):
            for max_cn in trange(min_cn, 8, desc="max char ngram"):

                for min_n in trange(1,6, desc="min word ngram"):
                    for max_n in trange(min_n,6, desc="max word ngram"):

                        steps = [('char', TfidfVectorizer(analyzer='char', ngram_range=(min_cn,max_cn))),
                                 ('words', TfidfVectorizer(analyzer='word', ngram_range=(min_n,max_n),token_pattern=u"(?u)\\b\\w+\\b"))
                                ]

                        union = FeatureUnion(steps)

                        pipeline = Pipeline([
                            ('union', union),
                            # Step is named after the classifier it holds (it used to carry the naive bayes label).
                            ('knn', KNeighborsClassifier(n_neighbors=neighbours)),
                        ])

                        # Fit on the sampled training subset, evaluate on the development set.
                        prediction = pipeline.fit(X_training[use], y_training[use])
                        score = prediction.score(X_dev, y_dev)

                        update_run("knn1", score, "%s, %s, %s, %s, %s" % (neighbours, min_cn, max_cn, min_n, max_n))


In [16]:
# fasttext

In [17]:
# fat1: character and word n-grams, with embeddings, used in a fasttext (w2v sg) setting
# (the run name checked below is "fat1")

import fasttext

if do_run("fat1"):

    # Write the sampled training sentences to disk, one per line, each followed by its "__language__"-prefixed label.
    with open('fasttext.train.txt', 'w') as train_file:
        train_file.writelines(
            sentence + " __language__" + language + "\n"
            for sentence, language in zip(X_training[use], y_training[use])
        )

    ft_options = dict(min_count=1,
                      word_ngrams=3,
                      minn=7,
                      maxn=7,
                      thread=2,
                      label_prefix='__language__')
    ft_classifier = fasttext.supervised('fasttext.train.txt', 'model', **ft_options)

    # NOTE(review): the predictions are stored but not scored against y_dev in this cell.
    ft_predictions = ft_classifier.predict(X_dev)

# lm 
Which patterns are more important compared to the "all" background corpus?

In [18]:
# naive bayes

In [19]:
# llh

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

class LLHbasedBinaryVectorizer(BaseEstimator, TransformerMixin):
    """Binary feature extractor driven by a pattern list read from a file.

    The first tab-separated column of each of the first `count` lines of
    `path` is taken as a pattern. Every input line is turned into a list of
    0/1 values, one per pattern, telling whether the pattern occurs in the
    line as a substring.

    Parameters
    ----------
    count : maximum number of patterns to load (default 1000).
    path : pattern file to read (default 'data/DUT_BEL.t2m1l3.llh.top1000').

    Attributes
    ----------
    llh_1000 : list of the loaded patterns, in file order.
    """

    def __init__(self, count=1000, path='data/DUT_BEL.t2m1l3.llh.top1000'):
        # scikit-learn reads constructor arguments back from attributes of the same name (get_params, clone, repr),
        # so they have to be stored.
        self.count = count
        self.path = path

        self.llh_1000 = []
        with open(path, 'r') as f:
            for n, line in enumerate(f):
                # Stop *before* appending, so that exactly `count` patterns are loaded rather than count + 1.
                if n >= count:
                    break
                # Drop the line ending so that a line without a tab still yields a usable pattern.
                self.llh_1000.append(line.rstrip("\n").split("\t")[0])

    def llh_binary_countvectorizer(self, line):
        """Return a list with 1 for every loaded pattern that is a substring of `line` and 0 otherwise."""
        return [int(k in line) for k in self.llh_1000]

    def transform(self, df, y=None):
        """Return one binary feature list per element of `df`; `y` is ignored."""
        return [self.llh_binary_countvectorizer(l) for l in df]

    def fit(self, df, y=None):
        """Nothing is learned from the data; return self."""
        return self

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

svc3_max = [0, ""]

if do_run("svc3"):
    # Vary the number of patterns given to the llh vectoriser: 1, 501, 1001, ..., 9501.
    for count in tqdm(range(1, 10000, 500), desc="# llh counts"):

        # Only the llh features are active; a word n-gram CountVectorizer step used to be listed here, commented out:
        #('words', CountVectorizer(analyzer='word', ngram_range=(min_n,max_n),token_pattern=u"(?u)\\b\\w+\\b"))
        llh_features = FeatureUnion([('llh', LLHbasedBinaryVectorizer(count=count))])

        pipeline = Pipeline([
            ('union', llh_features),
            ('svc', SVC(kernel='linear')),
        ])

        # Fit on the sampled training subset, evaluate on the development set.
        pipeline.fit(X_training[use], y_training[use])
        score = pipeline.score(X_dev, y_dev)

        update_run("svc3", score, str(count))
            
       

# mlp
The MLP is sensitive to feature scaling.

In [22]:
# mlp1: character and word n-grams, with tf-idf feature vectors, used in a multilayer perceptron (adam) setting

from sklearn.neural_network import MLPClassifier

mlp1_max = [0, ""]

if do_run("mlp1"):
    # Search over the alpha values 1e-1 .. 1e-6 and a handful of hidden layer layouts, on top of the usual
    # character (1..7) and word (1..5) n-gram ranges.
    for alpha in tqdm(10.0 ** -np.arange(1, 7), desc="alpha"):
        for hls in tqdm([(5,2), (5,5), (10,2), (10,5), (50,2), (50,5), (50,10)], desc="hls"):
            for min_cn in trange(1,8, desc="min char ngram"):
                for max_cn in trange(min_cn, 8, desc="max char ngram"):
                    for min_n in trange(1,6, desc="min word ngram"):
                        for max_n in trange(min_n,6, desc="max word ngram"):

                            char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(min_cn, max_cn))
                            word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(min_n, max_n),
                                                         token_pattern=u"(?u)\\b\\w+\\b")
                            classifier = MLPClassifier(solver='adam', alpha=alpha, hidden_layer_sizes=hls,
                                                       random_state=1)

                            pipeline = Pipeline([
                                ('union', FeatureUnion([('char', char_tfidf), ('words', word_tfidf)])),
                                ('mlp1', classifier),
                            ])

                            # Fit on the sampled training subset, evaluate on the development set.
                            pipeline.fit(X_training[use], y_training[use])
                            score = pipeline.score(X_dev, y_dev)

                            update_run("mlp1", score, "%s, %s, %s, %s, %s, %s" % (alpha, hls, min_cn, max_cn, min_n, max_n))


In [24]:
# xgb1: character and word n-grams, with tf-idf feature vectors, used in an extreme gradient boost setting

import xgboost as xgb

if do_run("xgb1"):
    # GridSearchCV and StratifiedKFold are taken from sklearn.model_selection, the module this notebook already uses
    # for KFold and cross_val_score. The old sklearn.cross_validation module never exported GridSearchCV and has
    # since been removed from scikit-learn.
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    from sklearn.feature_extraction.text import TfidfVectorizer

    # XGBoost cannot be fitted on raw sentences, so the tf-idf n-gram features announced above are put in front of
    # it. The n-gram ranges are the settings stored in svc1_max ('1, 2, 4, 5').
    # NOTE(review): these ranges are a starting point, not the outcome of a search for this classifier.
    xgb_pipeline = Pipeline([
        ('union', FeatureUnion([
            ('char', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
            ('words', TfidfVectorizer(analyzer='word', ngram_range=(4, 5), token_pattern=u"(?u)\\b\\w+\\b")),
        ])),
        ('xgb', xgb.XGBClassifier()),
    ])

    # Grid of (single-valued) settings for the 'xgb' step of the pipeline.
    parameters = {'xgb__nthread':[1], #when use hyperthread, xgboost may become slower
                  'xgb__objective':['binary:logistic'],
                  'xgb__learning_rate': [0.05], #so called `eta` value
                  'xgb__max_depth': [6],
                  'xgb__min_child_weight': [11],
                  'xgb__silent': [1],
                  'xgb__subsample': [0.8],
                  'xgb__colsample_bytree': [0.7],
                  'xgb__n_estimators': [1000], #number of trees
                  'xgb__missing':[-999],
                  'xgb__seed': [1337]}

    clf = GridSearchCV(xgb_pipeline, parameters, n_jobs=5,
                       cv=StratifiedKFold(n_splits=5, shuffle=True),
                       scoring='accuracy',
                       verbose=2, refit=True)

    # Encode the string labels as integers, which every XGBoost version accepts.
    y_encoded = preprocessing.LabelEncoder().fit_transform(y_training[use])
    bst = clf.fit(X_training[use], y_encoded)

    # Plot the importances of the model that was actually fitted (refit=True), not of a fresh, untrained classifier.
    xgb_model = clf.best_estimator_.named_steps['xgb']

    xgb.plot_importance(xgb_model)
    plt.show()