In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

import ucto
import pickle

In [2]:
from sklearn.metrics import confusion_matrix, accuracy_score
import itertools

import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current matplotlib figure; the caller is
    responsible for creating the figure and for calling `plt.show()`.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix, rows are true labels and columns predicted labels.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool
        When True every row is divided by its total, so cells show fractions.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any true sample has a row total of 0. Dividing by 1
        # there keeps the row at 0 instead of filling it with NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text to stay readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out only after the axis labels exist, so they are taken into account.
    plt.tight_layout()




In [3]:
# Here you can enter the parts that you want to run. The identifiers are the run names, and you can find them further
# in the notebook. Use "all" to run all of them; this option overrules all other run names. Also note that it takes a
# seriously large amount of time to run them all.

# Names of the runs to execute; "all" enables every run.
_run = set()
# When True, only a random subset of the training data is used.
_use_subset = False
# When True, confusion matrices are plotted; otherwise they are printed.
_show_graphics = True

def do_run(name):
    """Return True when the run called `name` is enabled, either by its own
    name or through the special name "all"."""
    for candidate in (name, "all"):
        if candidate in _run:
            return True
    return False

# Per-run bookkeeping, keyed by run name.
_run_stats = {}

def register_run(name, description, params):
    """Create the bookkeeping entry for run `name`.

    `description` is stored as given and every name in `params` starts out
    with an empty-string value. A run that is already registered is left
    untouched, so its recorded results survive.
    """
    if name in _run_stats:
        return

    _run_stats[name] = {
        'description': description,
        'params': {param: '' for param in params},
        'best': 0,
        'best_predictions': [],
        'orig_predictions': [],
        'history': [],
    }

def update_stats(name, y_test, y_pred, params):
    """Record the accuracy of one experiment of run `name`.

    The accuracy of `y_pred` against `y_test` is appended to the run's
    history. When it is strictly higher than the best accuracy so far, the
    best accuracy, the settings in `params` and both label sequences are
    stored as the new best result.
    """
    score = accuracy_score(y_test, y_pred)
    stats = _run_stats[name]
    stats['history'].append(score)

    if score > stats['best']:
        stats['best'] = score
        stats['params'].update({key: params[key] for key in params})
        stats['best_predictions'] = y_pred
        stats['orig_predictions'] = y_test
        
def summarise_run(name):
    """Print the best result of run `name` and show its confusion matrix.

    Prints the run name, the best settings and the best accuracy. The
    confusion matrix of the best predictions is plotted when
    `_show_graphics` is set, and printed otherwise. An unknown run name, or
    a run for which no predictions were recorded, only produces a message.
    """
    if name not in _run_stats:
        print("Name %s is not registered." % (name))
        return

    stats = _run_stats[name]
    print("Model:    \n\t%s" % (name))
    print("Settings: \n\t%s" % (stats['params']))
    print("Accuracy: \n\t%s" % (stats['best']))

    # A run that was registered but never improved on the initial accuracy of
    # 0 still has its empty placeholder lists; there is nothing to tabulate.
    if len(stats['best_predictions']) == 0:
        print("No predictions recorded for %s; skipping the confusion matrix." % (name))
        return

    cm = confusion_matrix(stats['orig_predictions'], stats['best_predictions'])

    if _show_graphics:
        plt.figure()
        plot_confusion_matrix(cm, classes=["BEL", "DUT"], title=name + " " + str(stats['params']))
        plt.show()
    else:
        print(cm)

def summarise_all():
    """Summarise every registered run, with a blank line after each one."""
    for run_name in _run_stats:
        summarise_run(run_name)
        print()
        
        
import sys
# A script executed from a file has `__file__` defined; a notebook kernel does
# not. That difference selects the progress bars and the default options.
if not (__name__ == '__main__' and '__file__' in globals()):
    # running in notebook: widget progress bars, and only a training subset
    from tqdm import tqdm_notebook as tqdm, tnrange as tnrange

    _use_subset = True
else:
    # running in console: no plots, plain-text progress bars
    _show_graphics = False
    from tqdm import tqdm as tqdm, trange as tnrange

    # Command-line switches: "r:<name>" enables a run, "o:subset" selects the
    # training subset. Every other argument is ignored.
    for argument in sys.argv:
        if argument == "o:subset":
            _use_subset = True
        elif argument.startswith("r:"):
            _run.add(argument[2:])

In [4]:
def file_len(file):
    """Return the number of lines in the text file at path `file`.

    An empty file has 0 lines. A final line without a trailing newline still
    counts as a line.
    """
    with open(file) as f:
        # Counting while iterating avoids the unbound loop variable that the
        # enumerate-based version hit on an empty file.
        return sum(1 for _ in f)

In [5]:
# Name of the ucto settings used for tokenising (the Dutch configuration).
ucto_config = "tokconfig-nld"

# One shared tokeniser instance, reused for every line that is read.
tokeniser = ucto.Tokenizer(
    ucto_config,
    sentenceperlineinput=True,
    sentencedetection=False,
    paragraphdetection=False,
)

# We read the file with ucto and tokenise it according to its default Dutch tokenisation scheme, which is rule-based
# and definitely better than a plain whitespace tokeniser from sklearn. Afterwards we concatenate the tokens back to a
# whitespace-separated line, which can then be processed normally with sklearn's tokenisers.
def read_data(file):
    """Read a labelled corpus and return its tokenised sentences per label.

    Every non-blank line of `file` must hold a sentence and its language
    label, separated by a tab. The label is the text after the last tab, so
    a sentence may itself contain tabs.

    Parameters
    ----------
    file : str
        Path of the tab-separated corpus file, read as UTF-8.

    Returns
    -------
    dict
        Maps each language label to a list of sentences, each sentence being
        its tokens joined by single spaces.

    Raises
    ------
    ValueError
        When a non-blank line contains no tab at all.
    """
    text = {}
    with open(file, encoding="utf-8") as f:
        for line_number, line in enumerate(tqdm(f), start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing one) carry no sentence.
                continue

            fields = line.rsplit("\t", 1)
            if len(fields) != 2:
                raise ValueError("%s, line %d: expected '<sentence>\\t<language>'" % (file, line_number))
            sentence, language = fields
            tokeniser.process(sentence)

            sentences = text.setdefault(language, [])

            current_line = []
            for token in tokeniser:
                current_line.append(str(token))
                if token.isendofsentence():
                    sentences.append(" ".join(current_line))
                    current_line = []

            # Keep tokens that were not closed by an end-of-sentence marker
            # instead of silently dropping them.
            if current_line:
                sentences.append(" ".join(current_line))
    return text

In [6]:
# If this is the first run, then we have to tokenise the text. In other cases we probably have saved a pickled version
# somewhere. If not, we will tokenise the text anyway. No worries.

def _load_tokenised(source, description):
    """Return the tokenised corpus for `source`, using a pickle cache.

    The cache lives next to the corpus as `<source>.pickle`. When it cannot
    be opened, the corpus is tokenised with `read_data` and the cache is
    written. `description` is only used in the progress messages.
    """
    cache = source + '.pickle'
    try:
        # NOTE: unpickling runs code stored in the file; only load caches that
        # were written by this notebook.
        with open(cache, 'rb') as f:
            text = pickle.load(f)
        print("Done reading %s set from pickle." % description)
    except IOError:
        text = read_data(source)
        print("Done tokenising %s set." % description)
        with open(cache, 'wb') as f:
            pickle.dump(text, f, pickle.HIGHEST_PROTOCOL)
        print("Done writing %s set to pickle." % description)
    return text


def _print_corpus_stats(title, text):
    """Print, per language, the number of sentences and their average length
    in whitespace-separated tokens."""
    print(title)
    print("\t LAN\t size \t avg length")
    for language, sentences in text.items():
        # A language without sentences has no meaningful average; report 0.
        average = sum(len(x.split()) for x in sentences) / len(sentences) if sentences else 0.0
        print("\t", language, "\t", len(sentences), "\t", average)


# First the development set
_l_dev_text = _load_tokenised('data/dev.txt', 'development')
_print_corpus_stats("development set", _l_dev_text)

# And then the training set. This takes a bit more time...
_l_trn_text = _load_tokenised('data/train.txt', 'training')
_print_corpus_stats("training set", _l_trn_text)
    

Done reading development set from pickle.
development set
	 LAN	 size 	 avg length
	 BEL 	 250 	 40.456
	 DUT 	 250 	 40.088
Done reading training set from pickle.
training set
	 LAN	 size 	 avg length
	 BEL 	 150000 	 40.273626666666665
	 DUT 	 150000 	 40.37152


In [7]:

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn.svm import SVC

In [8]:
# Here we convert the training and development material into the right shape, and make sure that we also keep track of
# the labels.

# Flatten the per-language sentence lists into parallel arrays: one entry per
# sentence in X_*, and the language of that sentence at the same index in y_*.
X_training = np.array([sentence
                       for sentences in _l_trn_text.values()
                       for sentence in sentences])
y_training = np.array([language
                       for language, sentences in _l_trn_text.items()
                       for _ in sentences])

X_dev = np.array([sentence
                  for sentences in _l_dev_text.values()
                  for sentence in sentences])
y_dev = np.array([language
                  for language, sentences in _l_dev_text.items()
                  for _ in sentences])

In [9]:
# Sometimes, for testing whether some code works, you might want to use a subset. Use this one. Or another one. I don't
# care.

import random

_SUBSET_SIZE = 10000   # use e.g. 100000 for a larger subset
_SUBSET_SEED = 1       # fixed seed, so that the same subset is drawn on every run

# Draw the indices from the data that is actually loaded. A hard-coded index
# range breaks on a smaller corpus and when this cell is run a second time.
use = random.Random(_SUBSET_SEED).sample(range(len(X_training)),
                                         min(_SUBSET_SIZE, len(X_training)))

if _use_subset:
    X_training = X_training[use]
    y_training = y_training[use]

    print("training subset")

training subset


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

if do_run("svc2"):

    register_run(
        "svc2",
        "character and word n-grams, with count feature vectors, used in a linear support vector classifier setting",
        ['min_cn', 'max_cn', 'min_n', 'max_n'])

    # Exhaustive sweep over every (min, max) character n-gram range up to 7
    # combined with every (min, max) word n-gram range up to 5.
    for min_cn in tnrange(1, 8, desc="min char ngram"):
        for max_cn in tnrange(min_cn, 8, desc="max char ngram"):
            for min_n in tnrange(1, 6, desc="min word ngram"):
                for max_n in tnrange(min_n, 6, desc="max word ngram"):
                    char_counts = CountVectorizer(analyzer='char', ngram_range=(min_cn, max_cn))
                    word_counts = CountVectorizer(analyzer='word', ngram_range=(min_n, max_n),
                                                  token_pattern=u"(?u)\\b\\w+\\b")

                    # Both count vectors are joined into one feature matrix
                    # before they reach the classifier.
                    pipeline = Pipeline([
                        ('union', FeatureUnion([('char', char_counts), ('words', word_counts)])),
                        ('svc', SVC(kernel='linear')),
                    ])

                    model = pipeline.fit(X_training, y_training)
                    y_pred = model.predict(X_dev)

                    settings = {'min_cn': min_cn, 'max_cn': max_cn, 'min_n': min_n, 'max_n': max_n}
                    update_stats("svc2", y_dev, y_pred, settings)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

if do_run("svc1"):

    register_run("svc1",
                 "character and word n-grams, with tf-idf feature vectors, used in a linear support vector classifier setting",
                 ['min_cn', 'max_cn', 'min_n', 'max_n'])

    # Exhaustive sweep over every (min, max) character n-gram range up to 7
    # combined with every (min, max) word n-gram range up to 5.
    for min_cn in tnrange(1,8, desc="min char ngram"):
        for max_cn in tnrange(min_cn, 8, desc="max char ngram"):

            for min_n in tnrange(1,6, desc="min word ngram"):
                for max_n in tnrange(min_n,6, desc="max word ngram"):

                    steps = [('char', TfidfVectorizer(analyzer='char', ngram_range=(min_cn,max_cn))),
                             ('words', TfidfVectorizer(analyzer='word', ngram_range=(min_n,max_n),token_pattern=u"(?u)\\b\\w+\\b"))
                            ]

                    union = FeatureUnion(steps)

                    pipeline = Pipeline([
                        ('union', union),
                        ('svc', SVC(kernel='linear')),
                    ])

                    # Fit once per setting. The earlier version fitted the
                    # same pipeline twice and never used the first result,
                    # which doubled the training time.
                    model = pipeline.fit(X_training, y_training)
                    y_pred = model.predict(X_dev)

                    update_stats("svc1", y_dev, y_pred, {'min_cn': min_cn, 'max_cn': max_cn, 'min_n': min_n, 'max_n': max_n})


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

if do_run("mnb1"):

    register_run(
        "mnb1",
        "character and word n-grams, with tf-idf feature vectors, used in a multinominal naive bayes setting",
        ['min_cn', 'max_cn', 'min_n', 'max_n'])

    # Exhaustive sweep over every (min, max) character n-gram range up to 7
    # combined with every (min, max) word n-gram range up to 5.
    for min_cn in tnrange(1, 8, desc="min char ngram"):
        for max_cn in tnrange(min_cn, 8, desc="max char ngram"):
            for min_n in tnrange(1, 6, desc="min word ngram"):
                for max_n in tnrange(min_n, 6, desc="max word ngram"):
                    settings = {'min_cn': min_cn, 'max_cn': max_cn, 'min_n': min_n, 'max_n': max_n}

                    char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(min_cn, max_cn))
                    word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(min_n, max_n),
                                                 token_pattern=u"(?u)\\b\\w+\\b")

                    pipeline = Pipeline([
                        ('union', FeatureUnion([('char', char_tfidf), ('words', word_tfidf)])),
                        ('mnb', MultinomialNB()),
                    ])

                    model = pipeline.fit(X_training, y_training)
                    y_pred = model.predict(X_dev)

                    update_stats("mnb1", y_dev, y_pred, settings)


In [13]:
from sklearn.neighbors import KNeighborsClassifier

if do_run("knn1"):

    register_run("knn1",
             "character and word n-grams, with tf-idf feature vectors, used in a k nearest neighbours setting",
             ['neighbours', 'min_cn', 'max_cn', 'min_n', 'max_n'])

    # Exhaustive sweep: 1 to 6 neighbours, combined with every character
    # n-gram range up to 7 and every word n-gram range up to 5.
    for neighbours in tnrange(1,7, desc="neighbours"):

        for min_cn in tnrange(1,8, desc="min char ngram"):
            for max_cn in tnrange(min_cn, 8, desc="max char ngram"):

                for min_n in tnrange(1,6, desc="min word ngram"):
                    for max_n in tnrange(min_n,6, desc="max word ngram"):

                        steps = [('char', TfidfVectorizer(analyzer='char', ngram_range=(min_cn,max_cn))),
                                 ('words', TfidfVectorizer(analyzer='word', ngram_range=(min_n,max_n),token_pattern=u"(?u)\\b\\w+\\b"))
                                ]

                        union = FeatureUnion(steps)

                        # The classifier step is named 'knn', as in the knn2
                        # run; it used to carry the copy-pasted name 'mnb'.
                        pipeline = Pipeline([
                            ('union', union),
                            ('knn', KNeighborsClassifier(n_neighbors=neighbours)),
                        ])

                        model = pipeline.fit(X_training, y_training)
                        y_pred = model.predict(X_dev)

                        update_stats("knn1", y_dev, y_pred, {'neighbours': neighbours, 'min_cn': min_cn, 'max_cn': max_cn, 'min_n': min_n, 'max_n': max_n})


In [14]:
# fasttext

In [15]:
import fasttext

if do_run("fat1"):

    register_run(
        "fat1",
        "character and word n-grams, with embeddings, used in a fasttext (w2v sg) setting",
        [])

    # The training data goes to disk with the label appended to each sentence,
    # marked by the same prefix that is passed as `label_prefix` below.
    with open('fasttext.train.txt', 'w') as f:
        f.writelines(line + " __language__" + label + "\n"
                     for line, label in zip(X_training, y_training))

    ft_settings = dict(min_count=1,
                       word_ngrams=3,
                       minn=7,
                       maxn=7,
                       thread=2,
                       label_prefix='__language__')
    ft_classifier = fasttext.supervised('fasttext.train.txt', 'model', **ft_settings)
    ft_predictions = ft_classifier.predict(X_dev)

    update_stats("fat1", y_dev, ft_predictions, {})

In [16]:
# This is a new vectoriser based on the loglikelihood values as computed by colibricore-loglikelihood.
# Based on some thresholds (n-gram occurrence >= 2, 1 <= n <= 3 -grams), it computes the occurrence counts,
# their frequency and the corresponding loglikelihood scores.
# These scores are sorted, and then this vectoriser takes the top m patterns and marks, for every input line,
# whether each of those patterns is present in that line.

from sklearn.base import BaseEstimator, TransformerMixin

class LLHbasedBinaryVectorizer(BaseEstimator, TransformerMixin):
    """Binary bag-of-patterns vectoriser driven by a ranked pattern file.

    The first `count` lines of the pattern file are loaded; the pattern is
    the text before the first tab of each line. Every input line is then
    turned into a list of 0/1 values, one per pattern, telling whether the
    pattern occurs in the line.

    Parameters
    ----------
    count : int
        Number of patterns to load from the top of the file.
    path : str
        Location of the pattern file.
    """

    def __init__(self, count=1000, path='data/DUT_BEL.t2m1l3.llh.top1000'):
        # Constructor arguments are stored under their own names because
        # BaseEstimator.get_params (used by clone and repr) reads them back.
        self.count = count
        self.path = path

        self.llh_1000 = []
        with open(path, 'r') as f:
            for n, line in enumerate(f):
                # Stop before appending, so that exactly `count` patterns are
                # loaded (appending first loaded one pattern too many).
                if n >= count:
                    break
                self.llh_1000.append(line.rstrip("\n").split("\t")[0])

    def llh_binary_countvectorizer(self, line):
        """Return a list with 1 for every loaded pattern found in `line`
        and 0 for every other pattern."""
        # NOTE(review): `in` is a plain substring test, so a pattern also
        # matches inside a longer word - confirm that this is intended.
        return [1 * (k in line) for k in self.llh_1000]

    def transform(self, df, y=None):
        """Return one binary pattern vector (a list) per line in `df`."""
        return [self.llh_binary_countvectorizer(l) for l in df]

    def fit(self, df, y=None):
        """Nothing is learned from the data; returns self."""
        return self

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

if do_run("svc3"):

    register_run(
        "svc3",
        "binary loglikelihood-based vectors on word n-grams, used in a linear support vector classifier",
        ['count'])

    # Sweep over the number of loglikelihood patterns used as features.
    for count in tqdm(range(1, 10000, 500), desc="# llh counts"):
        llh_features = FeatureUnion([('llh', LLHbasedBinaryVectorizer(count=count))])

        pipeline = Pipeline([
            ('union', llh_features),
            ('svc', SVC(kernel='linear')),
        ])

        model = pipeline.fit(X_training, y_training)
        y_pred = model.predict(X_dev)

        update_stats("svc3", y_dev, y_pred, {'count': count})

            
       

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

if do_run("mnb2"):

    register_run(
        "mnb2",
        "binary loglikelihood-based vectors on word n-grams, used in a multinominal naive bayes setting",
        ['count'])

    # Sweep over the number of loglikelihood patterns used as features.
    for count in tqdm(range(1, 10000, 500), desc="# llh counts"):
        llh_features = FeatureUnion([('llh', LLHbasedBinaryVectorizer(count=count))])

        pipeline = Pipeline([
            ('union', llh_features),
            ('mnb', MultinomialNB()),
        ])

        model = pipeline.fit(X_training, y_training)
        y_pred = model.predict(X_dev)

        update_stats("mnb2", y_dev, y_pred, {'count': count})



In [19]:
from sklearn.feature_extraction.text import CountVectorizer

if do_run("knn2"):

    register_run(
        "knn2",
        "binary loglikelihood-based vectors on word n-grams, used in a k nearest neighbours setting",
        ['neighbours', 'count'])

    # Sweep over 1 to 6 neighbours and over the number of loglikelihood
    # patterns used as features.
    for neighbours in tnrange(1, 7, desc="neighbours"):
        for count in tqdm(range(1, 10000, 500), desc="# llh counts"):
            settings = {'neighbours': neighbours, 'count': count}

            llh_features = FeatureUnion([('llh', LLHbasedBinaryVectorizer(count=count))])

            pipeline = Pipeline([
                ('union', llh_features),
                ('knn', KNeighborsClassifier(n_neighbors=neighbours)),
            ])

            model = pipeline.fit(X_training, y_training)
            y_pred = model.predict(X_dev)

            update_stats("knn2", y_dev, y_pred, settings)

        


In [20]:
from sklearn.ensemble import RandomForestClassifier

if do_run("rfo1"):

    register_run("rfo1",
             "binary loglikelihood-based vectors on word n-grams, used in a random forest setting",
             ['count'])

    # Sweep over the number of loglikelihood patterns used as features.
    for count in tqdm(range(1, 10000, 500), desc="# llh counts"):

        steps = [('llh', LLHbasedBinaryVectorizer(count=count))]

        union = FeatureUnion(steps)

        # A fixed seed makes the forest, and therefore the reported best
        # setting, reproducible between runs. The MLP run is seeded the same way.
        pipeline = Pipeline([
            ('union', union),
            ('rfo', RandomForestClassifier(random_state=1)),
        ])

        model = pipeline.fit(X_training, y_training)
        y_pred = model.predict(X_dev)

        update_stats("rfo1", y_dev, y_pred, {'count': count})

        


# mlp
The multilayer perceptron (MLP) is sensitive to feature scaling.

In [21]:
from sklearn.neural_network import MLPClassifier

if do_run("mlp1"):
    register_run(
        "mlp1",
        "character and word n-grams, with tf-idf feature vectors, used in a multilayer perceptron (adam) setting",
        ['alpha', 'hls', 'min_cn', 'max_cn', 'min_n', 'max_n'])

    # Exhaustive sweep: regularisation strengths 1e-1 down to 1e-6, a handful
    # of hidden layer layouts, and every character/word n-gram range.
    for alpha in tqdm(10.0 ** -np.arange(1, 7), desc="alpha"):
        for hls in tqdm([(5,2), (5,5), (10,2), (10,5), (50,2), (50,5), (50,10)], desc="hls"):
            for min_cn in tnrange(1, 8, desc="min char ngram"):
                for max_cn in tnrange(min_cn, 8, desc="max char ngram"):
                    for min_n in tnrange(1, 6, desc="min word ngram"):
                        for max_n in tnrange(min_n, 6, desc="max word ngram"):
                            settings = {'alpha': alpha, 'hls': hls,
                                        'min_cn': min_cn, 'max_cn': max_cn,
                                        'min_n': min_n, 'max_n': max_n}

                            char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(min_cn, max_cn))
                            word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(min_n, max_n),
                                                         token_pattern=u"(?u)\\b\\w+\\b")
                            perceptron = MLPClassifier(solver='adam', alpha=alpha,
                                                       hidden_layer_sizes=hls, random_state=1)

                            pipeline = Pipeline([
                                ('union', FeatureUnion([('char', char_tfidf), ('words', word_tfidf)])),
                                ('mlp1', perceptron),
                            ])

                            model = pipeline.fit(X_training, y_training)
                            y_pred = model.predict(X_dev)

                            update_stats("mlp1", y_dev, y_pred, settings)


In [22]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

if do_run("xgb1"):
    # Imported by name from the same module as GridSearchCV. The earlier
    # `from sklearn.cross_validation import *` pulled in the old API and
    # silently replaced names such as KFold and cross_val_score.
    from sklearn.model_selection import StratifiedKFold

    register_run("xgb1",
             "character and word n-grams, with tf-idf feature vectors, used in a extreme gradient boost setting",
             [])

    xgb_model = xgb.XGBClassifier()

    # Single-valued grid: GridSearchCV is only used for its cross-validation.
    parameters = {'nthread':[1], #when use hyperthread, xgboost may become slower
                  'objective':['binary:logistic'],
                  'learning_rate': [0.05], #so called `eta` value
                  'max_depth': [6],
                  'min_child_weight': [11],
                  'silent': [1],
                  'subsample': [0.8],
                  'colsample_bytree': [0.7],
                  'n_estimators': [1000], #number of trees, change it to 1000 for better results
                  'missing':[-999],
                  'seed': [1337]}

    # The splitter receives the labels when `clf.fit` is called, so it no
    # longer indexes `y_training` with `use`, which is out of range once the
    # training subset has been applied.
    clf = GridSearchCV(xgb_model, parameters, n_jobs=5,
                       cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1337),
                       scoring='accuracy',
                       verbose=2, refit=True)

    # TODO: fitting is still disabled, so this run records no statistics.
    # NOTE(review): X_training holds raw sentences, so a vectoriser step is
    # presumably needed before the classifier can be fitted - confirm.
    #bst = clf.fit(X_training, y_training)
    #xgb_model = xgb.XGBClassifier(nthread=1, objective='binary:logistic', learning_rate=0.05, max_depth=6, min_child_weight=11, missing=-999, n_estimators=1000, subsample=0.8, colsample_bytree=0.7)

    #xgb.plot_importance(xgb_model)
    #plt.show()

    #update_stats("xgb1", y_dev, y_pred, {})

In [23]:
import six
from abc import ABCMeta
import numpy as np
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer
from sklearn.svm import LinearSVC

class NBSVM(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)):
    """Linear SVM on features scaled by per-class log-count ratios.

    For every class a log-ratio is computed per feature from the smoothed
    feature totals of that class. The input is multiplied element-wise by
    these ratios and one `LinearSVC` is trained per class against the
    remaining classes. Prediction picks the class whose SVM gives the highest
    decision value.

    Parameters
    ----------
    alpha : float
        Smoothing value added to every per-class feature total.
    C : float
        Regularisation parameter passed to each `LinearSVC`.
    max_iter : int
        Iteration limit passed to each `LinearSVC`.
    """

    def __init__(self, alpha=1.0, C=1.0, max_iter=10000):
        self.alpha = alpha
        self.max_iter = max_iter
        self.C = C
        self.svm_ = []  # one fitted LinearSVC per class, filled by fit()

    def fit(self, X, y):
        """Fit one ratio-scaled LinearSVC per class; returns self.

        `X` must be non-negative (sparse or dense), `y` holds the labels.
        """
        X, y = check_X_y(X, y, 'csr')
        # check_X_y leaves dense input dense, but the scaling below relies on
        # the sparse `.multiply` method.
        X = sparse.csr_matrix(X)
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            # Two classes give a single indicator column; add its complement
            # so that both classes get a column of their own.
            Y = np.concatenate((1 - Y, Y), axis=1)

        # The indicator matrix is cast to floating point before it is used in
        # the dot product with X.
        Y = Y.astype(np.float64)

        # Count raw events from data
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                                 dtype=np.float64)
        self._compute_ratios(X, Y)

        # Start from an empty list on every fit. Otherwise a refit appends to
        # the old models and predict() keeps using the stale first ones.
        self.svm_ = []
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            svm = LinearSVC(C=self.C, max_iter=self.max_iter)
            Y_i = Y[:,i]
            svm.fit(X_i, Y_i)
            self.svm_.append(svm)

        return self

    def predict(self, X):
        """Return, for every row of `X`, the class whose SVM gives the
        highest decision value."""
        X = sparse.csr_matrix(X)  # accept dense input as well
        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        D = np.zeros((n_effective_classes, n_examples))

        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            D[i] = self.svm_[i].decision_function(X_i)

        return self.classes_[np.argmax(D, axis=0)]

    def _compute_ratios(self, X, Y):
        """Count feature occurrences and compute ratios.

        Turns `self.ratios_` (pre-filled with `alpha`) into a sparse matrix
        with one row of log(p / (1 - p)) values per class, where p is the
        L1-normalised smoothed feature total of the class.
        """
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative")

        self.ratios_ += safe_sparse_dot(Y.T, X)  # ratio + feature_occurrance_c
        # Use the returned array instead of relying on in-place modification.
        self.ratios_ = normalize(self.ratios_, norm='l1', axis=1, copy=False)
        self.ratios_ = np.log(self.ratios_ / (1 - self.ratios_))
        # Raises when the ratios contain NaN or infinite values.
        check_array(self.ratios_)
        self.ratios_ = sparse.csr_matrix(self.ratios_)

In [24]:
from sklearn.neural_network import MLPClassifier

if do_run("nbs1"):
    register_run(
        "nbs1",
        "character and word n-grams, with tf-idf feature vectors, used in a naive bayes/svm setting",
        ['C', 'min_cn', 'max_cn', 'min_n', 'max_n'])

    # Exhaustive sweep: three values of C, combined with every character
    # n-gram range up to 7 and every word n-gram range up to 5.
    for C in tqdm([0.01, 0.1, 1.0], desc="C"):
        for min_cn in tnrange(1, 8, desc="min char ngram"):
            for max_cn in tnrange(min_cn, 8, desc="max char ngram"):
                for min_n in tnrange(1, 6, desc="min word ngram"):
                    for max_n in tnrange(min_n, 6, desc="max word ngram"):
                        settings = {'C': C, 'min_cn': min_cn, 'max_cn': max_cn,
                                    'min_n': min_n, 'max_n': max_n}

                        char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(min_cn, max_cn))
                        word_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(min_n, max_n),
                                                     token_pattern=u"(?u)\\b\\w+\\b")

                        pipeline = Pipeline([
                            ('union', FeatureUnion([('char', char_tfidf), ('words', word_tfidf)])),
                            ('nbs', NBSVM(C=C)),
                        ])

                        model = pipeline.fit(X_training, y_training)
                        y_pred = model.predict(X_dev)

                        update_stats("nbs1", y_dev, y_pred, settings)


In [25]:
# Report the best settings, accuracy and confusion matrix of every registered run.
summarise_all()