# Assignment 2

Date: 29-10-2020 <br>
Nick Radunovic (s2072724) <br>
Cheyenne Heath (s1647865) <br>

Tasks:
1. Download W-NUT_data.zip from the Brightspace assignment and unzip the directory. It
contains 3 IOB files: wnut17train.conll (train), emerging.dev.conll (dev),
emerging.test.annotated (test)
2. The IOB files do not contain POS tags yet. Add a function to your CRFsuite script that reads
the IOB files and adds POS tags (using an existing package for linguistic processing such as
Spacy or NLTK). The data needs to be stored in the same way as the benchmark data from
the tutorial (an array of triples (word,pos,biotag)).
3. Run a baseline run (train -> test) with the features directly copied from the tutorial.
4. Set up hyperparameter optimization using the dev set and evaluate the result on the test set.
5. Extend the features: add a larger context (-2 .. +2 or more) and engineer a few other features
that might be relevant for this task. Have a look at the train/dev data to get inspiration on
potentially relevant papers.
6. Experiment with the effect of different feature sets on the quality of the labelling.

In [None]:
#Imports
import random
random.seed(30) # set random seed for reproducibility

import numpy as np
from itertools import chain
from collections import Counter
import eli5

import nltk
nltk.download('averaged_perceptron_tagger')
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

Function to read in the data per sentence. 

In [None]:
def parse_data(file):
    """Read a tab-separated IOB file into a list of sentences.

    Each non-blank line becomes a tuple of its tab-separated fields.
    Blank or whitespace-only lines separate sentences.

    Args:
        file: Path of the IOB file to read (decoded as UTF-8).

    Returns:
        A list of sentences, each a non-empty list of field tuples.
    """
    sents = []
    new_sent = []
    with open(file, encoding='utf-8') as fp:
        for line in fp:
            new_line = line.strip()
            if not new_line:
                # Separator line: close the current sentence. Consecutive
                # separators must not produce empty sentences.
                if new_sent:
                    sents.append(new_sent)
                    new_sent = []
            else:
                new_sent.append(tuple(new_line.split('\t')))
    # A file that does not end with a blank line still has a pending sentence.
    if new_sent:
        sents.append(new_sent)
    return sents

In [None]:
# Read the train, dev and test splits as lists of sentences.
train_sents, dev_sents, test_sents = [
    parse_data(path)
    for path in (
        'wnut17train.conll',
        'emerging.dev.conll',
        'emerging.test.annotated',
    )
]

Creating a function that transforms data of the form (word,pos) to the form (word,pos,biotag) for the words of each sentence.

In [None]:
def add_POS_tag(word_pos, sent):
    """Extend every entry of *word_pos* with the second field of the
    token at the same position in *sent*.

    Args:
        word_pos: Sequence of tagged tokens, e.g. (word, pos) pairs.
        sent: The parsed sentence; field 1 of each token is appended.

    Returns:
        A list of tuples: the fields of each *word_pos* entry followed by
        the matching ``sent[idx][1]`` value.
    """
    return [(*pair, sent[idx][1]) for idx, pair in enumerate(word_pos)]

We now add POS tags to the words of each sentence, storing the data in the format: (word,pos,biotag).
Note that nltk's pos_tag function takes the whole sentence as input and assigns a POS tag to each word based on both the word itself and the context in which it appears.

In [None]:
# POS-tag every sentence of every split; this can take a few minutes.
# pos_tag receives all words of a sentence at once, and add_POS_tag then
# re-attaches the original second field of each token.
train_sents, dev_sents, test_sents = [
    [
        add_POS_tag(nltk.pos_tag([token[0] for token in sent]), sent)
        for sent in split
    ]
    for split in (train_sents, dev_sents, test_sents)
]

3. Run a baseline run (train -> test) with the features directly copied from the tutorial.

In [None]:
def word2features(sent, i):
    """Build the baseline feature dict for token *i* of *sent*.

    Args:
        sent: Sequence of tokens; field 0 is the word, field 1 the POS tag.
        i: Index of the token to describe.

    Returns:
        A dict with features of the token itself, of its direct
        neighbours (prefixed '-1:' / '+1:'), and 'BOS' / 'EOS' markers
        when there is no previous / next token.
    """
    def neighbour(prefix, position):
        # Word-shape and POS features of the token at *position*.
        other_word = sent[position][0]
        other_tag = sent[position][1]
        return {
            prefix + ':word.lower()': other_word.lower(),
            prefix + ':word.istitle()': other_word.istitle(),
            prefix + ':word.isupper()': other_word.isupper(),
            prefix + ':postag': other_tag,
            prefix + ':postag[:2]': other_tag[:2],
        }

    token = sent[i][0]
    tag = sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    if i > 0:
        feats.update(neighbour('-1', i - 1))
    else:
        feats['BOS'] = True

    if i < len(sent) - 1:
        feats.update(neighbour('+1', i + 1))
    else:
        feats['EOS'] = True

    return feats

def sent2features(sent):
    """Return the baseline feature dict of every token in *sent*, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the third field (the label) of every (token, postag, label) triple."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the first field (the token) of every (token, postag, label) triple."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [None]:
# Baseline feature dicts (X) and gold labels (y) for each split.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_dev = list(map(sent2features, dev_sents))
y_dev = list(map(sent2labels, dev_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [None]:
# Baseline model: L-BFGS training with fixed regularisation, fitted on the
# training split.
baseline_settings = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**baseline_settings)
crf.fit(X_train, y_train)

In [None]:
# Labels the fitted model knows, without the 'O' label, so that the
# reports and scorers below only cover the remaining labels.
labels = list(crf.classes_)
labels.remove('O')  # raises ValueError if 'O' is not among the classes
labels

In [None]:
# Evaluate the baseline model on the test split.
y_pred = crf.predict(X_test)

# Sort on everything after the first character, then on the first character,
# so labels that differ only in their first character end up adjacent.
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    digits=3,
)
print(report)

Set up hyperparameter optimization using the dev set and evaluate the result on the test set.

In [None]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
# c1 / c2 are sampled from exponential distributions
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
# random_state makes the sampled c1/c2 candidates reproducible: seeding the
# `random` module does not affect the scipy distributions used above.
# NOTE(review): this runs 3-fold cross-validation on the training split; the
# dev split is not used here -- confirm this matches the intended setup.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=30)
rs.fit(X_train, y_train)

In [None]:
# Summarise the search result and score its best model on the test split.
best_model = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(best_model.size_ / 1000000))

print("\nweighted avg:")
crf = best_model
y_pred = crf.predict(X_test)
full_report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)
weighted_avg = full_report['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))

#### Extend the features and perform some experiments

This is the extended word2features function. It covers a larger context window (-3 to +3) and adds 'word.isdigit()' for the context words. The new feature 'word.starts_with_uppercase' is present in the code but currently commented out.

In [None]:
def word2features_extended(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        #'word.starts_with_uppercase': word[:1].isupper(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 2:
        word3 = sent[i-3][0]
        postag3 = sent[i-3][1]
        features.update({
            '-3:word.lower()': word3.lower(),
            '-3:word.istitle()': word3.istitle(),
            '-3:word.isupper()': word3.isupper(),
            '-3:word.isdigit()': word3.isdigit(),
            '-3:postag': postag3,
            '-3:postag[:2]': postag3[:2],
        })
    if i > 1:
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.isdigit()': word2.isdigit(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
        })
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            #'word.starts_with_uppercase': word1[:1].isupper(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-3:
        word3 = sent[i+3][0]
        postag3 = sent[i+3][1]
        features.update({
            '+3:word.lower()': word3.lower(),
            '+3:word.istitle()': word3.istitle(),
            '+3:word.isupper()': word3.isupper(),
            '+3:word.isdigit()': word3.isdigit(),
            '+3:postag': postag3,
            '+3:postag[:2]': postag3[:2],
        })
    if i < len(sent)-2:
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.isdigit()': word2.isdigit(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features_extended(sent):
    """Return the extended (-3..+3 context) feature dict of every token in *sent*."""
    return [
        word2features_extended(sent, position)
        for position, _ in enumerate(sent)
    ]

In [None]:
# Extended features and gold labels for the dev and test splits.
X_dev = list(map(sent2features_extended, dev_sents))
y_dev = list(map(sent2labels, dev_sents))

X_test = list(map(sent2features_extended, test_sents))
y_test = list(map(sent2labels, test_sents))

In [None]:
# Extended feature set with fixed regularisation.
# NOTE(review): the model is fitted on the dev split (X_dev), not on the
# training split -- confirm this is intended.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_dev, y_dev)

y_pred = crf.predict(X_test)

# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print("weighted avg:")
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))

#Include gridsearch on dev set
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
# random_state makes the sampled c1/c2 candidates reproducible: seeding the
# `random` module does not affect the scipy distributions used above.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=10,
                        scoring=f1_scorer,
                        random_state=30)
rs.fit(X_dev, y_dev)

print('best params:', rs.best_params_)
print("\nweighted avg:")
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))

# Per-label report; sorted_labels from above is still valid because
# `labels` has not changed.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

**Try featureset 3:**

Adding:
- range -2 .. + 2
- presence in a list of common names in America (source: https://www.ssa.gov/oact/babynames/limits.html)
- contains a hyphen ('-')
- is '@'



In [None]:
_EXP3_NAMES_CACHE = {}


def _load_exp3_names(path='yob2010.txt'):
    """Load the first column of the names file once and cache it as a frozenset."""
    if path not in _EXP3_NAMES_CACHE:
        raw = np.genfromtxt(path, delimiter=",", usecols=(0), dtype='unicode')
        # atleast_1d: a single-line file would otherwise yield a 0-d array.
        _EXP3_NAMES_CACHE[path] = frozenset(np.atleast_1d(raw).tolist())
    return _EXP3_NAMES_CACHE[path]


def _as_name_set(names):
    """Return *names* (set, list or array) as a set for O(1) membership tests."""
    if isinstance(names, (set, frozenset)):
        return names
    if hasattr(names, 'tolist'):
        names = names.tolist()
    if isinstance(names, str):
        names = [names]
    return set(names)


def _exp3_context(prefix, word, postag, names=None):
    """Features of a context token, keyed with *prefix*.

    'in_names' is only emitted when *names* is given.
    """
    feats = {
        prefix + ':word.lower()': word.lower(),
        prefix + ':word.istitle()': word.istitle(),
        prefix + ':word.isupper()': word.isupper(),
        prefix + ':word.isdigit()': word.isdigit(),
        prefix + ':postag': postag,
        prefix + ':postag[:2]': postag[:2],
    }
    if names is not None:
        feats[prefix + ':in_names'] = word in names
    feats[prefix + ':is_hypen'] = '-' in word
    feats[prefix + ':is_at'] = '@' in word
    return feats


def word2features_exp3(sent, i, names):
    """Build feature set 3 for token *i*: -2..+2 context plus name-list,
    hyphen and '@' indicators.

    Args:
        sent: Sequence of tokens; field 0 is the word, field 1 the POS tag.
        i: Index of the token to describe.
        names: Collection of known first names (set, list or numpy array).

    Returns:
        A dict of features. 'is_hypen' is True when the word contains '-',
        'is_at' when it contains '@' (the original had these two swapped).
        The key spellings are kept unchanged.
    """
    names = _as_name_set(names)
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'in_names': word in names,
        'is_hypen': '-' in word,
        'is_at': '@' in word,
    }

    if i > 1:
        features.update(_exp3_context('-2', sent[i-2][0], sent[i-2][1], names))
    if i > 0:
        features.update(_exp3_context('-1', sent[i-1][0], sent[i-1][1], names))
    else:
        features['BOS'] = True

    if i < len(sent)-2:
        features.update(_exp3_context('+2', sent[i+2][0], sent[i+2][1], names))
    if i < len(sent)-1:
        features.update(_exp3_context('+1', sent[i+1][0], sent[i+1][1], names))
    else:
        features['EOS'] = True

    return features


def sent2features_exp3(sent, names=None):
    """Return feature set 3 for every token in *sent*.

    Args:
        sent: Sequence of (word, pos, label) tokens.
        names: Optional collection of known first names. When omitted, the
            names are loaded from 'yob2010.txt' once and reused for later
            calls instead of re-reading the file for every sentence.
    """
    if names is None:
        names = _load_exp3_names()
    names = _as_name_set(names)
    return [word2features_exp3(sent, i, names) for i in range(len(sent))]

In [None]:
# Feature set 3: features and gold labels for the dev and test splits.
X_dev = [sent2features_exp3(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

X_test = [sent2features_exp3(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

# NOTE(review): the model is fitted on the dev split (X_dev), not on the
# training split -- confirm this is intended.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_dev, y_dev)

y_pred = crf.predict(X_test)

# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print('experiment with feature set 3')
print("weighted avg:")
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))

#Including gridsearch
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
# random_state makes the sampled c1/c2 candidates reproducible: seeding the
# `random` module does not affect the scipy distributions used above.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=10,
                        scoring=f1_scorer,
                        random_state=30)
rs.fit(X_dev, y_dev)

print('best params:', rs.best_params_)
print("\nweighted avg:")
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))

# Per-label report; sorted_labels from above is still valid because
# `labels` has not changed.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Try featureset 4:
Same as above, but with a context of 3 words, and with lower added back in

In [None]:
_EXP3_NAMES_CACHE = {}


def _load_exp3_names(path='yob2010.txt'):
    """Load the first column of the names file once and cache it as a frozenset."""
    if path not in _EXP3_NAMES_CACHE:
        raw = np.genfromtxt(path, delimiter=",", usecols=(0), dtype='unicode')
        # atleast_1d: a single-line file would otherwise yield a 0-d array.
        _EXP3_NAMES_CACHE[path] = frozenset(np.atleast_1d(raw).tolist())
    return _EXP3_NAMES_CACHE[path]


def _as_name_set(names):
    """Return *names* (set, list or array) as a set for O(1) membership tests."""
    if isinstance(names, (set, frozenset)):
        return names
    if hasattr(names, 'tolist'):
        names = names.tolist()
    if isinstance(names, str):
        names = [names]
    return set(names)


def _exp3_context(prefix, word, postag, names=None):
    """Features of a context token, keyed with *prefix*.

    'in_names' is only emitted when *names* is given.
    """
    feats = {
        prefix + ':word.lower()': word.lower(),
        prefix + ':word.istitle()': word.istitle(),
        prefix + ':word.isupper()': word.isupper(),
        prefix + ':word.isdigit()': word.isdigit(),
        prefix + ':postag': postag,
        prefix + ':postag[:2]': postag[:2],
    }
    if names is not None:
        feats[prefix + ':in_names'] = word in names
    feats[prefix + ':is_hypen'] = '-' in word
    feats[prefix + ':is_at'] = '@' in word
    return feats


def word2features_exp3(sent, i, names):
    """Build feature set 4 for token *i*: -3..+3 context plus name-list,
    hyphen and '@' indicators.

    Args:
        sent: Sequence of tokens; field 0 is the word, field 1 the POS tag.
        i: Index of the token to describe.
        names: Collection of known first names (set, list or numpy array).

    Returns:
        A dict of features. 'is_hypen' is True when the word contains '-',
        'is_at' when it contains '@' (the original had these two swapped).
        The key spellings are kept unchanged.
    """
    names = _as_name_set(names)
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word[-2:]': word[-2:],
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'in_names': word in names,
        'is_hypen': '-' in word,
        'is_at': '@' in word,
    }

    # NOTE(review): as in the original, the -3 / +3 tokens get no 'in_names'
    # feature -- confirm whether that omission is intended.
    if i > 2:
        features.update(_exp3_context('-3', sent[i-3][0], sent[i-3][1]))
    if i > 1:
        features.update(_exp3_context('-2', sent[i-2][0], sent[i-2][1], names))
    if i > 0:
        features.update(_exp3_context('-1', sent[i-1][0], sent[i-1][1], names))
    else:
        features['BOS'] = True

    if i < len(sent)-3:
        features.update(_exp3_context('+3', sent[i+3][0], sent[i+3][1]))
    if i < len(sent)-2:
        features.update(_exp3_context('+2', sent[i+2][0], sent[i+2][1], names))
    if i < len(sent)-1:
        features.update(_exp3_context('+1', sent[i+1][0], sent[i+1][1], names))
    else:
        features['EOS'] = True

    return features


def sent2features_exp3(sent, names=None):
    """Return feature set 4 for every token in *sent*.

    Args:
        sent: Sequence of (word, pos, label) tokens.
        names: Optional collection of known first names. When omitted, the
            names are loaded from 'yob2010.txt' once and reused for later
            calls instead of re-reading the file for every sentence.
    """
    if names is None:
        names = _load_exp3_names()
    names = _as_name_set(names)
    return [word2features_exp3(sent, i, names) for i in range(len(sent))]

# Feature set 4: features and gold labels for the dev and test splits.
X_dev = [sent2features_exp3(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

X_test = [sent2features_exp3(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

# NOTE(review): the model is fitted on the dev split (X_dev), not on the
# training split -- confirm this is intended.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_dev, y_dev)

y_pred = crf.predict(X_test)

# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print('experiment with feature set 4')
print("weighted avg:")
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))


#Including gridsearch
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
# random_state makes the sampled c1/c2 candidates reproducible: seeding the
# `random` module does not affect the scipy distributions used above.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=10,
                        scoring=f1_scorer,
                        random_state=30)
rs.fit(X_dev, y_dev)

print('best params:', rs.best_params_)
print("\nweighted avg:")
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))

# Per-label report; sorted_labels from above is still valid because
# `labels` has not changed.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Because the baseline model does not distinguish the B- and I- categories very well, we looked at the data to see if we could find any patterns. We saw that prepositions and articles very commonly appear near the B- category, so we expect that they might have an influence.

- prepositions for place and time like: at, on & in
- articles  a, an & the

To check whether a word is a verb or a noun with the help of the nltk.corpus wordnet:
is_verb = (wn.synsets(word)[0].pos() == 'v')
is_noun = (wn.synsets(word)[0].pos() == 'n')


In [None]:
# WordNet is only referenced by the commented-out is_verb / is_noun features.
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
# Lower-case word lists used by the is_prepos / is_atricle features.
prepos = ['at', 'on', 'in']
articles = ['a','an','the']

def _prepos_article_context(prefix, word, postag):
    """Features of a context token, keyed with *prefix*."""
    lowered = word.lower()
    return {
        prefix + ':word.lower()': lowered,
        prefix + ':word.istitle()': word.istitle(),
        prefix + ':word.isupper()': word.isupper(),
        prefix + ':word.isdigit()': word.isdigit(),
        prefix + ':postag': postag,
        prefix + ':postag[:2]': postag[:2],
        # The word lists are lower-case, so compare the lower-cased word;
        # otherwise "The" or "In" would never match.
        prefix + ':is_prepos': lowered in prepos,
        prefix + ':is_atricle': lowered in articles,
    }


def word2features_exp3(sent, i):
    """Build the feature dict for token *i*: -3..+3 context plus
    preposition and article indicators.

    Args:
        sent: Sequence of tokens; field 0 is the word, field 1 the POS tag.
        i: Index of the token to describe.

    Returns:
        A dict of features. 'is_prepos' / 'is_atricle' (key spelling kept
        unchanged) are True when the lower-cased word is in the
        module-level `prepos` / `articles` lists.
    """
    word = sent[i][0]
    postag = sent[i][1]
    lowered = word.lower()

    features = {
        'bias': 1.0,
        'word.lower()': lowered,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'is_prepos': lowered in prepos,
        'is_atricle': lowered in articles,
    }

    # Left context, farthest token first.
    if i > 2:
        features.update(_prepos_article_context('-3', sent[i-3][0], sent[i-3][1]))
    if i > 1:
        features.update(_prepos_article_context('-2', sent[i-2][0], sent[i-2][1]))
    if i > 0:
        features.update(_prepos_article_context('-1', sent[i-1][0], sent[i-1][1]))
    else:
        features['BOS'] = True

    # Right context, farthest token first.
    if i < len(sent)-3:
        features.update(_prepos_article_context('+3', sent[i+3][0], sent[i+3][1]))
    if i < len(sent)-2:
        features.update(_prepos_article_context('+2', sent[i+2][0], sent[i+2][1]))
    if i < len(sent)-1:
        features.update(_prepos_article_context('+1', sent[i+1][0], sent[i+1][1]))
    else:
        features['EOS'] = True

    return features


def sent2features_exp3(sent):
    """Return the preposition/article feature dict of every token in *sent*.

    This feature set does not use the names list, so the names file is no
    longer read here (it used to be loaded, unused, for every sentence).
    """
    return [word2features_exp3(sent, i) for i in range(len(sent))]

# Preposition/article feature set: features and gold labels for dev and test.
X_dev = [sent2features_exp3(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

X_test = [sent2features_exp3(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

# NOTE(review): the model is fitted on the dev split (X_dev), not on the
# training split -- confirm this is intended.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_dev, y_dev)

y_pred = crf.predict(X_test)

# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

# This is the experiment after feature set 4; the label previously repeated
# "feature set 4", which made the two outputs indistinguishable.
print('experiment with feature set 5')
print("weighted avg:")
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))


#Including gridsearch
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
# random_state makes the sampled c1/c2 candidates reproducible: seeding the
# `random` module does not affect the scipy distributions used above.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=10,
                        scoring=f1_scorer,
                        random_state=30)
rs.fit(X_dev, y_dev)

print('best params:', rs.best_params_)
print("\nweighted avg:")
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for metric_name, metric_value in weighted_avg.items():
    print("%s: %s" % (metric_name, round(metric_value, 3)))

# Per-label report; sorted_labels from above is still valid because
# `labels` has not changed.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))