In [1]:
# import os
# from hw2_corpus_tool import *
# import pycrfsuite
# import random
# import shutil

# train_data_path = './train/train'

In [2]:
# train_data = set(os.listdir(train_data_path))
# test_sample = random.sample(train_data, int(len(train_data)*0.25))
# dev_sample = train_data-set(test_sample)

# for file_name in dev_sample:
#     path = os.path.join(train_data_path, file_name)
#     shutil.copy(path, './dev')
    
# for file_name in test_sample:
#     path = os.path.join(train_data_path, file_name)
#     shutil.copy(path, './test')

In [3]:
import os
from hw2_corpus_tool import *
import pycrfsuite
import random

# NOTE: the model is trained on './dev'. Per the commented-out split cell above,
# './dev' received the ~75% of files not sampled for testing and './test' the
# randomly sampled ~25%.
train_path = './dev'
test_path = './test'

In [4]:
def get_features(utterance, first_utterance, last_speaker):
    """Build the baseline feature list for a single utterance.

    Args:
        utterance: object exposing ``speaker``, ``act_tag`` and ``pos``
            (an iterable of ``(token, pos_tag)`` pairs, or a falsy value
            when the utterance has no tagged tokens).
        first_utterance: truthy when this is the first utterance of its
            conversation.
        last_speaker: speaker of the preceding utterance, or a falsy value
            when there is none.

    Returns:
        A ``(features, speaker, act_tag)`` tuple: the feature strings, the
        utterance's speaker (so the caller can pass it as ``last_speaker``
        for the next utterance) and the utterance's dialogue-act label.
    """
    speaker = utterance.speaker
    feature_list = []

    # A change can only be detected when a previous speaker is known.
    speaker_changed = bool(last_speaker) and speaker != last_speaker
    if speaker_changed:
        feature_list.append('SPEAKER_CHANGE')

    if first_utterance:
        feature_list.append('FIRST_UTTERANCE')

    # A falsy ``pos`` (None or empty) contributes no token/POS features.
    for word, tag in utterance.pos or ():
        feature_list.extend(('TOKEN_' + word, 'POS_' + tag))

    return feature_list, speaker, utterance.act_tag

In [24]:
def create_dataset(path, advanced=False):
    """Turn the conversations under ``path`` into CRF feature/label sequences.

    Args:
        path: location handed to ``get_data`` to load the conversations.
        advanced: when truthy, features come from ``get_advanced_features``;
            otherwise from ``get_features``.

    Returns:
        ``(x, y)`` where ``x[i]`` is the list of per-utterance feature lists
        for conversation ``i`` and ``y[i]`` is the matching list of labels.
    """
    all_features, all_labels = [], []

    for dialogue in get_data(path):
        extract = get_advanced_features if advanced else get_features
        dialogue_features, dialogue_labels = [], []
        # No previous speaker exists at the start of each conversation.
        previous_speaker = None

        for position, utterance in enumerate(dialogue):
            # The extractor returns this utterance's speaker, which becomes
            # the "previous speaker" for the next utterance.
            features, previous_speaker, label = extract(
                utterance, position == 0, previous_speaker
            )
            dialogue_features.append(features)
            dialogue_labels.append(label)

        all_features.append(dialogue_features)
        all_labels.append(dialogue_labels)

    return all_features, all_labels

# Build the baseline (non-advanced) feature sets. The train / predict /
# accuracy calls that follow read x_train, y_train, x_test and y_test, so these
# assignments must run for the notebook to work from a fresh kernel.
x_train, y_train = create_dataset(train_path)
x_test, y_test = create_dataset(test_path)

In [6]:
def train(x_train, y_train, model_path='baseline_tagger.crfsuite'):
    """Train a CRF tagger with pycrfsuite and write the model to disk.

    Args:
        x_train: list of conversations, each a list of per-utterance
            feature lists.
        y_train: list of label sequences aligned with ``x_train``.
        model_path: file the trained model is written to. Defaults to
            ``'baseline_tagger.crfsuite'``; pass a different path to keep
            several models (e.g. baseline and advanced) side by side instead
            of overwriting one file.

    Returns:
        None. The only result is the model file at ``model_path``.
    """
    trainer = pycrfsuite.Trainer(verbose=False)

    # One training sequence per conversation.
    for features, labels in zip(x_train, y_train):
        trainer.append(features, labels)

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train(model_path)

# Fit the model on the training features/labels. x_train and y_train must
# already exist in the kernel (they are produced by create_dataset).
train(x_train, y_train)

In [7]:
def predict(x_test, model_path='baseline_tagger.crfsuite'):
    """Tag every conversation in ``x_test`` with a trained CRF model.

    Args:
        x_test: list of conversations, each a list of per-utterance
            feature lists.
        model_path: model file to load. Defaults to
            ``'baseline_tagger.crfsuite'``, the default output of ``train``.

    Returns:
        A list with one predicted label sequence per conversation, in the
        same order as ``x_test``.
    """
    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    try:
        return [tagger.tag(sequence) for sequence in x_test]
    finally:
        # Close the model even if tagging fails part-way through.
        tagger.close()

# Tag every test conversation with the model file written by train().
y_pred = predict(x_test)

In [8]:
def calculate_accuracy(y_pred, y_test):
    """Compute label-level accuracy over all conversations.

    Args:
        y_pred: list of predicted label sequences, one per conversation.
        y_test: list of gold label sequences aligned with ``y_pred``.

    Returns:
        The number of positions where prediction and gold label agree,
        divided by the total number of gold labels. Returns ``0.0`` when
        ``y_test`` contains no labels at all, instead of raising
        ``ZeroDivisionError``.
    """
    total = sum(len(gold_seq) for gold_seq in y_test)
    if total == 0:
        # Nothing to score: avoid dividing by zero.
        return 0.0

    correct = sum(
        1
        for pred_seq, gold_seq in zip(y_pred, y_test)
        for predicted, gold in zip(pred_seq, gold_seq)
        if predicted == gold
    )
    return correct / total

# Share of gold test labels that the predicted tag sequences match.
calculate_accuracy(y_pred, y_test)

0.7274062620796289

In [45]:

def get_advanced_features(utterance, first_utterance, last_speaker):
    """Build the extended feature list for a single utterance.

    On top of the speaker-change, first-utterance and per-token TOKEN_/POS_
    features, this adds start/end-of-utterance markers for the first and
    last token, token and POS bigrams over adjacent pairs, and a ``NO_WORD``
    marker when the utterance has no tagged tokens.

    Args:
        utterance: object exposing ``speaker``, ``act_tag`` and ``pos``.
            Each ``pos`` entry must unpack into ``(token, pos_tag)`` and
            also expose ``.token`` / ``.pos`` attributes.
        first_utterance: truthy when this is the first utterance of its
            conversation.
        last_speaker: speaker of the preceding utterance, or a falsy value
            when there is none.

    Returns:
        A ``(features, speaker, act_tag)`` tuple.
    """
    speaker, label, tagged = utterance.speaker, utterance.act_tag, utterance.pos
    out = []

    if last_speaker and speaker != last_speaker:
        out.append('SPEAKER_CHANGE')
    if first_utterance:
        out.append('FIRST_UTTERANCE')

    # Utterances without tagged tokens get an explicit marker and nothing else.
    if not tagged:
        out.append('NO_WORD')
        return out, speaker, label

    final_index = len(tagged) - 1
    for index, (word, tag) in enumerate(tagged):
        # A single-token utterance is both the start and the end.
        if index == 0:
            out += ['SOS_TOKEN_' + word, 'SOS_POS_' + tag]
        if index == final_index:
            out += ['EOS_TOKEN_' + word, 'EOS_POS_' + tag]
        out += ['TOKEN_' + word, 'POS_' + tag]

    # Adjacent pairs: (0, 1), (1, 2), ..., (final_index - 1, final_index).
    for index in range(final_index):
        left, right = tagged[index], tagged[index + 1]
        out.append("BIGRAM_{}_{}".format(left.token, right.token))
        out.append("BIGRAM_POS_{}_{}".format(left.pos, right.pos))

    return out, speaker, label

In [47]:
# End-to-end run with the advanced feature set: rebuild both splits, retrain,
# tag the test split, then print the accuracy followed by the elapsed
# wall-clock time in seconds.
import time
st = time.time()
x_train, y_train = create_dataset(train_path, advanced=True)
x_test, y_test = create_dataset(test_path, advanced=True)
# train() is called without a custom path, so it writes to
# 'baseline_tagger.crfsuite' and replaces any baseline model saved there.
train(x_train, y_train)
y_pred = predict(x_test)
print(calculate_accuracy(y_pred, y_test))
print(time.time()-st)

0.7541553923463471
330.89393973350525


In [10]:
# Take the first conversation yielded for the training directory, for inspection.
element = next(get_data(train_path))

In [14]:
# POS-tagged tokens of that conversation's first utterance.
sent = element[0].pos

In [16]:
# Everything after the first token: the right-hand element of each bigram pair.
sent[1:]

[PosTag(token='are', pos='VBP'),
 PosTag(token='your', pos='PRP$'),
 PosTag(token='favorite', pos='JJ'),
 PosTag(token='programs', pos='NNS'),
 PosTag(token='?', pos='.')]

In [19]:
# Token bigrams over adjacent pairs, joined with "_".
# NOTE(review): the saved output for `bigrams` shows "-" separators, which this
# "_" format cannot produce; it appears to come from an earlier version of this
# cell. Re-run to refresh the output.
bigrams = ["{}_{}".format(x.token, y.token) for (x, y) in zip(sent[:-1], sent[1:])]

In [20]:
# Display the token bigrams built in the previous cell.
bigrams

['What-are', 'are-your', 'your-favorite', 'favorite-programs', 'programs-?']