In [1]:
train = open('data/gikrya_new_train.out')
new_train = open('data/train_pos.out', 'w')

for line in train:
    if line[0].isdigit():
        i,word,lemma,pos, *_ = line.split('\t')
        new_train.write('\t'.join([word, lemma, pos]) + '\n')
    else:
        new_train.write(line)

new_train.close()

In [2]:
test = open('data/gikrya_new_test.out')
new_test = open('data/test_pos.out', 'w')

for line in test:
    if line[0].isdigit():
        i,word,lemma,pos, *_ = line.split('\t')
        new_test.write('\t'.join([word, lemma, pos]) + '\n')
    else:
        new_test.write(line)

new_test.close()

In [40]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [12]:
train = open('data/train_pos.out').read().split('\n\n')[:5000]
test = open('data/test_pos.out').read().split('\n\n')[:1000]

In [13]:
train_data = []
for sent in train:
    sent_list = [['<START>', '<START>', '<START>']]
    for line in sent.split('\n'):
        sent_list.append(line.split('\t'))
    
    train_data.append(sent_list)

test_data = []
for sent in test:
    sent_list = [['<START>', '<START>', '<START>']]
    for line in sent.split('\n'):
        sent_list.append(line.split('\t'))
    
    test_data.append(sent_list)

In [14]:
train_current_word = []
# train_current_lemma = []
train_previous_word = []
# train_previous_lemma = []
train_previous_pos = []
train_target = []

for sent in train_data:
    
    for i in range(1, len(sent)):
        current_w, current_l, target_pos = sent[i]
        previous_w, previous_l, previous_pos = sent[i-1]
        
        train_target.append(target_pos)
        train_current_word.append(current_w)
#         train_current_lemma.append(current_l)
        train_previous_word.append(previous_w)
#         train_previous_lemma.append(previous_l)
        train_previous_pos.append(previous_pos)
        
test_current_word = []
# test_current_lemma = []
test_previous_word = []
# test_previous_lemma = []
test_previous_pos = []
test_target = []

for sent in test_data:
    
    for i in range(1, len(sent)):
        current_w, current_l, target_pos = sent[i]
        previous_w, previous_l, previous_pos = sent[i-1]
        
        test_target.append(target_pos)
        test_current_word.append(current_w)
#         test_current_lemma.append(current_l)
        test_previous_word.append(previous_w)
#         test_previous_lemma.append(previous_l)
        test_previous_pos.append(previous_pos)  

In [8]:
train_target[:10]

['DET', 'NOUN', 'VERB', 'PRON', 'ADP', 'NOUN', 'PUNCT', 'PRON', 'PART', 'VERB']

In [15]:
# PREVIOUS POS ENCODING
lenc_prev_pos = LabelEncoder()
int_prev_pos_enc = lenc_prev_pos.fit_transform(train_previous_pos)
onehot_prev_pos = OneHotEncoder(sparse=True)
int_prev_pos = int_prev_pos_enc.reshape(len(int_prev_pos_enc), 1)

X_prev_pos_train = onehot_prev_pos.fit_transform(int_prev_pos)

int_prev_pos_enc_test = lenc_prev_pos.transform(test_previous_pos)
int_prev_pos_test = int_prev_pos_enc_test.reshape(
                                     len(int_prev_pos_enc_test),1)

X_prev_pos_test = onehot_prev_pos.transform(int_prev_pos_test)

# PREVIOUS and CURRENT WORD ENCODING
cv_word = CountVectorizer(ngram_range=(1,3), analyzer='char', max_features=3000)
cv_word.fit(train_previous_word + train_current_word)

X_prev_word_train = cv_word.transform(train_previous_word)
X_current_word_train = cv_word.transform(train_current_word)

X_prev_word_test = cv_word.transform(test_previous_word)
X_current_word_test = cv_word.transform(test_current_word)

# PREVIOUS and CURRENT LEMMA ENCODING
# cv_lemma = CountVectorizer(ngram_range=(1,1), max_features=10000)
# cv_lemma.fit(train_previous_lemma + train_current_lemma)

# X_prev_lemma_train = cv_lemma.transform(train_previous_lemma)
# X_current_lemma_train = cv_lemma.transform(train_current_lemma)

# X_prev_lemma_test = cv_lemma.transform(test_previous_lemma)
# X_current_lemma_test = cv_lemma.transform(test_current_lemma)

In [16]:
X_train = hstack([X_prev_pos_train, X_prev_word_train,
                  X_current_word_train])

In [17]:
print(X_train.shape)

(64888, 6014)


In [18]:
X_test = hstack([X_prev_pos_test, X_prev_word_test,
                  X_current_word_test])

In [19]:
print(X_test.shape)

(12745, 6014)


In [20]:
clf = LogisticRegression()
clf.fit(X_train, train_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
pred_pos = clf.predict(X_test)
print(classification_report(test_target, pred_pos))

             precision    recall  f1-score   support

        ADJ       0.92      0.90      0.91      1154
        ADP       0.96      0.99      0.97      1155
        ADV       0.89      0.87      0.88       814
       CONJ       0.86      0.95      0.90       752
        DET       0.88      0.89      0.88       490
          H       0.94      0.65      0.77        49
       INTJ       0.71      0.28      0.40        18
       NOUN       0.95      0.95      0.95      2857
        NUM       0.92      0.84      0.88       144
       PART       0.96      0.86      0.91       471
       PRON       0.91      0.92      0.91       911
      PUNCT       1.00      1.00      1.00      2139
       VERB       0.96      0.96      0.96      1791

avg / total       0.94      0.94      0.94     12745



In [30]:
# PREVIOUS POS ENCODING

pred_pos_fair = []
true_pos = []

def vectorize_word(word, prev_word, prev_pos):
    int_prev_pos_enc = lenc_prev_pos.transform(
                                   [prev_pos])
    int_prev_pos = int_prev_pos_enc.reshape(
                                         len(int_prev_pos_enc),1)

    X_prev_pos = onehot_prev_pos.transform(int_prev_pos)

    X_prev_word = cv_word.transform([prev_word])
    X_current_word = cv_word.transform([word])

#         X_prev_lemma = cv_lemma.transform([previous_l])
#         X_current_lemma = cv_lemma.transform([current_l])

    X = hstack([X_prev_pos, X_prev_word,
              X_current_word])

    pred = clf.predict(X)[0]
    
    return pred

for sent in test_data[:1000]:
    previous_pos = sent[0][2]
    pos_sequence = []
    
    for i in range(1, len(sent)):
        current_w, current_l, target_pos = sent[i]
        previous_w, previous_l, _ = sent[i-1]
        true_pos.append(target_pos)
        
        pred = vectorize_word(current_w, previous_w, previous_pos)
        previous_pos = pred
        pos_sequence.append(pred)
        
    pred_pos_fair += pos_sequence
        

In [32]:
def predict_pos(word, prev_word, prev_pos):
    int_prev_pos_enc = lenc_prev_pos.transform(
                                   [prev_pos])
    int_prev_pos = int_prev_pos_enc.reshape(
                                         len(int_prev_pos_enc),1)

    X_prev_pos = onehot_prev_pos.transform(int_prev_pos)

    X_prev_word = cv_word.transform([prev_word])
    X_current_word = cv_word.transform([word])

#         X_prev_lemma = cv_lemma.transform([previous_l])
#         X_current_lemma = cv_lemma.transform([current_l])

    X = hstack([X_prev_pos, X_prev_word,
              X_current_word])

    pred = clf.predict_proba(X)[0]
    
    return pred

In [36]:
clf.classes_[10]

'PRON'

In [31]:
print(classification_report(true_pos, pred_pos_fair))

             precision    recall  f1-score   support

        ADJ       0.92      0.90      0.91      1154
        ADP       0.96      0.99      0.97      1155
        ADV       0.90      0.87      0.88       814
       CONJ       0.86      0.95      0.90       752
        DET       0.88      0.89      0.88       490
          H       0.94      0.65      0.77        49
       INTJ       0.71      0.28      0.40        18
       NOUN       0.95      0.95      0.95      2857
        NUM       0.92      0.85      0.88       144
       PART       0.95      0.87      0.91       471
       PRON       0.91      0.92      0.91       911
      PUNCT       1.00      1.00      1.00      2139
       VERB       0.96      0.96      0.96      1791

avg / total       0.94      0.94      0.94     12745



In [27]:
test_current_word[0]

'Любая'

In [44]:
test_current_word[:10]

['Любая',
 'страна',
 'имеет',
 'право',
 'задержать',
 'на',
 'границе',
 'любого',
 'человека',
 'и']

In [28]:
errors = []
for i in range(len(true_pos)):
    if true_pos[i] != pred_pos_fair[i]:
        errors.append((true_pos[i], pred_pos_fair[i], test_current_word[i]))
    

In [29]:
errors

[('PRON', 'DET', 'его'),
 ('ADV', 'DET', 'все'),
 ('ADJ', 'NOUN', 'рабочий'),
 ('PRON', 'CONJ', 'Что'),
 ('CONJ', 'NOUN', 'Раз'),
 ('PRON', 'DET', 'все'),
 ('ADJ', 'ADV', 'равно'),
 ('NOUN', 'VERB', 'выстрел'),
 ('PART', 'CONJ', 'и'),
 ('PRON', 'CONJ', 'что'),
 ('ADV', 'NOUN', 'зря'),
 ('PART', 'CONJ', 'и'),
 ('PART', 'CONJ', 'и'),
 ('PRON', 'CONJ', 'что'),
 ('NOUN', 'ADJ', 'взрослых'),
 ('ADV', 'NOUN', 'еле'),
 ('CONJ', 'ADV', 'словно'),
 ('H', 'ADV', 'наверно'),
 ('ADV', 'ADP', 'назад'),
 ('NOUN', 'VERB', 'пугало'),
 ('H', 'ADV', 'наверно'),
 ('NOUN', 'ADJ', 'хуйню'),
 ('VERB', 'NOUN', 'пишет'),
 ('NOUN', 'VERB', 'мол'),
 ('VERB', 'PRON', 'умеем'),
 ('PART', 'CONJ', 'и'),
 ('INTJ', 'NOUN', 'блин'),
 ('ADJ', 'ADV', 'возможно'),
 ('ADV', 'CONJ', 'как'),
 ('INTJ', 'CONJ', 'Ага'),
 ('ADJ', 'VERB', 'могучего'),
 ('PRON', 'CONJ', 'Что'),
 ('DET', 'PRON', 'того'),
 ('PRON', 'CONJ', 'что'),
 ('H', 'VERB', 'Может'),
 ('ADJ', 'DET', 'многие'),
 ('CONJ', 'ADV', 'как'),
 ('DET', 'PRON', 'все'),


In [42]:
[-np.log(x) for x in predict_pos('Чей', '<START>', '<START>')]

8.601719112782513

In [37]:
predict_pos(obs[0], '<START>', '<START>')

NameError: name 'obs' is not defined

In [46]:
def viterbi(obs, states):
    V = [{}]
    
    pred_init_states = [-np.log(x) for x in predict_pos(obs[0], '<START>', '<START>')]
    
    for i in range(len(states)):
        V[0][states[i]] = {"prob": pred_init_states[i], "prev": None}
    # Run Viterbi when t > 0
    
    
    for t in range(1, len(obs)):
        V.append({})
        
        max_tr_prob = 0
        prev_state = None
            
        for prev_st in states:
            prob = V[t-1][prev_st]["prob"]*predict_pos(obs[t], obs[t-1], prev_st)
            if prob == max_tr_prob:
                max_prob = prob
                prev_state = prev_st
        
        V[t]['prob'] = max_tr_prob
        V[t]['prev'] = prev_state
            
    for line in dptable(V):
        print(line)
    opt = []
    # The highest probability
    max_prob = max(value["prob"] for value in V[-1].values())
    previous = None
    # Get most probable state and its backtrack
    for st, data in V[-1].items():
        if data["prob"] == max_prob:
            opt.append(st)
            previous = st
            break
    # Follow the backtrack till the first observation
    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1][previous]["prev"])
        previous = V[t + 1][previous]["prev"]

    print('The steps of states are ' + ' '.join(opt) + ' with highest probability of %s' % max_prob)

def dptable(V):
    # Print a table of steps from dictionary
    yield " ".join(("%12d" % i) for i in range(len(V)))
    for state in V[0]:
        yield "%.7s: " % state + " ".join("%.7s" % ("%f" % v[state]["prob"]) for v in V)

In [47]:
viterbi(test_current_word[:10], clf.classes_)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [53]:
obs, states = test_current_word[:10], clf.classes_

V = [{}]

pred_init_states = [-np.log(x) for x in predict_pos(obs[0], '<START>', '<START>')]

for i in range(len(states)):
    V[0][states[i]] = {"prob": pred_init_states[i], "prev": None}
# Run Viterbi when t > 0


for t in range(1, len(obs)):
    V.append({})

    
    cur_state = None

    for st in states:
        probs = [-np.log(x) for x in predict_pos(obs[t], obs[t-1], st)]
        max_tr_prob = 0
        prev_state = None
        for i in range(len(states)):
            prob = V[t-1][prev_st]["prob"]*probs[i]
            if prob > max_tr_prob:
                max_prob = prob
                prev_state = prev_st
                cur_state = states[i]

    V[t][cur_state] = {'prob':max_tr_prob,'prev':prev_state}

for line in dptable(V):
    print(line)
opt = []
# The highest probability
max_prob = max(value["prob"] for value in V[-1].values())
previous = None
# Get most probable state and its backtrack
for st, data in V[-1].items():
    if data["prob"] == max_prob:
        opt.append(st)
        previous = st
        break
# Follow the backtrack till the first observation
for t in range(len(V) - 2, -1, -1):
    opt.insert(0, V[t + 1][previous]["prev"])
    previous = V[t + 1][previous]["prev"]

print('The steps of states are ' + ' '.join(opt) + ' with highest probability of %s' % max_prob)

KeyError: 'ADJ'

In [52]:
cur_states

NameError: name 'cur_states' is not defined