# Verb Prediction ― bag of letters | without ambiguous ones

This notebook transforms the HTB dataset from https://github.com/UniversalDependencies/UD_Hebrew-HTB into what you need for the verb-prediction assignment. You may equally copy the code into a `.py` file and run it directly from a command prompt, outside the Jupyter environment.

### Imports

In [143]:
from tqdm import tqdm_notebook # progress bars
import pyconll # library parsing CoNLL-U files
from pprint import pprint # slightly nicer printing of data structures
from sklearn.metrics import confusion_matrix
import random

## Downloading and loading the HTB corpus
To future-proof, we standardize on a specific version of the HTB. <br>Those with a keen eye will notice we use here the quick-and-dirty ⚡ way of launching OS commands from directly within the notebook. <br>⚙ Anyway, make sure you have git installed and working on your OS before proceeding.

In [None]:
%%script false
# NOTE(review): the `%%script false` line above hands this cell's body to the `false` command,
# so the git commands below are not executed as written. Remove that first line to
# actually clone the corpus and pin it to the commit given below.
!git clone https://github.com/UniversalDependencies/UD_Hebrew-HTB
!cd UD_Hebrew-HTB && git checkout 82591c955e86222e32531336ff23e36c220b5846

In [None]:
# Training split of the corpus, addressed relative to the notebook's working directory.
conllu_file = 'UD_Hebrew-HTB/he_htb-ud-train.conllu'
# Parsed corpus; the cells below iterate it sentence by sentence and token by token.
conllu = pyconll.load_from_file(conllu_file)

## Quick data exploration
Let's quantify how many verbs we have per sentence.

In [None]:
# Number of tokens tagged VERB in each sentence, in corpus order.
# Written as a comprehension so no scratch counter is left behind in the
# notebook namespace (a later cell binds the name `verbs` to a dict).
counts = [
    sum(1 for token in sentence if token.upos == 'VERB')
    for sentence in conllu
]

In [None]:
import pandas as pd
# Wrap the per-sentence verb counts in a Series so pandas can tabulate them.
counts = pd.Series(counts)
# Frequency of each distinct verb count, ordered by the count itself (0, 1, 2, ...).
counts.value_counts().sort_index()

In [None]:
# Frequency tables keyed by surface form: how many times each form is tagged
# VERB, and how many times it carries any other part-of-speech tag.
verbs = {}
non_verbs = {}

for sentence in conllu:
    for token in sentence:
        # Choose the table this occurrence belongs to, then count it.
        # `.get(form, 0) + 1` makes the very first occurrence count as 1.
        table = verbs if token.upos == 'VERB' else non_verbs
        table[token.form] = table.get(token.form, 0) + 1


print('{:,} unique verbs in training data'.format(len(verbs)))
print('{:,} unique non-verbs in training data'.format(len(non_verbs)))

# Forms observed both as a verb and as a non-verb cannot be given a single label.
ambiguous = verbs.keys() & non_verbs.keys()

print('{:,} words are ambiguous'.format(len(ambiguous)))
print()

In [172]:
# Rebuild both tables without the ambiguous forms (each line is a dict comprehension).
verbs = {form: count for form, count in verbs.items() if form not in ambiguous}
non_verbs = {form: count for form, count in non_verbs.items() if form not in ambiguous}

print('after removing ambiguous words:\n')
print('{:,} unique verbs in training data'.format(len(verbs)))
print('{:,} unique non-verbs in training data'.format(len(non_verbs)))

after removing ambiguous words:

4,439 unique verbs in training data
24,683 unique non-verbs in training data


In [173]:
# How many times each character occurs across the unambiguous lexicon.
characters = dict()

# Every remaining surface form, verbs first and then non-verbs.
lexicon = list(verbs.keys()) + list(non_verbs.keys())

for word in tqdm_notebook(lexicon):
    for char in word:
        # `.get(char, 0) + 1` counts the first sighting as 1 rather than 0.
        characters[char] = characters.get(char, 0) + 1

HBox(children=(IntProgress(value=0, max=29122), HTML(value='')))




In [174]:
# Every distinct character seen in the lexicon, in sorted order.
alphabet = sorted(characters)
print('the alhpabet size in this corpus is {}'.format(len(alphabet)))
print('alphabet:\n' + str(alphabet))

the alhpabet size in this corpus is 49
alphabet:
['!', '"', '%', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '_', 'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'ך', 'כ', 'ל', 'ם', 'מ', 'ן', 'נ', 'ס', 'ע', 'ף', 'פ', 'ץ', 'צ', 'ק', 'ר', 'ש', 'ת']


In [175]:
# Position of each letter within the sorted alphabet (built with a dict comprehension).
letter_index = {letter: idx for idx, letter in enumerate(alphabet)}

def vectorize(word, index=None):
    ''' turn word into a bag-of-letters vector

    The result holds one slot per known letter, containing how many times that
    letter occurs in `word`, followed by one final slot containing `len(word)`.

    word  -- the string to encode
    index -- optional mapping from letter to slot position; when omitted the
             notebook-level `letter_index` is used

    Characters that are missing from the mapping are skipped in the letter
    counts instead of raising KeyError, so arbitrary strings can be encoded;
    they still contribute to the trailing length slot.
    '''
    if index is None:
        index = letter_index

    # letter occurrences
    vec = [0] * len(index)
    for char in word:
        slot = index.get(char)
        if slot is not None:
            vec[slot] += 1

    # word length
    vec.append(len(word))

    return vec

# Bare expression: the notebook displays the letter -> position mapping as this cell's output.
letter_index

{'!': 0,
 '"': 1,
 '%': 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '-': 6,
 '.': 7,
 '0': 8,
 '1': 9,
 '2': 10,
 '3': 11,
 '4': 12,
 '5': 13,
 '6': 14,
 '7': 15,
 '8': 16,
 '9': 17,
 ':': 18,
 ';': 19,
 '?': 20,
 '_': 21,
 'א': 22,
 'ב': 23,
 'ג': 24,
 'ד': 25,
 'ה': 26,
 'ו': 27,
 'ז': 28,
 'ח': 29,
 'ט': 30,
 'י': 31,
 'ך': 32,
 'כ': 33,
 'ל': 34,
 'ם': 35,
 'מ': 36,
 'ן': 37,
 'נ': 38,
 'ס': 39,
 'ע': 40,
 'ף': 41,
 'פ': 42,
 'ץ': 43,
 'צ': 44,
 'ק': 45,
 'ר': 46,
 'ש': 47,
 'ת': 48}

In [176]:
def train_test_split(X, y, train_proportion=0.8, seed=None):
    ''' split the given data into train and test sets

    X                -- sequence of inputs
    y                -- sequence of labels, one per input (same length as X)
    train_proportion -- fraction of the data, strictly between 0 and 1, that
                        goes to the training set (rounded down to whole items)
    seed             -- optional seed for a private random generator; when
                        omitted the module-level `random` state is used

    Returns the tuple (X_train, X_test, y_train, y_test). Within each part the
    items keep their original relative order.

    Raises AssertionError when X and y differ in length or when
    train_proportion is outside the open interval (0, 1).
    '''

    assert len(X) == len(y), 'input data should have exactly one prediction per input'
    assert 0 < train_proportion < 1, 'this function requires a proportion between zero and one as its train_proportion argument'

    data_count = len(X)
    rng = random if seed is None else random.Random(seed)

    # random.sample() requires a sequence; sampling from a set is rejected by
    # recent Python versions, so the index range is sampled directly.
    train_indices = set(rng.sample(range(data_count), int(data_count * train_proportion)))

    # Walk the indices in ascending order so the output order is deterministic.
    train_order = [idx for idx in range(data_count) if idx in train_indices]
    test_order = [idx for idx in range(data_count) if idx not in train_indices]

    X_train = [X[idx] for idx in train_order]
    X_test = [X[idx] for idx in test_order]

    y_train = [y[idx] for idx in train_order]
    y_test = [y[idx] for idx in test_order]

    assert len(X_train) + len(X_test) == len(X)

    return X_train, X_test, y_train, y_test

In [177]:
from sklearn.naive_bayes import MultinomialNB

# Fix the global RNG state so the random split below (and therefore every
# metric computed from it) is the same on each Restart & Run All.
random.seed(42)

# One feature vector per unambiguous surface form.
X_positive = [vectorize(form) for form in verbs]
X_negative = [vectorize(form) for form in non_verbs]

# Label 1 marks a verb, label 0 a non-verb.
y_positive = [1] * len(X_positive)
y_negative = [0] * len(X_negative)

# Split each class separately so both parts keep the same class ratio.
X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(X_positive, y_positive)
X_train_neg, X_test_neg, y_train_neg, y_test_neg = train_test_split(X_negative, y_negative)

X_train = X_train_pos + X_train_neg
y_train = y_train_pos + y_train_neg

In [178]:
# Multinomial naive Bayes constructed with no explicit hyper-parameters.
# NOTE(review): the output saved under this cell reports alpha=0.1, which this
# call does not set — the saved output looks stale; confirm which value is intended.
clf = MultinomialNB()

# Fit on the combined positive + negative training vectors.
clf.fit(X_train, y_train)

MultinomialNB(alpha=0.1)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [179]:
# Log-probability of each feature under class 1 (verbs), keyed by feature name.
# `feature_log_prob_[1]` is read directly because the `coef_` alias is
# deprecated for naive Bayes models and absent from recent scikit-learn.
# The feature vector has one more entry than the alphabet (the trailing
# word-length slot), so that slot is named explicitly instead of being
# silently dropped by zip().
feature_names = list(letter_index.keys()) + ['<word length>']
dict(zip(feature_names, clf.feature_log_prob_[1]))

{'!': -10.502928266655765,
 '"': -10.502928266655765,
 '%': -10.502928266655765,
 '(': -10.502928266655765,
 ')': -10.502928266655765,
 ',': -10.502928266655765,
 '-': -10.502928266655765,
 '.': -10.502928266655765,
 '0': -10.502928266655765,
 '1': -10.502928266655765,
 '2': -10.502928266655765,
 '3': -10.502928266655765,
 '4': -10.502928266655765,
 '5': -10.502928266655765,
 '6': -10.502928266655765,
 '7': -10.502928266655765,
 '8': -10.502928266655765,
 '9': -10.502928266655765,
 ':': -10.502928266655765,
 ';': -10.502928266655765,
 '?': -10.502928266655765,
 '_': -10.502928266655765,
 'א': -4.589425261017495,
 'ב': -4.097699808624923,
 'ג': -4.673982649045557,
 'ד': -4.443805071073968,
 'ה': -3.2395986491789284,
 'ו': -2.87343835026177,
 'ז': -5.2046109001077285,
 'ח': -4.148558225858414,
 'ט': -4.900809445776064,
 'י': -2.8900972362484083,
 'ך': -6.359793540264232,
 'כ': -4.547090897190934,
 'ל': -3.4347562662677227,
 'ם': -4.352325498209486,
 'מ': -3.4364612965188073,
 'ן': -5.848

In [180]:
# Sanity check on a digits-only string, which should not be classified as a verb.
# (An earlier assignment to `word` was overwritten before use and has been removed.)
word = '12324983249898234'
clf.predict([vectorize(word)])

array([0])

In [181]:
# Held-out set: positive examples first, then negatives.
X_test = [*X_test_pos, *X_test_neg]
y_test = [*y_test_pos, *y_test_neg]

# Labels the fitted classifier assigns to every held-out vector.
y_pred = clf.predict(X_test)

In [182]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out set.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.85      1.00      0.92      4937
          1       0.00      0.00      0.00       888

avg / total       0.72      0.85      0.78      5825



In [183]:
# Confusion matrix on the held-out set (scikit-learn convention: rows are true labels, columns are predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[4935,    2],
       [ 888,    0]])