# Verb Prediction

This notebook transforms the HTB dataset from https://github.com/UniversalDependencies/UD_Hebrew-HTB into the form you need for the verb-prediction assignment. You may equally copy the code into a `.py` file and run it directly from a command prompt, outside the Jupyter environment.

### Imports

In [143]:
import random
import subprocess
from pathlib import Path
from pprint import pprint # slightly nicer printing of data structures

import pyconll # library parsing CoNLL-U files
from sklearn.metrics import confusion_matrix
from tqdm import tqdm_notebook # progress bars

## Downloading and loading the HTB corpus
To future-proof, we standardize on a specific version of the HTB. <br>Those with a keen eye will notice we use here the quick-and-dirty ⚡ way of launching OS commands from directly within the notebook. <br>⚙ Anyway, make sure you have git installed and working on your OS before proceeding.

In [None]:
# Clone the HTB corpus once and pin it to a fixed commit so results stay reproducible.
# The directory check makes this cell idempotent: a full re-run neither re-clones nor
# fails. (The previous `%%script false` guard exits with status 1, which IPython versions
# that raise on a non-zero exit status report as CalledProcessError.)
htb_dir = Path('UD_Hebrew-HTB')
if not htb_dir.is_dir():
    subprocess.run(
        ['git', 'clone', 'https://github.com/UniversalDependencies/UD_Hebrew-HTB', str(htb_dir)],
        check=True,
    )
    # check out the exact revision this notebook was developed against
    subprocess.run(
        ['git', 'checkout', '82591c955e86222e32531336ff23e36c220b5846'],
        cwd=str(htb_dir),
        check=True,
    )

In [None]:
# Path of the training file inside the cloned UD_Hebrew-HTB directory.
conllu_file = 'UD_Hebrew-HTB/he_htb-ud-train.conllu'
# Parse the file with pyconll; later cells iterate over the result sentence by sentence, token by token.
conllu = pyconll.load_from_file(conllu_file)

## Quick data exploration
Let's quantify how many verbs we have per sentence.

In [None]:
# Number of tokens tagged VERB in each sentence: one entry per sentence, in corpus order.
# Written as a comprehension so no scratch counter is left behind in the notebook
# namespace (the old loop bound `verbs` to an int, a name reused later for a dict).
counts = [
    sum(1 for token in sentence if token.upos == 'VERB')
    for sentence in conllu
]

In [None]:
import pandas as pd
# Wrap the per-sentence verb counts in a Series (this rebinds `counts` from a list to a Series).
counts = pd.Series(counts)
# How many sentences have each verb count, ordered by verb count.
counts.value_counts().sort_index()

In [139]:
# Frequency tables keyed by surface form: how often each form occurs tagged as VERB
# and how often it occurs with any other tag.
verbs = {}
non_verbs = {}

for sentence in conllu:
    for token in sentence:
        # pick the table matching the token's part of speech
        table = verbs if token.upos == 'VERB' else non_verbs
        # the first occurrence counts as 1 (it was previously stored as 0,
        # leaving every frequency one too low)
        table[token.form] = table.get(token.form, 0) + 1


print('{:,} unique verbs in training data'.format(len(verbs)))
print('{:,} unique non-verbs in training data'.format(len(non_verbs)))

# forms that were seen both as a verb and as a non-verb
ambiguous = set(verbs) & set(non_verbs)

print('{:,} words are ambiguous'.format(len(ambiguous)))
print('ambiguous words:\n' + str(ambiguous))

4,984 unique verbs in training data
25,228 unique non-verbs in training data
545 words are ambiguous
ambiguous words:
{'אשמים', 'משנה', 'מוזמן', 'הבהיר', 'פחדה', 'מועדים', 'אוכל', 'חוזה', 'מקומם', 'שותף', 'נוכח', 'חתומות', 'כלל', 'מעורבים', 'חולף', 'קונה', 'מעניין', 'הערים', 'כתבו', 'מרים', 'נדון', 'עוברים', 'סומך', 'מבין', 'חסר', 'הוצאה', 'למתן', 'לנו', 'הבנתי', 'מאמן', 'רודפים', 'יוצאי', 'איים', 'עובדת', 'מרבה', 'לתקן', 'לפנות', 'לגמול', 'מסירות', 'נטל', 'פטר', 'מוליך', 'כלא', 'מתוך', 'מונחים', 'שקועה', 'ידוע', 'לחוקרו', 'מכירה', 'מקבלת', 'מנהל', 'מחלקים', 'הקלה', 'להפכם', 'לדבר', 'מעניינת', 'יוצרים', 'מאמינים', 'להציגה', 'תומכת', 'ניתנים', 'נושא', 'מוכרת', 'נגיד', 'עלה', 'לדעת', 'קדמה', 'ניתנת', 'משך', 'מובנת', 'מכפר', 'שמה', 'קברו', 'כותב', 'פרצה', 'יציע', 'מראיין', 'נבחרים', 'כעס', 'שכבה', 'מזכירות', 'טענה', 'מושלמת', 'שורר', 'נתונים', 'חיים', 'לחצה', 'מספר', 'גורמים', 'דומה', 'לרכוש', 'באות', 'מחייבים', 'מתמחים', 'מצויים', 'לקח', 'לבחור', 'מובילים', 'גאה', 'כרוך', 'עודד', 'נתן', 

In [None]:
# Character frequencies over the lexicon; the keys of this dict become the alphabet.
characters = dict()

# Unique verb forms followed by unique non-verb forms. A form seen with both
# kinds of tag appears twice here and is therefore counted twice below.
lexicon = list(verbs.keys()) + list(non_verbs.keys())

for word in tqdm_notebook(lexicon):
    for char in word:
        # the first occurrence counts as 1 (it was previously stored as 0)
        characters[char] = characters.get(char, 0) + 1

In [None]:
# Sorted list of every distinct character seen in the lexicon.
alphabet = sorted(characters)
print('the alphabet size in this corpus is {}'.format(len(alphabet)))
print('alphabet:\n' + str(alphabet))

In [None]:
# Map each alphabet character to its position in the feature vector (a dict comprehension).
letter_index = {letter: idx for idx, letter in enumerate(alphabet)}

def vectorize(word):
    '''Turn a word into a bag-of-characters count vector.

    Args:
        word: the string to encode.

    Returns:
        A list with one integer per alphabet character, holding how many times
        that character occurs in `word`. Characters that are not part of the
        alphabet are ignored, so arbitrary input no longer raises KeyError.
    '''
    vec = [0] * len(letter_index)
    for char in word:
        idx = letter_index.get(char)
        if idx is not None:  # skip characters never seen when the alphabet was built
            vec[idx] += 1

    return vec

letter_index

In [None]:
def train_test_split(X, y, train_proportion=0.8, seed=None):
    '''Randomly split paired data into train and test sets.

    Args:
        X: sequence of inputs.
        y: sequence of labels, one per input (same length as X).
        train_proportion: fraction of the data (strictly between 0 and 1) that
            goes into the train set; the train size is rounded down.
        seed: optional seed for a private random generator, making the split
            reproducible. When None, the module-level `random` state is used.

    Returns:
        (X_train, X_test, y_train, y_test), each a list. Items keep their
        original relative order within each part.
    '''

    assert len(X) == len(y), 'input data should have exactly one prediction per input'
    assert 0 < train_proportion < 1, 'this function requires a proportion between zero and one as its train_proportion argument'

    data_count = len(X)
    train_count = int(data_count * train_proportion)

    # Sample from a range rather than a set: random.sample() rejects sets on
    # Python 3.11+ (deprecated since 3.9).
    rng = random if seed is None else random.Random(seed)
    train_indices = sorted(rng.sample(range(data_count), train_count))

    # Everything not drawn for training goes to the test set, in ascending index order.
    chosen = set(train_indices)
    test_indices = [idx for idx in range(data_count) if idx not in chosen]

    X_train = [X[idx] for idx in train_indices]
    X_test  = [X[idx] for idx in test_indices]

    y_train = [y[idx] for idx in train_indices]
    y_test  = [y[idx] for idx in test_indices]

    assert len(X_train) + len(X_test) == len(X)

    return X_train, X_test, y_train, y_test

In [101]:
from sklearn.naive_bayes import MultinomialNB

# Encode every unique verb form (positive class) and non-verb form (negative class).
X_positive = [vectorize(form) for form in verbs]
X_negative = [vectorize(form) for form in non_verbs]

# Labels: 1 for verb forms, 0 for non-verb forms.
y_positive = [1 for _ in X_positive]
y_negative = [0 for _ in X_negative]

# Split each class separately so both parts keep the same class proportions.
X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(X_positive, y_positive)
X_train_neg, X_test_neg, y_train_neg, y_test_neg = train_test_split(X_negative, y_negative)

# Training data: positives first, then negatives.
X_train = [*X_train_pos, *X_train_neg]
y_train = [*y_train_pos, *y_train_neg]

In [102]:
# Multinomial naive Bayes over the per-word character-count vectors (default hyper-parameters).
clf = MultinomialNB()

# Fit on the combined positive/negative training data; the fitted estimator is the cell's displayed value.
clf.fit(X_train, y_train)

MultinomialNB(alpha=0.1)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [124]:
# Per-character log-probabilities learned for class 1 (verbs). `coef_` was deprecated
# and later removed from MultinomialNB; for a two-class model it mirrored
# `feature_log_prob_[1]`, which is used directly here.
dict(zip(letter_index, clf.feature_log_prob_[1]))

{'!': -9.913586387213542,
 '"': -9.913586387213542,
 '%': -9.913586387213542,
 '(': -9.913586387213542,
 ')': -9.913586387213542,
 ',': -9.913586387213542,
 '-': -9.913586387213542,
 '.': -9.913586387213542,
 '0': -9.913586387213542,
 '1': -9.913586387213542,
 '2': -9.913586387213542,
 '3': -9.913586387213542,
 '4': -9.913586387213542,
 '5': -9.913586387213542,
 '6': -9.913586387213542,
 '7': -9.913586387213542,
 '8': -9.913586387213542,
 '9': -9.913586387213542,
 ':': -9.913586387213542,
 ';': -9.913586387213542,
 '?': -9.913586387213542,
 '_': -9.913586387213542,
 'א': -3.8757154672914043,
 'ב': -3.3525557213169686,
 'ג': -4.000083381575272,
 'ד': -3.727377763313048,
 'ה': -2.5579452842392882,
 'ו': -2.197571120570955,
 'ז': -4.510909005341262,
 'ח': -3.4259023687289316,
 'ט': -4.282374605392176,
 'י': -2.199801770614787,
 'ך': -5.636920268197486,
 'כ': -3.806563499471287,
 'ל': -2.767601919499154,
 'ם': -3.5767606560671004,
 'מ': -2.704246130610632,
 'ן': -5.093304821608505,
 'נ': -

In [136]:
# Ad-hoc sanity check: classify a digits-only string. (An earlier assignment of a
# word made only of the letter 'ץ' was overwritten before use and has been dropped.)
word = '12324983249898234'
clf.predict([vectorize(word)])

array([0])

In [145]:
# Assemble the held-out set in the same positives-then-negatives order used for the training lists.
X_test = X_test_pos + X_test_neg
y_test = y_test_pos + y_test_neg

# Predicted labels for the held-out vectors.
y_pred = clf.predict(X_test)

In [155]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out set (label 1 = verb forms, label 0 = non-verb forms).
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.83      1.00      0.91      5046
          1       0.00      0.00      0.00       997

avg / total       0.70      0.83      0.76      6043



In [156]:
# Counts of true vs. predicted labels; scikit-learn puts true labels on rows and predicted labels on columns.
confusion_matrix(y_test, y_pred)

array([[5042,    4],
       [ 997,    0]])