In [1]:
import numpy as np
import nltk
from nltk.corpus import treebank, brown
import pdb

## Task 1

1. **Neural Networks** - Discriminative, as Neural Networks are used to classify outputs i.e. assign probability given an input as opposed to compute a joint distribution og target variables.
2. **Naive Bayes Classifier** - Generative, it may seem that Naive Bayes is discriminative as it models *p(y|X)*, but in doing so it utilizes *p(X, y)*. Hence Naive Bayes is generative.
3. **Logistic Regression** - Discriminative, as it directly models *p(y|X)* as a function of X.
4. **Gaussian Mixture Model** - Generative, in GMM, we model *p(X|y)* as a gaussian and hence, we are implicitly computing *p(X, y)* which is given as _p(y) * p(X|y)_.
5. **GAN** - Both generative and discriminative, the generator part models input distribution whereas the discriminator tells whether the data is coming from *real* input distribution or from generator.
6. **LDA** - Generative, as it models the documents and the words using a set of unboserved groups(topics)
7. **SVM** - Discriminative, similar as logistic regression, optimization method is different and models *p(y|X)*.
8. **Decision Trees** - Discriminative, in a decision tree, we directly model the true class label y given input x, by generating a decision tree

## Task 2

In [2]:
corpus = brown.tagged_sents(tagset='universal')[:-100] 
test_corpus = brown.tagged_sents(tagset='universal')[-100:]

tag_dict = {}
word_dict = {}

for sent in corpus:
    for elem in sent:
        w = elem[0]
        tag = elem[1]

        if w not in word_dict:
            word_dict[w]= 0

        if tag not in tag_dict:
            tag_dict[tag] = 0

        word_dict[w] += 1
        tag_dict[tag] += 1

print(len(word_dict))
print(len(tag_dict))
        
test_data = brown.tagged_sents(tagset='universal')[-10:]

print(len(test_data))

k = 0.001

55907
12
10


In [3]:
# Building Start Dict

ind_tag = []
tag_ind = {}
start_matrix = {}

for tag in tag_dict:
    ind_tag += [tag]

for ind, tag in enumerate(ind_tag):
    tag_ind[tag] = ind

start_tag_counts = {}
total_sentences = 0

for sent in corpus:
    for elem in sent:
        start_tag_counts[elem[1]] = start_tag_counts.get(elem[1], 0) + 1
        break
    total_sentences += 1

for tag in tag_ind:
    start_matrix[tag] = start_tag_counts[tag] / total_sentences
start_matrix

{'.': 0.08901118099231307,
 'ADJ': 0.0343466107617051,
 'ADP': 0.12283368273934311,
 'ADV': 0.09117749825296995,
 'CONJ': 0.04916142557651992,
 'DET': 0.21339972047519218,
 'NOUN': 0.14129979035639412,
 'NUM': 0.016788958770090845,
 'PRON': 0.15971348707197763,
 'PRT': 0.03665269042627533,
 'VERB': 0.045090845562543676,
 'X': 0.0005241090146750524}

In [4]:
# Building Emission Matrix

tag_word_counts = {}
emission_matrix = {}

for sent in corpus:
    for elem in sent:
        tag_word_counts[elem[1]] = tag_word_counts.get(elem[1], {})
        tag_word_counts[elem[1]][elem[0]] = tag_word_counts[elem[1]].get(elem[0], 0)
        tag_word_counts[elem[1]][elem[0]] += 1

for tag in tag_dict:
    emission_matrix[tag] = {}
    for word in word_dict:
        emission_matrix[tag][word] = (tag_word_counts[tag].get(word, 0) + k) / (k * len(word_dict) + tag_dict[tag])
emission_matrix['NOUN']['tent']

7.2696303799849e-05

In [5]:
# Building Transition Matrix

tag_tag_count = {}
transition_matrix = {}

for sent in corpus:
    for i in range(len(sent) - 1):
        tag_tag_count[sent[i][1]] = tag_tag_count.get(sent[i][1], {})
        tag_tag_count[sent[i][1]][sent[i + 1][1]] = tag_tag_count[sent[i][1]].get(sent[i + 1][1], 0)
        tag_tag_count[sent[i][1]][sent[i + 1][1]] += 1

for tag1 in tag_dict:
    transition_matrix[tag1] = {}
    for tag2 in tag_dict:
        transition_matrix[tag1][tag2] = (tag_tag_count[tag1].get(tag2, 0) + k) / (k * len(tag_dict) + sum(tag_tag_count[tag1].values()))

sum(transition_matrix['.'].values())

1.0

In [11]:
# Viterbi Algorithm

def viterbi(sent):
    viterbi_dp = {}
    viterbi_dp[0] = {}
    backpointer = {}
    backpointer[0] = {}
    
    for tag in tag_dict:
        viterbi_dp[0][tag] = start_matrix[tag] * emission_matrix[tag].get(sent[0][0], k / float(k * len(word_dict) + tag_dict[tag]))
        backpointer[0][tag] = -1

    for i, elem in enumerate(sent[1:], 1):
        viterbi_dp[i] = {}
        backpointer[i] = {}
        for tag in tag_dict:
            viterbi_dp[i][tag] = emission_matrix[tag].get(elem[0], k / float(k * len(word_dict) + tag_dict[tag])) * (max(transition_matrix[tag_before][tag] * viterbi_dp[i - 1][tag_before] for tag_before in tag_dict))
            backpointer[i][tag] = max([[transition_matrix[tag_before][tag] * viterbi_dp[i - 1][tag_before], tag_before] for tag_before in tag_dict], key=lambda k: k[0])[1]
    i = len(sent) - 1
    best_state_pointer = max(viterbi_dp[i], key=viterbi_dp[i].get)
    path = [best_state_pointer]
    while i != -1:
        path += [backpointer[i][best_state_pointer]]
        best_state_pointer = backpointer[i][best_state_pointer]
        i -= 1
    path.reverse()
    return path

total = 0
total_correct = 0

for sent in test_corpus:
    pred = viterbi(sent)
    total_correct += sum([x == y[1] for x, y in zip(pred[1:], sent)])
    total += len(sent)

print("Accuracy: ", total_correct / total * 100)
    
for i, sent in enumerate(test_data):
    pred = viterbi(sent)
    print(i, ":")
    print(sent)
    print("Prediction - ", pred[1:])


Accuracy:  92.27215455690886
0 :
[('you', 'PRON'), ("can't", 'VERB'), ('very', 'ADV'), ('well', 'ADV'), ('sidle', 'VERB'), ('up', 'ADP'), ('to', 'ADP'), ('people', 'NOUN'), ('on', 'ADP'), ('the', 'DET'), ('street', 'NOUN'), ('and', 'CONJ'), ('ask', 'VERB'), ('if', 'ADP'), ('they', 'PRON'), ('want', 'VERB'), ('to', 'PRT'), ('buy', 'VERB'), ('a', 'DET'), ('hot', 'ADJ'), ('Bodhisattva', 'NOUN'), ('.', '.')]
Prediction -  ['PRON', 'VERB', 'ADV', 'ADV', 'VERB', 'PRT', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'CONJ', 'VERB', 'ADP', 'PRON', 'VERB', 'PRT', 'VERB', 'DET', 'ADJ', 'NOUN', '.']
1 :
[('Additionally', 'ADV'), (',', '.'), ('since', 'ADP'), ("you're", 'PRT'), ('going', 'VERB'), ('to', 'PRT'), ('be', 'VERB'), ('hors', 'X'), ('de', 'X'), ('combat', 'X'), ('pretty', 'ADV'), ('soon', 'ADV'), ('with', 'ADP'), ('sprue', 'NOUN'), (',', '.'), ('yaws', 'NOUN'), (',', '.'), ('Delhi', 'NOUN'), ('boil', 'NOUN'), (',', '.'), ('the', 'DET'), ('Granville', 'NOUN'), ('wilt', 'NOUN'), (',', '.'), ('liver'

## Task 3

In [13]:
# pip3 install sklearn-crfsuite # install this please

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.tag.util import untag

train_sents= corpus

def word2features(sentence,index):
    word = sentence[index]
    
    features ={
        'bias': 1.0,
        'word': sentence[index], #Justification - Obviously, word is going to be a feature
        'is_first': index == 0, #Justification - Some words tend to occur more as a first word (like a, the etc.)
        'is_capitalized': sentence[index][0].upper() == sentence[index][0], #Justification- Nouns are more likely to start with capitalized letters
        'is_all_caps': sentence[index][0].upper() == sentence[index][0], #Justification - Abbreviations generally have all caps and are generally nouns
        'is_all_lower': sentence[index][0].lower() == sentence[index][0], #Justification - Would be not a proper noun
        'prefix-1': sentence[index][0], #Justification - for capturing the stem of the word
        'prefix-2': sentence[index][:2], #Justification - same as above
        'prefix-3': sentence[index][:3], #Justification - same as above
        'suffix-1': sentence[index][-1], #Justification - suffixes like 'y' is more common with Adjectives, etc
        'suffix-2': sentence[index][-2:], #Justification - suffixes like 'al'(rebuttal) more common with Nouns and Adejctives(capable),etc
        'suffix-3': sentence[index][-3:], #Justification - suffixes like 'ing' occur only in tags like Verbs, etc
        'prev_word': '' if index == 0 else sentence[index - 1], #Justification - There may be dependecies on the prev word
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1], #Justification - The next word becomes important in cases like of adjectives because if the next word is a Noun then it is more likely to be an adjective
        'has_hyphen': '-' in sentence[index], #Justification - Hyphenated words are more likely to be Adjectives or Nouns and cannot be prepositions, conjunctions
        'is_numeric': sentence[index][0].isdigit(), #Justification - Numbers can be identified using this feature(NUM tag)
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:] #Justification - Such words are usually proper nouns
    }
                
    return features

def sent2features(sent):
    return [word2features(untag(sent),i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for i,label in sent]



In [14]:
X_train=[sent2features(s) for s in train_sents]
y_train=[sent2labels(s) for s in train_sents]

X_test=[sent2features(s) for s in test_corpus]
y_test=[sent2labels(s) for s in test_corpus]

In [15]:

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [16]:
y_pred = crf.predict(X_test)
labels=list(crf.classes_)

metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.95826906914597854

In [17]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

total_correct = 0
total = 0
for pred, real in zip(y_pred, y_test):
    total_correct += sum([s1 == s2 for s1, s2 in zip(pred, real)])
    total += len(pred)
total_correct / total

             precision    recall  f1-score   support

          .      1.000     1.000     1.000       334
          X      1.000     0.176     0.300        17
        ADJ      0.881     0.900     0.890       140
        ADP      0.968     0.975     0.972       283
        ADV      0.901     0.879     0.890       124
       VERB      0.975     0.941     0.957       370
        DET      0.997     1.000     0.998       295
       CONJ      1.000     0.988     0.994        84
       NOUN      0.926     0.963     0.944       483
       PRON      1.000     0.994     0.997       160
        PRT      0.883     0.971     0.925        70
        NUM      0.952     0.952     0.952        21

avg / total      0.961     0.960     0.958      2381



0.9601007979840404