## Imports and Initialisations

In [1]:
import nltk
nltk.download('stopwords')

from nltk.corpus import brown, stopwords
import xml.etree.ElementTree as ET

sents = brown.sents()
stopwords = set(stopwords.words('english'))
tree = ET.parse('../data/test/subtask1-homographic-test.xml')
puncts = list()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sounak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
F = open("../data/test/subtask1-homographic-test.gold")
scores = list()
for item in F.readlines():
    ans = item.split()[1]
    scores.append(ans)

In [3]:
test_sents = list()
root = tree.getroot()
for text in root.getchildren():
    sent = list()
    for word in text.getchildren():
        sent.append(word.text)
    test_sents.append(sent)

In [4]:
def make_ordered_pairs(sent):
    sent = [w for w in sent if not w.lower() in stopwords]
    ans = list()
    for index, word in enumerate(sent):
        temp = index
        while(temp != len(sent) - 1):
            ans.append((sent[index], sent[temp]))
            temp += 1
    return ans

In [5]:
from collections import Counter

words = Counter([])
bigrams = Counter([])

for sent in sents:
    pairs = make_ordered_pairs(sent)
    words.update(sent)
    bigrams.update(pairs)
    
print(words.most_common(10))
print(bigrams.most_common(10))

[('the', 62713), (',', 58334), ('.', 49346), ('of', 36080), ('and', 27915), ('to', 25732), ('a', 21881), ('in', 19536), ('that', 10237), ('is', 10011)]
[((',', ','), 113930), (('``', '``'), 10166), (("''", "''"), 9976), (('``', "''"), 8109), (('``', ','), 7771), ((',', "''"), 7588), ((',', '``'), 5808), (("''", ','), 5078), (('--', '--'), 4234), (('one', 'one'), 3074)]


## Calculating PMI

The PMI (Pointwise Mututal Information) is being calculated as:
```example
PMI(w1, w2) = log (P(w1, w2) / (P(w1) * P(w2)))
```

In [97]:
import math

def PMI(tup):
    try:
        val = bigrams[tup] / (words[tup[0]] * words[tup[1]])
        return math.log(val)
    except:
        return 0
    
def median(lst):
    l = len(lst)
    if l == 1:
        return lst[0]
    if l == 2:
        return (lst[0] + lst[1]) / 2
    if l % 2 == 1:
        return lst[int(l / 2)]
    else:
        return (lst[int(l / 2)] + lst[int(l / 2) + 1]) / 2
    
def IQR(lst):
    l = len(lst)
    m = int(l / 2)
    if l % 2 == 0:
        return median(lst[m:]) - median(lst[:m])
    else:
        return median(lst[m+1:]) - median(lst[:m])

## Calculating Threshold

The threshold for the difference in the highest PMI and is calculated using the Interquartile Range (IQR). IQR is preferred because it is able to eliminate outliers.

In [118]:
IQRs = list()
for sent in sents:
    pairs = make_ordered_pairs(sent)
    PMIs = sorted([PMI(_) for _ in pairs])
    l = len(PMIs)
    if l > 2:
        m = int(l / 2)
        if l % 2 == 0:
            IQRs.append(IQR(PMIs))
        else:
            IQRs.append(IQR(PMIs))
        
IQRs = sorted(IQRs)
threshold = median(IQRs)
print('\t', threshold)

	 4.1262343528403305


## Testing Sentences for Pun

In [122]:
scores_test = list()
for i, sent in enumerate(test_sents):
    pairs = make_ordered_pairs(sent)
    PMIs = sorted([(_, PMI(_)) for _ in pairs], key=lambda k : k[1])
    l = len(PMIs)
    if l > 2:
        if IQR([v for k, v in PMIs]) > threshold:
            print(PMIs[-1])
        else:
            scores_test.append(0)
    else:
        scores_test.append(0)

(('forge', 'ahead'), 0)
(('believe', 'stood'), 0)
(('huge', 'hit'), 0)
(('time', 'vault'), 0)
(('examples', 'move'), 0)
(('easy', 'age'), 0)
(('Tom', 'admitted'), 0)
(('manager', "'"), 0)
(('Take', 'Leave'), 0)
(('draw', 'fast'), 0)
(('Tom', 'snorted'), 0)
(('burning', 'questions'), 0)
(('could', 'spell'), 0)
(('shot', 'opportunities'), 0)
(('around', 'beach'), 0)
(('raft', 'problems'), 0)
(('zucchini', 'potential'), 0)
(('.', "'"), 0)
(('coached', 'coached'), 0)
(('hypothetical', 'questions'), 0)
(('sex', 'marital'), 0)
(('wrong', 'levels'), 0)
(('Time', 'money'), 0)
(('end', 'space'), 0)
(('fool', 'never'), 0)
(('case', 'buy'), 0)
(('always', 'parts'), 0)
(('better', 'focus'), 0)
(('groundbreaking', 'event'), 0)
(('die', 'cover'), 0)
(('valor', 'valor'), 0.0)
(('shattered', 'knight'), 0)
(('patent', 'lies'), 0)
(('bad', 'company'), 0)
(('soup', 'see'), 0)
(('wise', 'event'), 0)
(('foot', 'bill'), 0)
(('fix', 'anything'), 0)
(("'", 'bottom'), 0)
(('become', 'irrational'), 0)
(('least'

(("'", 'click'), 0)
(('built', 'scale'), 0)
(('bust', "'"), 0)
(('seem', 'fit'), 0)
(('going', 'tangents'), 0)
(('system', 'boot'), 0)
(('real', 'hit'), 0)
(('gets', 'boots'), 0)
(('.', "'"), 0)
(('knew', 'town'), 0)
(('Tom', 'entranced'), 0)
(('wanted', 'answer'), 0)
(('moon', 'world'), 0)
(('bonding', 'experience'), 0)
(('real', 'keeper'), 0)
(("'", 'put'), 0)
(('mind', 'sees'), 0)
(('Kleenex', 'Kleenex'), 0.0)
(('week', 'trial'), 0)
(('men', 'target'), 0)
(("'", 'holiday'), 0)
(('hard', 'swallow'), 0)
(('pick', 'skills'), 0)
(('.', '.'), 0)
(('useful', 'changed'), 0)
(('chickened', 'minute'), 0)
(('"', '"'), 0)
(('Everything', 'small'), 0)
(('cures', 'like'), 0)
(('putting', 'spin'), 0)
(('pushed', 'luck'), 0)
(('hobby', 'took'), 0)
(('wish', 'father'), 0)
(('loved', 'still'), 0)
(('great', 'hitting'), 0)
(('feel', 'run'), 0)
(('bad', 'bargain'), 0)
(('work', 'born'), 0)
(('winner', 'everything'), 0)
(('funeral', 'seen'), 0)
(('vegetate', 'vegetate'), 0)
(('Manila', 'envelope'), 0)


(('summer', 'mathematician'), 0)
(('.', "'"), 0)
(('fiddling', 'around'), 0)
(('succeed', 'try'), 0)
(('quite', 'nicely'), 0)
(('appears', 'spirits'), 0)
(('insecurities', 'insecurities'), 0)
(('blocks', 'home'), 0)
(("'", 'worker'), 0)
(('stalling', 'stalling'), 0.0)
(('ambidextrous', 'ambidextrous'), 0.0)
(('liked', 'policy'), 0)
(('knew', 'sheets'), 0)
(('broken', 'age'), 0)
(('blustered', 'Tom'), 0)
(('Tom', 'refused'), 0)
(('Tom', 'quivering'), 0)
(('leagues', 'hit'), 0)
(('99', 'rounded'), 0)
(('lies', 'upon'), 0)
(("'", 'fault'), 0)
(('goes', 'hitch'), 0)
(('motivations', 'transparent'), 0)
(('remains', 'popular'), 0)
(('lunch', 'short'), 0)
(('die', 'young'), 0)
(('cleaning', 'service'), 0)
(('terrible', 'landings'), 0)
(('clock', 'run'), 0)
(('reception', 'excellent'), 0)
(("'", 'wing'), 0)
(('.', 'smiled'), 0)
(('moonlighting', 'moonlighting'), 0)
(("'", 'ruler'), 0)
(('get', 'erased'), 0)
(('Prevention', 'cure'), 0)
(('pine', 'away'), 0)
(('needling', 'people'), 0)
(('diver'