## Imports and Initialisations

In [18]:
import nltk
nltk.download('stopwords')

from nltk.corpus import brown, stopwords
import xml.etree.ElementTree as ET
import string

# brown.sents() reads the Brown corpus from the local NLTK data directory and
# raises LookupError when it is missing, so fetch it first (the call only
# reports "already up-to-date" when the package is present).
nltk.download('brown')
sents = brown.sents()

# NOTE: this rebinds the name `stopwords` from the NLTK corpus reader to a
# plain set of English stopword strings; later cells rely on the set.
stopwords = set(stopwords.words('english'))

# Parsed XML of the test sentences; sentence extraction happens in a later cell.
tree = ET.parse('../data/test/subtask1-homographic-test.xml')

# String of ASCII punctuation characters used to discard punctuation tokens.
puncts = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sounak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Gold labels: the second whitespace-separated field of every line in the
# .gold file, kept as strings in file order.
# The context manager closes the file handle once the labels are read, and
# blank lines (e.g. a trailing newline) are skipped instead of raising
# IndexError.
with open("../data/test/subtask1-homographic-test.gold") as gold_file:
    scores = [line.split()[1] for line in gold_file if line.strip()]

In [20]:
# Build one token list per child of the XML root: each child of the root is
# treated as a sentence and each of its children contributes its text.
# Elements are iterated directly because Element.getchildren() was removed
# in Python 3.9.
root = tree.getroot()
test_sents = [[word.text for word in text] for text in root]

In [78]:
def filter_sent(sent):
    """Return the tokens of ``sent`` that are neither stopwords nor punctuation.

    A token is dropped when its lower-cased form is in the module-level
    ``stopwords`` set or when its first character is in ``puncts``.
    Empty or ``None`` tokens (which an XML element without text produces)
    are dropped as well instead of raising.

    Args:
        sent: iterable of token strings.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [
        w for w in sent
        # `w and ...` guards both w.lower() on None and w[0] on "".
        if w and w.lower() not in stopwords and w[0] not in puncts
    ]

def make_ordered_pairs(sent):
    """Return every pair of tokens from ``sent`` that keeps sentence order.

    For tokens at positions ``i < j`` the tuple ``(sent[i], sent[j])`` is
    produced; pairs are listed by increasing ``i`` and then increasing ``j``.
    Inputs with fewer than two tokens yield an empty list.

    Args:
        sent: iterable of tokens.

    Returns:
        A list of 2-tuples.
    """
    # Local import keeps this definition self-contained within its cell.
    from itertools import combinations

    # combinations() emits position-ordered pairs in exactly the order the
    # nested index loops would.
    return list(combinations(sent, 2))

In [95]:
from collections import Counter

# Frequency tables over the whole test set:
#   words   - occurrences of each filtered token
#   bigrams - occurrences of each in-sentence ordered token pair
words = Counter()
bigrams = Counter()

for raw_sent in test_sents:
    tokens = filter_sent(raw_sent)
    words.update(tokens)
    bigrams.update(make_ordered_pairs(tokens))

# Quick sanity check of the most frequent entries in each table.
print('\t', words.most_common(10))
print('\t', bigrams.most_common(10))

	 [('never', 230), ('die', 201), ('OLD', 178), ('get', 101), ('good', 66), ('always', 62), ('name', 57), ('said', 57), ('Tom', 56), ('got', 51)]
	 [(('never', 'die'), 194), (('OLD', 'die'), 178), (('OLD', 'never'), 177), (('never', 'lose'), 40), (('die', 'lose'), 40), (('OLD', 'lose'), 38), (('never', 'get'), 36), (('die', 'get'), 34), (('OLD', 'get'), 33), (('Doctor', 'Next'), 19)]


## Calculating PMI

The PMI (Pointwise Mutual Information) of a word pair is calculated as:
```example
PMI(w1, w2) = log (P(w1, w2) / (P(w1) * P(w2)))
```

In [96]:
import math

def PMI(tup):
    """Return the PMI-style association score of the word pair ``tup``.

    The score is ``log(bigrams[tup] / (words[w1] * words[w2]))`` using the
    module-level ``bigrams`` and ``words`` counters. Raw counts are used
    rather than normalised probabilities, so the value differs from the
    textbook PMI by an additive constant that is the same for every pair.

    Args:
        tup: 2-tuple ``(w1, w2)`` of tokens.

    Returns:
        The log ratio as a float, or ``0`` when either word or the pair has
        never been counted.
    """
    try:
        ratio = bigrams[tup] / (words[tup[0]] * words[tup[1]])
        return math.log(ratio)
    except (ZeroDivisionError, ValueError):
        # ZeroDivisionError: one of the words has a zero count.
        # ValueError: the pair has a zero count, so log(0) is undefined.
        # Anything else (e.g. a NameError from a cell that was not run)
        # is a real problem and is allowed to propagate.
        return 0

## Calculating Threshold

The threshold is calculated using the Interquartile Range (IQR): for every sentence, the IQR of the PMI values of its word pairs is computed, and the median of these per-sentence IQRs is taken as the threshold. IQR is preferred because it is robust to outliers.

In [97]:
import numpy as np
import scipy.stats as sp

# Collect, for every sentence that yields at least one word pair, the
# interquartile range of its pair PMI scores.
# Note that the sentences used here are the same test sentences that are
# classified further down.
IQRs = list()
for raw_sent in test_sents:
    candidate_pairs = make_ordered_pairs(filter_sent(raw_sent))
    pair_pmis = sorted(PMI(pair) for pair in candidate_pairs)
    if len(pair_pmis) > 0:
        IQRs.append(sp.iqr(pair_pmis))

# The decision threshold is the median of the per-sentence IQRs.
IQRs.sort()
threshold = np.median(IQRs)
print('\t', threshold)

	 1.7107470400723912


## Testing Sentences for Pun

In [98]:
# Classify every test sentence: predict 1 when the IQR of its pair PMI scores
# exceeds `threshold`, otherwise 0 (including sentences with no word pairs).
# Each line printed is "<prediction> <gold label>".
scores_test = list()
for i, raw_sent in enumerate(test_sents):
    pairs = make_ordered_pairs(filter_sent(raw_sent))
    pmi_values = [PMI(pair) for pair in pairs]

    # A sentence without pairs has no IQR; it defaults to the 0 prediction.
    prediction = int(bool(pmi_values) and sp.iqr(pmi_values) > threshold)

    print('\t', prediction, scores[i])
    # Record every prediction (positives too) so that scores_test stays
    # index-aligned with test_sents and scores.
    scores_test.append(prediction)

	 1 1
	 0 1
	 0 1
	 0 1
	 1 1
	 0 0
	 1 1
	 1 1
	 1 1
	 0 1
	 1 1
	 0 0
	 0 0
	 0 1
	 0 1
	 0 1
	 1 0
	 0 1
	 0 1
	 1 1
	 1 1
	 1 1
	 1 1
	 0 1
	 0 1
	 1 1
	 1 1
	 1 1
	 0 1
	 1 1
	 0 1
	 0 0
	 0 1
	 1 1
	 0 0
	 0 0
	 0 0
	 1 1
	 1 1
	 0 0
	 1 1
	 1 0
	 1 1
	 0 1
	 1 1
	 1 1
	 0 1
	 0 1
	 0 0
	 1 1
	 0 1
	 1 1
	 0 1
	 0 0
	 1 1
	 0 0
	 0 0
	 1 1
	 0 0
	 1 1
	 0 1
	 0 1
	 0 1
	 0 0
	 0 0
	 1 0
	 1 1
	 1 0
	 1 1
	 0 1
	 1 1
	 0 1
	 1 1
	 1 1
	 0 1
	 1 1
	 1 1
	 1 1
	 1 0
	 1 1
	 0 1
	 0 1
	 1 1
	 0 0
	 1 1
	 1 1
	 0 1
	 1 1
	 1 0
	 1 1
	 0 1
	 0 1
	 1 1
	 0 0
	 0 0
	 1 1
	 1 1
	 1 1
	 1 1
	 1 1
	 1 0
	 1 0
	 1 1
	 1 1
	 0 0
	 1 1
	 1 1
	 0 1
	 1 1
	 1 1
	 0 0
	 1 1
	 0 1
	 0 0
	 0 1
	 1 1
	 0 1
	 1 0
	 1 1
	 1 1
	 1 0
	 0 0
	 0 0
	 0 1
	 0 1
	 0 1
	 1 1
	 1 1
	 1 0
	 1 1
	 0 0
	 1 1
	 1 1
	 1 0
	 0 1
	 0 1
	 0 1
	 1 1
	 0 1
	 0 1
	 1 1
	 1 1
	 1 0
	 0 1
	 1 1
	 1 0
	 0 1
	 1 1
	 0 1
	 0 1
	 1 1
	 1 1
	 0 1
	 0 1
	 1 0
	 0 1
	 0 1
	 0 0
	 1 1
	 0 1
	 1 1
	 0 1
	 1 1
	 1 1
	 0 1
	 0 0
	 0 

	 0 0
	 1 1
	 0 0
	 0 1
	 1 1
	 0 1
	 1 1
	 0 1
	 1 1
	 0 1
	 1 1
	 1 1
	 0 1
	 0 1
	 1 1
	 0 1
	 0 0
	 0 1
	 0 0
	 0 1
	 1 1
	 0 0
	 0 0
	 0 1
	 0 1
	 0 0
	 1 0
	 0 0
	 1 1
	 1 1
	 1 0
	 0 0
	 0 0
	 0 0
	 1 1
	 1 1
	 1 1
	 0 1
	 1 1
	 1 1
	 0 1
	 0 1
	 0 1
	 0 1
	 0 1
	 0 0
	 0 1
	 1 0
	 1 0
	 1 0
	 0 0
	 0 0
	 1 1
	 1 0
	 0 1
	 1 1
	 0 0
	 1 1
	 1 1
	 0 1
	 1 1
	 1 0
	 0 0
	 1 0
	 0 1
	 0 1
	 1 1
	 0 1
	 1 1
	 0 0
	 0 1
	 1 1
	 0 1
	 0 0
	 0 1
	 0 1
	 1 1
	 0 1
	 1 1
	 0 1
	 1 0
	 0 1
	 1 1
	 0 0
	 0 0
	 0 1
	 0 1
	 1 1
	 1 1
	 1 0
	 0 0
	 0 0
	 0 1
	 0 0
	 1 0
	 0 1
	 1 1
	 0 1
	 0 1
	 0 0
	 0 1
	 1 1
	 1 1
	 0 1
	 1 1
	 1 1
	 0 1
	 1 1
	 1 1
	 1 1
	 0 1
	 1 1
	 1 0
	 0 1
	 1 0
	 1 1
	 0 1
	 0 0
	 1 0
	 0 0
	 1 1
	 0 1
	 1 0
	 0 0
	 0 0
	 1 1
	 1 1
	 1 1
	 1 1
	 1 0
	 0 1
	 1 0
	 1 1
	 0 0
	 1 0
	 0 1
	 1 1
	 1 1
	 0 0
	 1 1
	 1 1
	 1 1
	 1 1
	 0 0
	 0 1
	 0 1
	 1 1
	 1 1
	 1 1
	 1 1
	 0 1
	 1 1
	 1 1
	 1 1
	 0 1
	 0 0
	 1 0
	 1 1
	 1 1
	 0 1
	 0 0
	 0 1
	 1 1
	 0 1
	 0 1
	 1 1
	 1 