## Imports and Initialisations

In [1]:
import nltk
nltk.download('stopwords')

from nltk.corpus import brown, stopwords
import xml.etree.ElementTree as ET
import string

# Sentences of the Brown corpus (not referenced again in the visible cells).
sents = brown.sents()
# NOTE: this rebinds `stopwords` from the imported nltk corpus reader to a plain
# set of English stopwords; after this line `stopwords.words(...)` is unavailable.
stopwords = set(stopwords.words('english'))
# Parsed XML of the homographic test set; its sentences are extracted in a later cell.
tree = ET.parse('../data/test/subtask1-homographic-test.xml')
# Punctuation characters; tokens starting with one of these are filtered out later.
puncts = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/khannatanmai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Gold labels: the second whitespace-separated field of every line of the gold file.
# The `with` block closes the file handle once the labels have been read.
with open("../data/test/subtask1-homographic-test.gold") as F:
    scores = [item.split()[1] for item in F]

In [3]:
# Build one token list per <text> element of the test set.
# Elements are iterated directly: Element.getchildren() is deprecated and was
# removed in Python 3.9, and iterating an element yields the same children.
test_sents = list()
root = tree.getroot()
for text in root:
    test_sents.append([word.text for word in text])

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [4]:
def filter_sent(sent):
    """Drop stopwords and punctuation tokens from a tokenised sentence.

    A token is removed when its lower-cased form is in the module-level
    ``stopwords`` collection or when its first character is in ``puncts``.
    Empty or ``None`` tokens (e.g. an XML element without text) are skipped
    instead of raising.

    Args:
        sent: iterable of token strings.

    Returns:
        list: the remaining tokens, in their original order and casing.
    """
    return [
        w for w in sent
        # `w and ...` guards both `w.lower()` on None and `w[0]` on ''.
        if w and w.lower() not in stopwords and w[0] not in puncts
    ]

def make_ordered_pairs(sent):
    """Return every pair ``(sent[i], sent[j])`` with ``i < j``.

    Pairs keep sentence order: they are grouped by the position of the
    first element, then by the position of the second. A sentence with
    fewer than two tokens yields an empty list.
    """
    size = len(sent)
    return [
        (sent[first], sent[second])
        for first in range(size - 1)
        for second in range(first + 1, size)
    ]

In [5]:
from collections import Counter

# Corpus-wide frequency tables built from the filtered test sentences.
words = Counter()
bigrams = Counter()

for sent in map(filter_sent, test_sents):
    # Count the surviving tokens, then every ordered pair they form.
    words.update(sent)
    bigrams.update(make_ordered_pairs(sent))

print(words.most_common(10))
print(bigrams.most_common(10))

[('never', 230), ('die', 201), ('OLD', 178), ('get', 101), ('good', 66), ('always', 62), ('name', 57), ('said', 57), ('Tom', 56), ('got', 51)]
[(('never', 'die'), 194), (('OLD', 'die'), 178), (('OLD', 'never'), 177), (('never', 'lose'), 40), (('die', 'lose'), 40), (('OLD', 'lose'), 38), (('never', 'get'), 36), (('die', 'get'), 34), (('OLD', 'get'), 33), (('Doctor', 'Next'), 19)]


## Calculating PMI

The PMI (Pointwise Mutual Information) is calculated as:
```
PMI(w1, w2) = log (P(w1, w2) / (P(w1) * P(w2)))
```

In [6]:
import math

def PMI(tup):
    """Return the count-based PMI score of an ordered word pair.

    The score is ``log(count(w1, w2) / (count(w1) * count(w2)))``, read from
    the module-level ``bigrams`` and ``words`` counters. Raw counts are used
    rather than normalised probabilities.

    Args:
        tup: ``(w1, w2)`` pair, as produced by ``make_ordered_pairs``.

    Returns:
        The natural-log score, or ``0`` when the pair count or the product of
        the word counts is not positive (the ratio or its log is undefined).
    """
    joint = bigrams[tup]
    independent = words[tup[0]] * words[tup[1]]
    # Guard the undefined cases explicitly rather than catching every
    # exception, so unrelated errors (e.g. a misspelled name) stay visible.
    if joint <= 0 or independent <= 0:
        return 0
    return math.log(joint / independent)

## Calculating Threshold

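The threshold on the spread of PMI scores within a sentence is calculated using the Interquartile Range (IQR): the IQR of the pair PMI scores is computed for every sentence, and the threshold is the median of these per-sentence IQRs. IQR is preferred because it is robust to outliers.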

In [7]:
import numpy as np
import scipy.stats as sp

# One IQR per sentence that yields at least one word pair.
IQRs = list()
for sent in test_sents:
    pairs = make_ordered_pairs(filter_sent(sent))
    PMIs = sorted(map(PMI, pairs))
    if PMIs:
        IQRs.append(sp.iqr(PMIs))

# The threshold is the median of the per-sentence IQRs.
IQRs.sort()
threshold = np.median(IQRs)
print(threshold)

1.7107470400723912


## Testing Sentences for Pun

- Converting each sentence into tokens
- Stopword Removal
- Generating word pairs preserving word order
- Calculating the PMI score of each pair and checking whether the IQR of the sentence's PMI scores is above the threshold
- Checking whether either word of the highest-PMI pair has multiple senses

In [8]:
from nltk.corpus import wordnet as wn

def has_multiple_sense(word):
    """Tell whether ``word`` heads more than one of its own WordNet synsets.

    A synset is counted when the part of its name before the first ``.``
    equals ``word`` exactly; the result is True when at least two match.
    """
    # NOTE(review): the comparison is case-sensitive, so a capitalised token
    # presumably never matches a lower-case synset name. Confirm this is intended.
    matching = 0
    for synset in wn.synsets(word):
        head = synset.name().split('.')[0]
        if head == word:
            matching += 1
    return matching > 1

# Predict '1' (pun) or '0' (not a pun) for every test sentence.
test_scores = list()
for sent in test_sents:
    pairs = make_ordered_pairs(filter_sent(sent))
    # Pair each candidate with its PMI, ascending, so the highest-PMI pair is last.
    PMIs = sorted(((pair, PMI(pair)) for pair in pairs), key=lambda item: item[1])

    is_pun = False
    # A sentence qualifies only if it has pairs and their PMI spread exceeds the threshold.
    if PMIs and sp.iqr([score for _, score in PMIs]) > threshold:
        first, second = PMIs[-1][0]
        is_pun = has_multiple_sense(first) or has_multiple_sense(second)

    test_scores.append('1' if is_pun else '0')

print(len(test_scores))

2250


## Calculating Precision, Recall, F1-Score

```
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1-Score = 2*(Recall*Precision)/(Recall+Precision)
```

In [9]:
# Confusion counts keyed by (gold label, predicted label).
t_scores = Counter([(scores[i], test_scores[i]) for i in range(len(scores))])

TP = t_scores[('1', '1')]
FP = t_scores[('0', '1')]
FN = t_scores[('1', '0')]

# Each ratio falls back to 0.0 when its denominator is zero (for example when
# no sentence was predicted as a pun) instead of raising ZeroDivisionError.
pre = TP / (TP + FP) if (TP + FP) else 0.0
re = TP / (TP + FN) if (TP + FN) else 0.0
f1 = 2 * (re * pre) / (re + pre) if (re + pre) else 0.0

print('precision', pre)
print('recall', re)
print('F1-score', f1)

precision 0.8181818181818182
recall 0.35283136278780336
F1-score 0.4930434782608695


## Manually Checking Failed Cases

In [17]:
# Indices of sentences whose gold label is '1' but which were predicted '0'.
fail_cases = [
    idx
    for idx, gold in enumerate(scores)
    if gold == '1' and test_scores[idx] == '0'
]

In [19]:
# Display the indices of the sentences with gold label '1' that were predicted '0'.
fail_cases

[0,
 1,
 2,
 3,
 6,
 9,
 13,
 14,
 15,
 17,
 18,
 23,
 24,
 26,
 28,
 29,
 30,
 32,
 33,
 37,
 40,
 42,
 43,
 44,
 46,
 47,
 50,
 52,
 57,
 60,
 61,
 62,
 69,
 71,
 72,
 74,
 75,
 79,
 80,
 81,
 82,
 85,
 86,
 87,
 90,
 91,
 92,
 96,
 97,
 98,
 102,
 105,
 107,
 112,
 114,
 116,
 118,
 123,
 124,
 125,
 126,
 129,
 132,
 134,
 135,
 136,
 138,
 139,
 143,
 144,
 146,
 148,
 149,
 150,
 152,
 153,
 155,
 156,
 158,
 159,
 161,
 163,
 164,
 166,
 168,
 171,
 172,
 173,
 174,
 176,
 177,
 178,
 180,
 182,
 185,
 187,
 188,
 190,
 194,
 195,
 196,
 197,
 198,
 202,
 203,
 205,
 206,
 207,
 213,
 216,
 217,
 218,
 220,
 221,
 222,
 223,
 226,
 227,
 230,
 232,
 233,
 235,
 236,
 237,
 239,
 244,
 245,
 246,
 249,
 253,
 257,
 260,
 261,
 264,
 266,
 267,
 268,
 271,
 272,
 276,
 283,
 285,
 287,
 293,
 297,
 299,
 300,
 302,
 303,
 314,
 319,
 321,
 328,
 333,
 337,
 338,
 340,
 343,
 344,
 345,
 347,
 348,
 349,
 352,
 353,
 354,
 355,
 357,
 358,
 359,
 360,
 361,
 365,
 367,
 368,
 370,


In [25]:
# Print the first ten failed sentences, one per line; every token is followed by a space.
for i in fail_cases[0:10]:
    print(''.join(f'{j} ' for j in test_sents[i]))

They hid from the gunman in a sauna where they could sweat it out . 
Wal - Mart isn ' t the only saving place ! 
Can honeybee abuse lead to a sting operation ? 
A ditch digger was entrenched in his career . 
Did you hear about the new pinata ? It ' s a huge hit . 
She was suspected of stealing a brooch but they couldn ' t pin it on her . 
' ' There ' s room for one more , ' ' Tom admitted . 
They threw a party for the inventor of the toaster . And he was toasted . 
If you ' re a gardener you might call yourself a ' plant manager ' . 
My advanced geometry class is full of squares . 
