In [1]:
import pickle
import re
import string
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd

In [2]:
df = pickle.load(open('test.pkl', 'rb'))

In [3]:
df.head(20)

Unnamed: 0,Answer,Clue
0,TORY,Whig's rival
1,EDGE,Add a fringe to
2,WOODSY,Suggestive of a forest
3,DOZEN,Egg purchase
4,ILIAD,Tale of Troy
5,VINCE,Coach Lombardi
6,ANGER,Rile up
7,KORAN,Imam's book
8,PAEAN,Song of praise
9,JEDI,"""Star Wars"" warrior"


## Tokenization

In [4]:
x = df.head(20)

In [6]:
[sent.split() for sent in test]

[['DENY', 'Declare', 'untrue'],
 ['JEDI', '"Star', 'Wars"', 'knight'],
 ['KNEE', 'Where', 'pants', 'may', 'be', 'worn'],
 ['SAWN', 'Cut', 'as', 'a', 'log'],
 ['ECOL', 'Tree', "hugger's", 'subj.'],
 ['DOSE', 'Two', 'caplets,', 'say?'],
 ['TYNE', "Newcastle's", 'river'],
 ['FILL', '___', 'in', 'the', 'blank']]

In [108]:
def tokenizer(sent):
    sent = sent.split()
    
    answer = sent[0]
#     print type(answer)
    clue = ' '.join(sent[1:]).encode('utf8')
#     print type(clue)
    
    tokens = [answer]
    if '?' in clue:
        tokens.append('?')
    if '!' in clue:
        tokens.append('!')
    if '_' in clue:
        tokens.append('_')
    if '.' in clue:
        tokens.append('.')
#     if '"' in clue:
#         tokens.append('"')
    if "'" in clue:
        tokens.append("'")
    if ':' in clue:
        tokens.append(':')
        
    ## More than 1 capitalized letters that start a word? Proper noun
    if len(re.findall('[A-Z]', clue)) > 1:
        tokens.append('_CAP')
        
    quotes = re.findall('\".+\"', clue)
    if quotes:
        tokens.extend(quotes)
        
    possessive = re.findall("\w+'s", clue)
    if possessive:
        tokens.extend(possessive)
    
#     print sent
#     print tokens
    
    stemmer = SnowballStemmer('english')
#     print type(clue)
    words = clue.translate(string.maketrans("",""), string.punctuation).lower().split()
#     print words
#     print [stemmer.stem(w) for w in words]
    tokens.extend([stemmer.stem(w).encode('utf8') for w in words])
#     print tokens, '\n'
    return tokens

## Vectorizer

#### With a small test set

In [87]:
small_test = pd.DataFrame({'Combined':['DENY Declare untrue',
            'JEDI "Star Wars" knight',
            'KNEE Where pants may be worn',
            'SAWN Cut as a log',
            "ECOL Tree hugger's subj.",
            'DOSE Two caplets, say?',
            "TYNE Newcastle's river",
            'FILL ___ in the blank']})

# test = ['Declare untrue',
#         '"Star Wars" knight',
#         'Where pants may be worn',
#         'Cut as a log',
#         "Tree hugger's subj.",
#         'Two caplets, say?',
#         "Newcastle's river",
#         '___ in the blank',
#         'USA USA USA']

In [88]:
small_test

Unnamed: 0,Combined
0,DENY Declare untrue
1,"JEDI ""Star Wars"" knight"
2,KNEE Where pants may be worn
3,SAWN Cut as a log
4,ECOL Tree hugger's subj.
5,"DOSE Two caplets, say?"
6,TYNE Newcastle's river
7,FILL ___ in the blank


In [99]:
for sent in small_test:
    print tokenizer(sent)

['Combined']


In [98]:
vectorizer = CountVectorizer(tokenizer=tokenizer)
vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenizer at 0x10e2f5848>, vocabulary=None)

In [100]:
small_array = vectorizer.fit_transform(small_test.Combined).toarray()
print 'Small set vectorized:\n', small_array

Small set vectorized:
[[0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 1 1]
 [0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
  0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0
  0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1
  0 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0]]


In [101]:
bag_of_words = pd.DataFrame(small_array, columns = vectorizer.get_feature_names())
bag_of_words

Unnamed: 0,"""","""star wars""",',.,?,_,a,as,be,blank,...,star,subj,the,tree,two,tyne,untru,war,where,worn
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [105]:
target = ['Two "Star Wars" where war was fought and cut river blank?']

target_array = vectorizer.transform(target).toarray()
print 'Target set vectorized:\n', target_array

cosine = cosine_similarity(target_array, small_array)
cosine_column = pd.DataFrame(cosine.tolist()[0], columns = ['cosine'])

result = pd.concat([small_test, bag_of_words, cosine_column], axis = 1)
result.sort_values(by='cosine', ascending=False)

Target set vectorized:
[[1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0
  0 2 1 0]]


Unnamed: 0,Combined,"""","""star wars""",',.,?,_,a,as,be,...,subj,the,tree,two,tyne,untru,war,where,worn,cosine
1,"JEDI ""Star Wars"" knight",1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.566139
5,"DOSE Two caplets, say?",0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0.248069
3,SAWN Cut as a log,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0.124035
6,TYNE Newcastle's river,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.124035
7,FILL ___ in the blank,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0.124035
2,KNEE Where pants may be worn,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0.113228
0,DENY Declare untrue,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.0
4,ECOL Tree hugger's subj.,0,0,1,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0.0


### With test data frame

In [97]:
df['Combined'] = df[['Answer', 'Clue']].apply(lambda x: ' '.join(x), axis = 1)
df

Unnamed: 0,Answer,Clue,Combined
0,TORY,Whig's rival,TORY Whig's rival
1,EDGE,Add a fringe to,EDGE Add a fringe to
2,WOODSY,Suggestive of a forest,WOODSY Suggestive of a forest
3,DOZEN,Egg purchase,DOZEN Egg purchase
4,ILIAD,Tale of Troy,ILIAD Tale of Troy
5,VINCE,Coach Lombardi,VINCE Coach Lombardi
6,ANGER,Rile up,ANGER Rile up
7,KORAN,Imam's book,KORAN Imam's book
8,PAEAN,Song of praise,PAEAN Song of praise
9,JEDI,"""Star Wars"" warrior","JEDI ""Star Wars"" warrior"


In [109]:
vectorizer2 = CountVectorizer(tokenizer=tokenizer)

test_array = vectorizer2.fit_transform(df.Combined).toarray()
# print 'Small set vectorized:\n', test_array

bag_of_words = pd.DataFrame(test_array, columns = vectorizer2.get_feature_names())
bag_of_words

Small set vectorized:
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 1 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


Unnamed: 0,!,"""___ #1!""","""___ irish rose""","""___ of the d'urbervilles""","""chocolat""","""star wars""","""vacancy""",',.,1,...,whig,whig's,whistl,wine,wipe,with,womb,woodsy,work,worn
0,0,0,0,0,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [110]:
target2 = ['Two "Star Wars" where war was fought and cut river blank?']

target2_array = vectorizer2.transform(target2).toarray()
print 'Target set vectorized:\n', target2_array

cosine = cosine_similarity(target2_array, test_array)
cosine_column = pd.DataFrame(cosine.tolist()[0], columns = ['cosine'])

result = pd.concat([df, bag_of_words, cosine_column], axis = 1)
result.sort_values(by='cosine', ascending=False)

Target set vectorized:
[[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0
  0 0 0 0 0]]


Unnamed: 0,Answer,Clue,Combined,!,"""___ #1!""","""___ irish rose""","""___ of the d'urbervilles""","""chocolat""","""star wars""","""vacancy""",...,whig's,whistl,wine,wipe,with,womb,woodsy,work,worn,cosine
9,JEDI,"""Star Wars"" warrior","JEDI ""Star Wars"" warrior",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0.565685
62,DAM,River regulator,DAM River regulator,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.182574
96,DIVA,Met star,DIVA Met star,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.182574
15,DOSE,"Two caplets, say","DOSE Two caplets, say",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.158114
11,SAWN,"Cut, as a log","SAWN Cut, as a log",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.141421
44,EBRO,Saragossa's river,EBRO Saragossa's river,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.141421
12,TYNE,Newcastle's river,TYNE Newcastle's river,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.141421
14,KNEE,Where pants may be worn,KNEE Where pants may be worn,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.129099
68,COW,It's no bull,COW It's no bull,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.000000
64,UNLATCHES,"Opens, as a gate","UNLATCHES Opens, as a gate",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.000000
