# Shaku
**A demonstration of the goal distance and measurement framework in Kyudo**

In [105]:
# Imports for package management
import os
import json

from collections import Counter
from nltk import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

In [26]:
# Module constants and fixtures
# Fixture paths are resolved relative to the current working directory.
FIXTURES    = os.path.join(os.getcwd(), 'fixtures')
CORPUS_PATH = os.path.join(FIXTURES, 'questions.corpus.json')

# Tasks Constants
# Labels emitted as the 'task' feature value; all labels are upper case.
WHO    = 'WHO'
WHAT   = 'WHAT'
WHEN   = 'WHEN'
WHERE  = 'WHERE'
WHY    = 'WHY'
HOW    = 'HOW'
EXIST  = 'EXISTENTIAL'
PERMIT = 'PERMISSION'
UNKNWN = 'UNKNOWN'

# Task Mapping
# Maps a lower-cased leading token of an utterance to its task label.
TASK_MAP    = {
    'who':    WHO,
    'what':   WHAT,
    'when':   WHEN,
    'where':  WHERE,
    'why':    WHY,
    'how':    HOW,
    'which':  WHAT,
    'in':     WHAT,
    'are':    EXIST,
    'on':     WHAT,
    'can':    PERMIT,
    'does':   EXIST,
    'is':     EXIST,
    'at':     WHERE,
    'for':    WHAT,
    'with':   WHAT,
    'did':    EXIST,
    'whats':  WHAT,
    'should': PERMIT
}

In [70]:
def read_data(corpus=CORPUS_PATH):
    """
    Reads and parses corpus data and yields each item at a time.

    The file at ``corpus`` is parsed as a single JSON document and the
    top-level value is iterated, so every element of that value is yielded
    in order. Because this is a generator, the file is not opened until
    the first item is requested.
    """
    # json.load reads the whole document up front, so the handle can be
    # released before yielding rather than staying open for as long as the
    # consumer keeps the generator suspended.
    with open(corpus, 'r') as f:
        data = json.load(f)

    for item in data:
        yield item

In [114]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that normalizes text: surrounding whitespace is
    stripped and the result is converted to lower case.
    """

    def fit(self, x, y=None):
        # Nothing is learned from the data; hand back the instance itself.
        return self

    def transform(self, texts):
        # Build the normalized list one document at a time.
        normalized = []
        for raw in texts:
            trimmed = raw.strip()
            normalized.append(trimmed.lower())
        return normalized


class TaskVectorizer(BaseEstimator, TransformerMixin):
    """
    Provide the task feature from an utterance.

    Each utterance is turned into a one-entry dict ``{'task': <label>}``
    whose label is looked up in TASK_MAP from the utterance's first token.
    TASK_MAP keys are lower case, so utterances are expected to be
    lower-cased before they reach this transformer.
    """

    def fit(self, x, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, utterances):
        """
        Returns a list with one task dict per utterance, in input order.
        """
        return [
            self.extract_task(utterance)
            for utterance in utterances
        ]

    def extract_task(self, utterance):
        """
        Returns ``{'task': label}`` for a single utterance, falling back to
        UNKNWN when the first token is not in TASK_MAP or when the
        utterance produces no tokens at all.
        """
        tokens = wordpunct_tokenize(utterance)

        # Guard against utterances that tokenize to nothing (e.g. an empty
        # string); indexing tokens[0] would otherwise raise IndexError.
        if not tokens:
            return {'task': UNKNWN}

        return {'task': TASK_MAP.get(tokens[0], UNKNWN)}

## Construct the Feature Pipeline
feats = Pipeline([
    # Preprocess text to make sure it is normalized.
    ('preprocess', TextPreprocessor()),

    # Use FeatureUnion to combine the concept and task features.
    ('union', FeatureUnion(

        # Create union of TF-IDF and Task Vectorizers
        transformer_list=[

            # Pipeline for Concept Extraction: TF-IDF term weights reduced
            # to 50 components with truncated SVD.
            ('concepts', Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=wordpunct_tokenize)),
                # Fix the seed so repeated fits of the SVD step produce the
                # same components and the notebook output is reproducible.
                ('best', TruncatedSVD(n_components=50, random_state=42)),
            ])),

            # Pipeline for Task Extraction: task dicts one-hot encoded by
            # DictVectorizer.
            ('tasks', Pipeline([
                ('tasks', TaskVectorizer()),
                ('vect', DictVectorizer()),
            ])),

        ],

        # weight components in FeatureUnion
        transformer_weights={
            'concepts': 0.45,
            'tasks': 0.55,
        },
    )),
])

In [115]:
# Fit every stage of the feature pipeline on the items yielded by read_data().
feats.fit(read_data())

Pipeline(steps=[('preprocess', TextPreprocessor()), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('concepts', Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowerca...ort=True,
        sparse=True))]))],
       transformer_weights={'tasks': 0.55, 'concepts': 0.45}))])

In [117]:
# Print feature representation of a single example utterance.
# The single-argument call form prints the same on Python 2 and also parses on Python 3.
print(feats.transform(('where is the food truck?',)))

  (0, 0)	0.0852185900015
  (0, 1)	0.0315209099528
  (0, 2)	0.0568007038668
  (0, 3)	-0.0361139208257
  (0, 4)	-0.067512677812
  (0, 5)	0.0134246498796
  (0, 6)	0.0321236105732
  (0, 7)	-0.0250677447214
  (0, 8)	0.00097913443713
  (0, 9)	0.00561780033721
  (0, 10)	-0.00736342322186
  (0, 11)	-0.0165369384268
  (0, 12)	-0.00172181491667
  (0, 13)	0.000891893120333
  (0, 14)	0.0198861130849
  (0, 15)	-0.00829117992882
  (0, 16)	0.00200409416144
  (0, 17)	0.00400897570371
  (0, 18)	0.00522655472095
  (0, 19)	-0.0138163001452
  (0, 20)	-0.00253459183025
  (0, 21)	0.00602975391631
  (0, 22)	0.00310560120697
  (0, 23)	0.000463039075122
  (0, 24)	-0.00890815484898
  :	:
  (0, 26)	-0.0135424346071
  (0, 27)	0.00719519199843
  (0, 28)	-0.00435666999392
  (0, 29)	-0.00457773381357
  (0, 30)	-0.000897927855733
  (0, 31)	-0.012106371337
  (0, 32)	0.00679734992166
  (0, 33)	-0.00162369163835
  (0, 34)	0.00906422418398
  (0, 35)	-0.00423696426473
  (0, 36)	0.00564358846507
  (0, 37)	-0.00340856070295

In [119]:
# Build the nearest-neighbor index (5 neighbors per query) over the corpus
# features; the feature pipeline is fit and applied to the corpus in one pass.
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(feats.fit_transform(read_data()))

In [127]:
def find_related_questions(question):
    """
    Prints the corpus items that are nearest neighbors of ``question``.

    The question is transformed with the ``feats`` pipeline and looked up
    in the ``nbrs`` index. The matching corpus items are printed in corpus
    order; they are not ranked by distance.
    """
    features = feats.transform((question,))

    # Only the neighbor positions are used, so skip the distances. The
    # index array is flattened into a set of plain ints so the membership
    # test below is a hash lookup rather than a scan of a 2-D array.
    indices = nbrs.kneighbors(features, return_distance=False)
    neighbors = set(indices.ravel().tolist())

    for idx, text in enumerate(read_data()):
        if idx in neighbors:
            print(text)

In [128]:
# Example query; the pipeline's 'preprocess' step lower-cases it before features are extracted.
find_related_questions('What are bizarre sports that people play in Washington DC?')

what are bizarre events that happen in washington dc?
what are bizarre things that people eat in washington dc?
what are bizarre sports that people play in washington dc?
what religion are most people in pakistan?
what are portuguese people considered?
