# Imports

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Training Set

In [67]:
# TRAINING_SET = [
#     [
#         "horse barn fox hen house farm farmhouse chickens goats plants fields grow farmer tractor",
#         "engine tires wheel transmission seats steering wheel garage mileage lights gas ",
#         "bat ball glove pitch base foul pole infield inning outfield home run rbi steal double play score first second third base",
#         "football ball field touchdown foul safety lineman quarterback tackle interception fieldgoal extra point down punt kickoff",
#         "volleyball ball setter hitter middle serve spike pass attack set quick block net ",
#         "democrat republican election vote political race polls constituency ",
#     ],
#     [
#         "Farming",
#         "Auto",
#         "Baseball",
#         "Football",
#         "Volleyball",
#         "Election"
#     ]
# ]

# Two parallel lists: TRAINING_SET[0][i] is the document whose class label is
# TRAINING_SET[1][i].
TRAINING_SET = [
    ["bat ball glove pitch base foul pole infield inning outfield ", "football ball field touchdown ", "volleyball ball setter hitter spike"],
    ["Baseball", "Football", "Volleyball"],
]

# Simplified TFIDF Classifier

In [81]:
class TFIDF:
    """Minimal TF-IDF classifier in which each training document is a class.

    A question is scored against every training document by taking the dot
    product of their TF-IDF vectors; the labels of the best-scoring documents
    are returned as guesses.
    """

    def __init__(self):
        # All three attributes are populated by train(); None until then.
        self.i_to_ans = None          # training-row index -> answer label
        self.tfidf_vectorizer = None  # TfidfVectorizer fitted on the training docs
        self.tfidf_matrix = None      # one TF-IDF row per training document

    def train(self, training_set):
        """Fit the vectorizer and cache the training documents' TF-IDF rows.

        Args:
            training_set: a pair ``(docs, answers)`` where ``answers[i]`` is
                the label for ``docs[i]``.
        """
        docs, answers = training_set
        self.i_to_ans = dict(enumerate(answers))
        vectorizer_kwargs = {
            'ngram_range': (1, 1),
            'min_df': 1,
#             'max_df': .95
        }
        self.tfidf_vectorizer = TfidfVectorizer(**vectorizer_kwargs).fit(docs)
        self.tfidf_matrix = self.tfidf_vectorizer.transform(docs)

    def guess(self, questions, max_n_guesses=2):
        """Return the best-scoring labels for each question.

        Args:
            questions: list of strings to classify.
            max_n_guesses: maximum number of guesses returned per question.

        Returns:
            A list with one entry per question; each entry is a list of
            ``(label, score)`` tuples ordered from highest to lowest score.
        """
        representations = self.tfidf_vectorizer.transform(questions)
        # Densify once: rows are questions, columns are training documents.
        scores = self.tfidf_matrix.dot(representations.T).T.toarray()
        # Negate so an ascending argsort ranks the highest score first; the
        # stable sort keeps tied classes in training order.
        guess_indices = (-scores).argsort(axis=1, kind='stable')[:, :max_n_guesses]
        return [
            [(self.i_to_ans[j], scores[i, j]) for j in row]
            for i, row in enumerate(guess_indices)
        ]


# Create and Train the Model

In [82]:
# Build the classifier and fit it on the labelled training documents.
model = TFIDF()
model.train(TRAINING_SET)
# Display the learned vocabulary: each token mapped to its column index.
model.tfidf_vectorizer.vocabulary_

{'bat': 2,
 'ball': 0,
 'glove': 6,
 'pitch': 11,
 'base': 1,
 'foul': 5,
 'pole': 12,
 'infield': 8,
 'inning': 9,
 'outfield': 10,
 'football': 4,
 'field': 3,
 'touchdown': 15,
 'volleyball': 16,
 'setter': 13,
 'hitter': 7,
 'spike': 14}

### What is  `model.tfidf_matrix`?  

It's a sparse matrix with one row for each training document and one column for each word in the vocabulary.  Each entry is that word's `term frequency * inverse document frequency` weight in that document.  `TfidfVectorizer` L2-normalizes each row by default, so every row has unit length.

In [83]:
# (number of training documents, vocabulary size)
model.tfidf_matrix.shape

(3, 17)

In [84]:
# Convert the sparse matrix to dense form so every TF-IDF weight is shown.
model.tfidf_matrix.todense()

matrix([[0.19316423, 0.32705548, 0.32705548, 0.        , 0.        ,
         0.32705548, 0.32705548, 0.        , 0.32705548, 0.32705548,
         0.32705548, 0.32705548, 0.32705548, 0.        , 0.        ,
         0.        , 0.        ],
        [0.32274454, 0.        , 0.        , 0.54645401, 0.54645401,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.54645401, 0.        ],
        [0.28321692, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.47952794, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.47952794, 0.47952794,
         0.        , 0.47952794]])

# Test the Model

In [85]:
# Two unseen sentences to classify.
TEST_SET = [
    "the baseball player sat in the outfield with the ball in his glove between innings",
    "the redskins won the game with a field goal in the final minutes"
]
# With the default max_n_guesses=2, each sentence gets up to two
# (label, score) pairs, best first.
model.guess(TEST_SET)

[[('Baseball', 0.501241453322004), ('Football', 0.1243765894759943)],
 [('Football', 0.546454011634009), ('Baseball', 0.0)]]

### What did it mean to transform the test set?

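This just creates a matrix where each row is an input sentence and the columns represent the entire training vocabulary.  Words that never appeared in the training documents (e.g. "baseball" or "redskins") have no column, so they are simply ignored.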

In [87]:
# Vectorize the test sentences with the vocabulary learned during training.
test_set_transformed = model.tfidf_vectorizer.transform(TEST_SET)
# (number of test sentences, vocabulary size)
print(test_set_transformed.shape)
test_set_transformed.todense()

(2, 17)


matrix([[0.38537163, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.65249088, 0.        , 0.        , 0.        ,
         0.65249088, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        ]])

### How do we actually classify the input?

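We take the dot product of each training document (each of which represents a class) with the vectorized input.  Because both sets of vectors are L2-normalized, each dot product is the cosine similarity between a test sentence and a training document.  Then we can argmax over each row to find the index of the highest score.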

In [97]:
# After the transposes, each row is a test sentence and each column is a
# training document (class).
logits = model.tfidf_matrix.dot(test_set_transformed.T).T
logits.todense()

matrix([[0.50124145, 0.12437659, 0.10914377],
        [0.        , 0.54645401, 0.        ]])

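As in the classifier's `guess` method, we can argsort the values in each row to rank the classes.  Note that `argsort` is ascending, so here the best guess is in the last column; `guess` negates the scores first so that the best guess comes first.  This is how one can get the top `n` guesses.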

In [98]:
# argsort is ascending: the last index in each row is the highest-scoring class.
logits.toarray().argsort(axis=1)

array([[2, 1, 0],
       [0, 2, 1]])

Use `np.argmax` to find the index of the highest value in each row and then use `model.i_to_ans` to look up the corresponding class label.

In [123]:
# Densify first so np.argmax returns a plain 1-D array holding one class index
# per test sentence (on the sparse matrix it returns a 2-D np.matrix instead).
for index in np.argmax(logits.toarray(), axis=1).tolist():
    print(model.i_to_ans[index])

Baseball
Football
