In [5]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# Hide code by default in the HTML export: the injected script toggles the
# ".input_area" and ".prompt" elements only when no "body.notebook_app"
# element exists on the page.
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Add a "Toggle code" button that flips the visibility of the same two
# element classes, for use with the HTML export version.
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)


## Receipt.ID
### Hierarchical item classification
Taxonomic classification assigns items to categories from a pre-defined taxonomy. The goal is to assign one or more categories in the taxonomy to each item. It is a multi-class **and** multi-label classification problem with hierarchical relationships between the nodes of the tree.

#### Items
- Items come from a wide range of categories such as Produce, Meat, Beverage, and Supplies.
- Example item to category mapping:


|item|mapping|
|---|---|
|Kale  | "Food/Produce/Kale"  |
|Vinegar white wine 50 grain  | "Food/Dry-Grocery/Vinegars/White Wine Vinegar"  |
|Imported nat flank steak  | "Food/Meats/Beef/Flank Steak"  |


To solve this problem, I will undertake the following course of action:
1. Explore the dataset
    - Explore the dataset to ensure its integrity and understand the context. 
2. Identify features that may be used. 
    - If possible, engineer features that might provide greater discrimination.
3. Build **k** independent *text-based* classifiers for the text-based features and feed the output from these classifiers into the next layer classifier which takes in the other features. This approach combines information from both the text-based labels as well as the item’s metadata. Explore a couple of classifiers that might be well suited for the problem at hand.
    - RandomForest
    - DecisionTree
    - SVC
    - AdaBoost       

4.  Select appropriate classifier based on evaluation metric and tune it for optimality.

In this notebook I carry out steps 3 and 4.




In [7]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [8]:
# Import libraries
from __future__ import absolute_import, division, print_function

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('tools/')

import numpy as np
import pandas as pd
import json

# Graphing Libraries
import matplotlib.pyplot as pyplt
import seaborn as sns
sns.set_style("white")  

# Use cPickle if available, otherwise fall back to the standard pickle module.
# Only ImportError is caught so that unrelated failures are not hidden.
try:
    import cPickle as pickle
except ImportError:
    import pickle

In [9]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this directory exists.
dataPath = '/Users/omojumiller/mycode/insight/PlateIQ/'
# Item-level table. NOTE(review): read_pickle should only be used on trusted files.
df = pd.read_pickle(dataPath+'data/df_data_vectors.dat')
# Lookup table; later cells read its category_id and category_name columns.
df_catergory_lookup = pd.read_pickle(dataPath+'data/data_category_lookup.dat')
print ('Read in a data file with {0} datapoints'.format(len(df)))

Read in a data file with 127108 datapoints


In [10]:
# Define helper functions
from sklearn.model_selection import train_test_split

def shuffle_split_data(X, y, train_size=None, random_state=42):
    """Shuffle and split data into training and testing subsets.

    Parameters
    ----------
    X, y : features and labels, passed straight to ``train_test_split``.
    train_size : int, float or None
        Forwarded to ``train_test_split``. When None, the module-level
        ``num_train`` is used if it exists (the behaviour existing callers
        rely on); otherwise 0.75, i.e. a 75% / 25% split.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Note the order: both training pieces first, then both testing pieces.
    """
    if train_size is None:
        # Backward compatibility: the function used to read the global
        # `num_train` unconditionally and failed with NameError if it was unset.
        train_size = globals().get('num_train', 0.75)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state)

    # Return the training and testing data subsets
    return X_train, y_train, X_test, y_test

In [11]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenised item by the mean of its word vectors.

    Parameters
    ----------
    word2vec : dict-like
        Mapping from token to embedding vector. The length of the first
        vector is taken as the embedding dimension.

    Attributes
    ----------
    word2vec : the mapping passed in
    dim : int, length of one embedding vector
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # next(iter(...)) works on both Python 2 and Python 3, unlike
        # word2vec.itervalues().next().
        try:
            first_vector = next(iter(word2vec.values()))
        except StopIteration:
            raise ValueError("word2vec must contain at least one word vector")
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """Nothing to learn; returns self so the object can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return one row per entry of X: the mean vector of its known tokens.

        Entries with no token present in ``word2vec`` map to a zero vector of
        length ``dim``.
        """
        return np.array([
            # An empty list is falsy, so `or` substitutes the zero vector.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    

In [12]:
def get_categories(level, num_data = 1000):
    """Count rows per category in one taxonomy level and keep the large ones.

    Reads the module-level DataFrame ``df``.

    Parameters
    ----------
    level : str
        Name of the ``df`` column to group by.
    num_data : int
        Threshold: a category is kept only when it has strictly more than
        ``num_data`` rows.

    Returns
    -------
    the_categories : dict
        ``{int(category_id): row_count}`` for the categories that were kept.
    total : int
        Sum of the group sizes for ``level``.
    """
    # Group once and reuse the result for both the filter and the total.
    counts = df.groupby(level).size()
    total = counts.sum()

    kept = counts[counts > num_data]
    the_categories = dict((int(cat_id), n) for cat_id, n in zip(kept.index, kept.values))

    print(level, 'has', len(the_categories), 'categories with enough data.')

    # A trailing loop that looked up each category name and discarded the
    # result was removed; it had no effect on the returned values.
    return the_categories, total




## Current approach
The current approach focuses exclusively on the item's name; for example, a data point in the dataset would have an item name such as "mary's organic fryers" or "organic baby spinach." The first challenge with this approach lies in the fact that the item label is quite short, roughly two to eight words. Further, when modifiers like *gluten free*, *organic*, or *pesticide free* were in the item's label, they added a layer of misinformation, causing items like *organic milk* and *organic beer* to be classified in the same class.

## Samples
I have selected these samples to see that I can correctly separate them even though they have the modifiers in their item name


In [13]:
# Index labels in df of three items whose names contain the "organic" modifier
indices = [274, 64391, 55697] 

# Create a DataFrame of the chosen samples
samples = pd.DataFrame(df.loc[indices], columns = df.keys())
print ("Chosen samples of items:")
display(samples)

Chosen samples of items:


Unnamed: 0,mapped_category,item_id,price_stddev,primary_unit,price_mean,category,vendor_id,item_name,level_0,level_1,...,catg_986,catg_989,catg_998,catg_999,catg_1002,catg_495,catg_499,catg_1015,catg_1019,catg_1021
274,"[Food, Produce, Spinach]",1616,0.539695,1,12.6064,"[1, 9, 91]",14,organic baby spinach,1,9.0,...,0.258241,0.146029,0.277557,0.010304,0.280883,0.225076,0.314643,,,
64391,"[Beverages, Alcoholic, Beers]",508958,,4,165.0,"[3, 6, 16]",248,eel river organic 15.5,3,6.0,...,0.245825,,,,,0.248294,0.338635,0.339625,0.324124,0.281059
55697,"[Food, Meats, Beef, Beef Tongue]",455407,0.0151186,3,2.98429,"[1, 7, 70, 1675]",4218,mary's organic fryers,1,7.0,...,0.334703,0.174655,0.284945,0.249411,0.222272,0.27244,0.318668,,,


## My Approach
I took an entirely different approach. I got inspiration from the approach that Google, [YouTube](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36411.pdf) used in organizing videos and decided to shift the unit of analysis from item *name* to the *categories* themselves. My approach combines information from both the text-based labels as well as the item's metadata.

This method achieves two crucial things. First, by focusing on individual categories, each time a new category of item is added to the restaurant domain, instead of having to retrain the classifier on the entire dataset, all we have to do is gather enough data for that category, and train a classifier for it. This way, the approach can scale beautifully as the taxonomy grows. Second, moving the unit of analysis from text labels to categories, it becomes easier to correctly separate "organic cream" and "organic beer." 

I chose to discard categories that had 300 or fewer datapoints. As the number of datapoints in a category grows, that category can then be trained individually.

In [14]:
# Retained categories per taxonomy level, keyed by the level's column name
the_level = {}
# Size threshold handed to get_categories for every level
num_data = 300

level = 'level_0'
the_level[level], total = get_categories(level, num_data)

level_0 has 6 categories with enough data.


In [15]:
# Same selection for the level_1 column
level = 'level_1'
the_level[level], total = get_categories(level, num_data)

level_1 has 19 categories with enough data.


In [16]:
level = 'level_2'
# Copy this level's ids into the working 'label' column. The level_3 and
# level_4 cells repeat this assignment, so only the last one run persists.
df.loc[:,('label')] = df.loc[:, level]
the_level[level], total = get_categories(level, num_data)

level_2 has 65 categories with enough data.


In [17]:
level = 'level_3'
# Overwrites the 'label' column set by the level_2 cell with level_3 ids.
df.loc[:,('label')] = df.loc[:, level]
the_level[level], total = get_categories(level, num_data)

level_3 has 42 categories with enough data.


In [18]:
level = 'level_4'
# Final overwrite: 'label' now holds level_4 ids; the labeling cells below
# fill its missing values from the other level columns.
df.loc[:,('label')] = df.loc[:, level]
the_level[level], total = get_categories(level, num_data)

level_4 has 9 categories with enough data.


In [19]:
# Total number of retained categories across all taxonomy levels
k_categories = sum(len(categories) for categories in the_level.values())

print('processed {} categories in all'.format(k_categories))

processed 141 categories in all


### Labeling
For each data point, I assigned the deepest class node in the tree to which it belonged as its label. For example, an item named "bacon ends" belonged to the classes [Food], [Food, Meats] and [Food, Meats, Pork]. For such an item, I assigned its label as "Pork."

In [20]:
# Rows that still have no label take their level_4 id.
# .loc with a boolean mask replaces the deprecated .ix indexer.
null_examples = df['label'].isnull()
df.loc[null_examples, 'label'] = df.loc[null_examples, 'level_4']

In [21]:
# Rows that still have no label take their level_3 id.
# .loc with a boolean mask replaces the deprecated .ix indexer.
null_examples = df['label'].isnull()
df.loc[null_examples, 'label'] = df.loc[null_examples, 'level_3']

In [22]:
# Rows that still have no label take their level_2 id.
# .loc with a boolean mask replaces the deprecated .ix indexer.
null_examples = df['label'].isnull()
df.loc[null_examples, 'label'] = df.loc[null_examples, 'level_2']

In [23]:
# Rows that still have no label take their level_1 id.
# .loc with a boolean mask replaces the deprecated .ix indexer.
null_examples = df['label'].isnull()
df.loc[null_examples, 'label'] = df.loc[null_examples, 'level_1']

In [24]:
# Last fallback: rows that still have no label take their level_0 id.
# .loc with a boolean mask replaces the deprecated .ix indexer.
null_examples_ = df['label'].isnull()
df.loc[null_examples_, 'label'] = df.loc[null_examples_, 'level_0']

### Retrieve the *k* training sets for the *k* categories for each level in the tree   
The aim of moving to a category-based solution is to embed knowledge of the taxonomy into the classifiers. To do this, I had to figure out how to get positive and negative samples for each category. For every category node, I decided that the node itself, as well as all its descendants, were *positive* samples for that class. All other nodes that were neither the category's ancestor(s) nor the category itself were set as *negative* samples. The figure below gives a visual explanation of how a category's training set is selected. I did this for each category node in the tree that had enough data.
<img src="images/hc_5.png" alt="Drawing" style="width: 450px;"/>

One of the limitations of this method is that most nodes have unbalanced classes. Some are more severe than others, especially further down the tree. There are some classes whose ratio of positive signals is as small as 0.02%. The more granular the subclasses get, the harder it is to classify them. The good news is that one can decide to focus on a few of these classes and use synthetic methods to rebalance the dataset.

In [26]:
# Create a dict of positive and negative samples for the categories
# All four dicts are keyed by category id and are filled by the cells below.
category_samples = {}            # every df row belonging to the category
category_positive_samples = {}   # category rows kept for training
category_negative_samples= {}    # rows selected as negatives for the category
category_test_samples = {}       # leading category rows held out from training

In [27]:
level = 'level_0'
for key in the_level[level].keys():
    # Resolve the category's display name for the progress message
    find_name = df_catergory_lookup.category_id == key
    name = df_catergory_lookup[find_name].category_name
    the_index = name.index[0]

    # .loc replaces the deprecated .ix; the_index is a label of `name`
    print ("processing {0}: {1}".format(key, name.loc[the_index]))

    # create POSITIVE training set: every row of this category, ordered with
    # the rows that have no level_1 first, then the rows that do
    key_df = df[level] == key
    null_examples = df['level_1'].isnull()
    positive_examples = df['level_1'].notnull()

    try:
        category_samples[key] = pd.concat([df[key_df & null_examples],
                                           df[key_df & positive_examples]])
        # The leading ~20% of rows (in the order above, not shuffled) are held out
        num_test = (len(category_samples[key]) // 10) * 2
        category_positive_samples[key] = category_samples[key][num_test:]
        category_test_samples[key] = category_samples[key][:num_test]
    except Exception as err:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate
        print("This category didn't generate any feature set {0}".format(key))
        print(err)
        continue

    # create NEGATIVE training set: every row outside this level_0 category
    level_0_item = df['level_0'] != key

    try:
        category_negative_samples[key] = df[level_0_item]
    except Exception as err:
        print("This category didn't generate any label set {0}".format(key))
        print(err)
        continue

print("Finished Processing")




processing 1: Food
processing 2: Supplies
processing 3: Beverages
processing 4: Other
processing 495: Grocery
processing 499: Protein
Finished Processing


In [28]:
level = 'level_2'
for key in the_level[level].keys():
    # Resolve the category's display name for the progress message
    find_name = df_catergory_lookup.category_id == key
    name = df_catergory_lookup[find_name].category_name
    the_index = name.index[0]

    # .loc replaces the deprecated .ix; the_index is a label of `name`
    print ("processing {0}: {1}".format(key, name.loc[the_index]))

    # create POSITIVE training set: every row of this category, ordered with
    # the rows that have no level_3 first, then the rows that do
    key_df = df[level] == key
    null_examples = df['level_3'].isnull()
    positive_examples = df['level_3'].notnull()

    try:
        category_samples[key] = pd.concat([df[key_df & null_examples],
                                           df[key_df & positive_examples]])
        # The leading ~20% of rows (in the order above, not shuffled) are held out
        num_test = (len(category_samples[key]) // 10) * 2
        category_positive_samples[key] = category_samples[key][num_test:]
        category_test_samples[key] = category_samples[key][:num_test]
    except Exception as err:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate
        print("This category didn't generate any feature set {0}".format(key))
        print(err)
        continue

    # create NEGATIVE training set
    # Ancestor ids are read from the first row that belongs to this category.
    first_match = df[key_df].iloc[0]

    level_0_item = df['level_0'] != first_match['level_0']
    level_1_item = df['level_1'] != first_match['level_1']
    level_2_item = df['level_2'] != key

    # A row is a negative only when it differs from this category at every level
    try:
        category_negative_samples[key] = df[level_0_item & level_1_item & level_2_item]
    except Exception as err:
        print("This category didn't generate any label set {0}".format(key))
        print(err)
        continue

print("Finished Processing")



processing 1281: SF Checkout Bag Fee
processing 130: Cream
processing 375: Sausages
processing 132: Tomatoes
processing 134: Peppers
processing 391: Salt
processing 392: Disposables & Packaging Supplies
processing 267: Glasses
processing 258: Keg Deposit
processing 143: Tortillas
processing 16: Beers
processing 145: Cucumbers
processing 18: Wines
processing 531: Condiments
processing 535: Fruits
processing 152: Flour & Starch
processing 537: Herbs
processing 26: Juices
processing 157: Garlic
processing 1030: Seeds
processing 40: Teas
processing 44: Beans
processing 482: Linens
processing 177: Onions
processing 51: Fuel & Freight/Delivery
processing 1077: Breads
processing 54: Liquor
processing 55: Oils
processing 56: Pork
processing 1730: Leaves
processing 325: Eggs
processing 70: Beef
processing 456: Sodas
processing 329: Poultry
processing 74: Fish
processing 331: Coffee
processing 76: Spices
processing 226: Vinegars
processing 78: Cheese
processing 464: Canned
processing 1272: Squas

In [29]:
level = 'level_3'
for key in the_level[level].keys():
    # Resolve the category's display name for the progress message
    find_name = df_catergory_lookup.category_id == key
    name = df_catergory_lookup[find_name].category_name
    the_index = name.index[0]

    # .loc replaces the deprecated .ix; the_index is a label of `name`
    print ("processing {0}: {1}".format(key, name.loc[the_index]))

    # create POSITIVE training set: every row of this category, ordered with
    # the rows that have no level_4 first, then the rows that do
    key_df = df[level] == key
    null_examples = df['level_4'].isnull()
    positive_examples = df['level_4'].notnull()

    try:
        category_samples[key] = pd.concat([df[key_df & null_examples],
                                           df[key_df & positive_examples]])
        # The leading ~20% of rows (in the order above, not shuffled) are held out
        num_test = (len(category_samples[key]) // 10) * 2
        category_positive_samples[key] = category_samples[key][num_test:]
        category_test_samples[key] = category_samples[key][:num_test]
    except Exception as err:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate
        print("This category didn't generate any feature set {0}".format(key))
        print(err)
        continue

    # create NEGATIVE training set
    # Ancestor ids are read from the first row that belongs to this category.
    first_match = df[key_df].iloc[0]

    level_0_item = df['level_0'] != first_match['level_0']
    level_1_item = df['level_1'] != first_match['level_1']
    # Fixed: the level_2 ancestor used to be read from the level_1 column,
    # so df['level_2'] was compared against a level_1 id.
    level_2_item = df['level_2'] != first_match['level_2']
    level_3_item = df['level_3'] != key

    # A row is a negative only when it differs from this category at every level
    try:
        category_negative_samples[key] = df[level_0_item & level_1_item & level_2_item & level_3_item]
    except Exception as err:
        print("This category didn't generate any label set {0}".format(key))
        print(err)
        continue

print("Finished Processing")



processing 387: Oysters
processing 136: Cilantro
processing 1033: Ground Black Pepper
processing 1802: Baby Carrot
processing 1239: Lids
processing 275: Containers
processing 148: Olives
processing 25: Vodka
processing 282: Uniforms
processing 286: Bags
processing 32: Gin
processing 289: Cups
processing 164: Mints
processing 39: Tequila
processing 176: Oranges
processing 1160: Red Wines
processing 50: Iced Tea
processing 52: Whiskey
processing 437: Bacon
processing 1209: Gloves
processing 319: Red Onions
processing 963: Mozzarella
processing 69: Chicken
processing 970: Extra Virgin Olive Oil
processing 1200: Shrimp
processing 1358: Rice
processing 80: Apples
processing 355: Duck
processing 980: Dried Fruits
processing 983: Cheddar
processing 88: Avocados
processing 1113: Kegs
processing 95: Basil
processing 99: Bell Peppers
processing 229: Sauces
processing 998: White Wines
processing 363: Rum
processing 403: Purees
processing 247: Zucchini Squash
processing 1019: Salmon
processing 114

In [30]:
level = 'level_4'
for key in the_level[level].keys():
    # Resolve the category's display name for the progress message
    find_name = df_catergory_lookup.category_id == key
    name = df_catergory_lookup[find_name].category_name
    the_index = name.index[0]

    # .loc replaces the deprecated .ix; the_index is a label of `name`
    print ("processing {0}: {1}".format(key, name.loc[the_index]))

    # create POSITIVE training set: every row of this category, ordered with
    # the rows that have no level_5 first, then the rows that do
    key_df = df[level] == key
    null_examples = df['level_5'].isnull()
    positive_examples = df['level_5'].notnull()

    try:
        category_samples[key] = pd.concat([df[key_df & null_examples],
                                           df[key_df & positive_examples]])
        # The leading ~20% of rows (in the order above, not shuffled) are held out
        num_test = (len(category_samples[key]) // 10) * 2
        category_positive_samples[key] = category_samples[key][num_test:]
        category_test_samples[key] = category_samples[key][:num_test]
    except Exception as err:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate
        print("This category didn't generate any feature set {0}".format(key))
        print(err)
        continue

    # create NEGATIVE training set
    # Ancestor ids are read from the first row that belongs to this category.
    first_match = df[key_df].iloc[0]

    level_0_item = df['level_0'] != first_match['level_0']
    level_1_item = df['level_1'] != first_match['level_1']
    # Fixed: the level_2 and level_3 ancestors used to be read from the
    # level_1 column, so both comparisons were made against a level_1 id.
    level_2_item = df['level_2'] != first_match['level_2']
    level_3_item = df['level_3'] != first_match['level_3']
    level_4_item = df['level_4'] != key

    # A row is a negative only when it differs from this category at every level
    try:
        category_negative_samples[key] = df[level_0_item & level_1_item & level_2_item & level_3_item & level_4_item]
    except Exception as err:
        print("This category didn't generate any label set {0}".format(key))
        print(err)
        continue

print("Finished Processing")



processing 35: Bourbon
processing 999: Cabernet Sauvignon
processing 1064: Chardonnay
processing 1002: Sauvignon Blanc
processing 1164: Brut
processing 49: Rye
processing 21: Rose Wine
processing 1015: Chicken Breast
processing 989: Pinot Noir
Finished Processing


## Modeling

With the training sets done, I vectorized them using a mean embedding vectorizer. For each category, I created an AdaBoost binary classifier. I chose to use the AdaBoost after I experimented with several classifiers like LinearSVM and Decision Trees. I first selected Decision Tree since it outperformed the LinearSVM for this problem. However, I quickly realized my aim was to get predicted probabilities as output for the class instead of simple binary output which the Decision Tree gave. For that reason, I switched to AdaBoost, which is a *boosted* decision tree. 

I train each classifier using a k=5 kfold cross-validation scheme. I fitted the resulting classifier and retrieved the predicted probability for each data point in the dataset, which resulted in *k* vectors for *k* categories.

#### Process item label names using Word2Vec

In [31]:
from gensim.models.word2vec import Word2Vec

# Only use item labels as input into word2vec embeddings
# train word2vec on all the item_labels 
# NOTE(review): presumably each df['item_labels'] entry is a list of tokens - confirm.
# NOTE(review): no seed is passed and workers=4, so the embeddings may differ
# between runs - confirm whether that matters downstream.

w2v_model = Word2Vec(df['item_labels'], size=750, window=5, min_count=5, workers=4)
# Plain token -> vector mapping consumed by MeanEmbeddingVectorizer
w2v = dict(zip(w2v_model.index2word, w2v_model.syn0))


In [32]:
# NOTE(review): replace=True presumably normalises the stored vectors in place.
# `w2v` was built from w2v_model.syn0 before this call - confirm whether its
# values are views that change here.
w2v_model.init_sims(replace=True) #to trim unneeded model memory = use (much) less RAM.

#### Train *k* classifiers

In [33]:
from time import time, gmtime, strftime
from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import RobustScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

seed = 342 # For reproducibility
# Per-category text classifier: mean word2vec embedding followed by AdaBoost.
# The AdaBoost step now receives the seed, matching the seeded baselines below.
Ada_w2v =  Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), ("Ada", AdaBoostClassifier(n_estimators=10, random_state=seed))])


In [36]:
# Reload cached per-category scores. The training cell writes this file with
# joblib.dump, so it must be read with joblib.load; json.load always failed
# here and the bare except silently reset the cache to {}.
# Categories found in `scores` are skipped by the training loop.
try:
    scores = joblib.load(dataPath+'data/df_category_scores.pkl')
except Exception as err:
    # Best effort: a missing or unreadable cache just means training from scratch
    print("No cached scores loaded: {0}".format(err))
    scores = {}

In [37]:
# Reload cached per-category predicted probabilities, if any.
# Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
try:
    pred_proba = joblib.load(dataPath+'data/df_category_pred_proba.pkl')
except Exception:
    pred_proba = {}

successfully opened the pickle file


In [34]:
t0 = time()

# One log file per run, named with the UTC start time. The former
# `if pred_proba <> {}` test had two identical branches and used an operator
# that exists only in Python 2, so it was removed.
log_path = dataPath+'data/train_log_'+strftime("%d_%b_%Y_%H_%M_%S", gmtime())+'.txt'

# The with-block closes the log even when training raises part-way through.
with open(log_path, 'w') as target:
    for key in category_positive_samples:
        # Skip categories that already have a score (string keys are also checked)
        if str(key) in scores or key in scores:
            continue

        find_name = df_catergory_lookup.category_id == key
        name = df_catergory_lookup[find_name].category_name
        the_index = name.index[0]
        category_name = name.loc[the_index]

        print ("processing {0}: {1}".format(key, category_name))
        target.write("processing {0}: {1}\n".format(key, category_name))
        # Binary target, written onto the stored frames; the retraining cells
        # further down read this 'is_category' column.
        category_positive_samples[key].loc[:,('is_category')] = 1
        category_negative_samples[key].loc[:,('is_category')] = 0

        # Positives first, then negatives, with a fresh 0..n-1 index
        data = pd.concat([category_positive_samples[key], category_negative_samples[key]],
                         ignore_index=True)
        X = data['item_labels']
        y = data['is_category']
        # Fixed: the ratio was printed as a fraction next to a '%' sign
        target.write('{:5.3f}% positive samples\n'.format(100.0 * len(category_positive_samples[key])/len(X)))
        target.write("Number of datapoints in set {}\n".format(len(X)))
        target.write('Training the model....\n')

        scores[key] = cross_val_score(Ada_w2v,  X, y, cv=5, scoring='f1').mean()
        Ada_w2v.fit(X, y)
        pred_proba[key] = Ada_w2v.predict_proba(X)

        # persist model
        # NOTE(review): this path is relative, unlike the dataPath-based paths
        # used elsewhere - confirm that is intended.
        joblib.dump(Ada_w2v, 'data/category_classifier/catg_'+str(key)+'.pkl')
        print('Finished training category{0}: {1}\n'.format(key, category_name))
        target.write('Finished training category{0}: {1}\n\n'.format(key, category_name))

print("done in %0.3fs" % (time() - t0))
joblib.dump(scores, dataPath+'data/df_category_scores.pkl')



NameError: name 'pred_proba' is not defined

## Get *k* vectors

In [44]:
for key in category_positive_samples:
    column = 'catg_'+str(key)
    # Rebuild the rows in the order used for training (positives, then
    # negatives) so positions line up with pred_proba[key]
    df_ = pd.concat([category_positive_samples[key], category_negative_samples[key]])
    # Keep the second predict_proba column (formerly named 'catg_<key>');
    # the first one (formerly 'Neg') is discarded
    scores_ = pd.Series(np.asarray(pred_proba[key])[:, 1], index=df_.index, name=column)
    # .loc replaces the deprecated .ix indexer
    df.loc[df_.index, column] = scores_

### Write vectors to file

In [53]:
# NOTE(review): this overwrites the same file the notebook loads df from at
# the top, so a later run starts from the already-augmented frame.
df.to_pickle(dataPath+'data/df_data_vectors.dat')
joblib.dump(pred_proba,dataPath+'data/df_category_pred_proba.pkl')

### Final Model Architecture
<figure>
  <img src="images/hc_4.png" style="width: 450px;">
</figure>
After training the *k=141* classifiers, I extracted the *k* vectors; I carefully combined them with the engineered features and the other raw metadata, taking care to ensure that I assigned the right probabilities to the right data points in my training set. I then compared several candidate classifiers, including AdaBoost, on these features and labels.

I feed these features and labels into a multi-label, multi-class Random Forest classifier. I took 80% of the data for training and 20% for testing. Once again I use a 5-fold cross-validation scheme, fit the final classifier, and retrieve the predicted classes.


#### Get Data for Final Classifier

In [54]:
# category id -> index labels of the df rows whose label is that category
the_indices = {}

In [55]:
# For each trained category, remember which df rows carry it as their label
for key in category_positive_samples:
    the_indices[key] = df[df['label'] == key].index

In [56]:
# Empty frame that the next cell fills with the rows of every trained category
X = pd.DataFrame()

In [57]:
# Gather the rows of every category with a single concat instead of growing X
# through repeated append calls, each of which copied everything collected so far.
X = pd.concat([X] + [df.loc[the_indices[key]] for key in the_indices])

In [58]:
# Raw text, list-valued and taxonomy-level columns are removed from the
# feature table before the final classifier is trained.
columns_to_drop = [
    u'mapped_category', u'category', u'item_name',
    u'level_0', u'level_1', u'level_2', u'level_3',
    u'level_4', u'level_5', u'level_6',
    u'mapped_level_0', u'mapped_level_1', u'mapped_level_2',
    u'mapped_level_3', u'mapped_level_4', u'mapped_level_5',
    u'mapped_level_6', u'item_labels',
]
X.drop(columns_to_drop, axis = 1, inplace=True)

In [134]:

# Show the last rows of the assembled feature table
X.tail()

Unnamed: 0,item_id,price_stddev,primary_unit,price_mean,vendor_id,branch_lenght,item_name_match,catg_256,catg_1,catg_2,...,catg_986,catg_989,catg_998,catg_999,catg_1002,catg_495,catg_499,catg_1015,catg_1019,catg_1021
125487,602317,1.37651e-07,3,7.99,5966,4,0.0,0.346282,0.56559,0.3602,...,0.232862,0.2473,0.24372,0.255702,0.191225,0.361658,0.509521,0.0,0.0,0.45604
125878,609095,0.0,4,16.81,7491,4,1.0,0.356767,0.557353,0.3602,...,0.283522,0.2473,0.223166,0.007575,0.191225,0.334251,0.37906,0.0,0.0,0.551332
125955,610507,0.0,1,18.99,5966,4,1.0,0.346282,0.56559,0.3602,...,0.223676,0.177823,0.24372,0.221922,0.191225,0.361658,0.471559,0.0,0.0,0.50135
126064,612597,0.0,3,8.25,12059,4,0.0,0.386763,0.541789,0.377839,...,0.262988,0.284572,0.350169,0.221922,0.214968,0.334251,0.415429,0.0,0.0,0.463727
127054,512281,0.0,18,23.5,1246,4,0.0,0.386763,0.541789,0.377839,...,0.262988,0.284572,0.350169,0.221922,0.293972,0.334251,0.347207,0.0,0.0,0.463727


#### Drop all rows whose label is NaN

In [62]:
# Keep only labelled rows, then replace every remaining missing value with 0
X = X.dropna(subset=['label']).fillna(0)

In [63]:
# Take the target column out of the feature table in a single step
y = X.pop('label')

In [82]:
print("Number of datapoints in set {}".format(len(X)))

# Number of training rows: 80% of the data; the remaining 20% is the test set.
# shuffle_split_data picks this value up from the module-level `num_train`.
num_train = int(len(y) * .80)

# Split the data into training and testing sets
try:
    X_train, y_train, X_test, y_test = shuffle_split_data(X, y)
    print ("Successfully shuffled and split the data!")
except Exception:
    print ("Something went wrong with shuffling and splitting the data.")
    # Re-raise: swallowing the error left X_train undefined and made the
    # following cells fail with an unrelated NameError.
    raise
    

Number of datapoints in set 32788
Successfully shuffled and split the data!


## Train final classifier

In [115]:
# Candidate final classifiers, all seeded for repeatable comparisons
models = {
          'DecisionTree': DecisionTreeClassifier(random_state=seed),
          'SVC': LinearSVC(random_state=seed),
          'RandomForest': RandomForestClassifier(random_state=seed),
          'AdaBoost': AdaBoostClassifier(n_estimators=10, random_state=seed)
         }

# NOTE(review): the scaler is fitted on all of X_train before cross-validation,
# so each validation fold has contributed to the scaling statistics.
scaler = RobustScaler()
X_transform = scaler.fit_transform(X_train)

print('CLASSIFICATION RESULTS OF BASELINE CLASSIFIERS\n')
print('{:20}{:^15}{:^10}{:>10}'.format('CLASSIFIER', 'MEAN SCORE %', 'STD DEV %', 'TIME'))


# .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only)
for clf_name, clf in models.items():
    t0 = time()
    results = cross_val_score(clf, X_transform, y_train, cv=5)
    t1 = time() - t0
    print('{:20}{:^15.2f}{:^10.2f}{:>10.2f}secs'.format(clf_name, results.mean()*100, results.std()*100, t1))
    

CLASSIFICATION RESULTS OF BASELINE CLASSIFIERS

CLASSIFIER           MEAN SCORE %  STD DEV %    TIME   
RandomForest             99.39        0.11         7.28secs
DecisionTree             99.15        0.21        10.64secs
SVC                     100.00        0.01        93.60secs
AdaBoost                 24.91        4.38        15.99secs


In [121]:

# Fit the final classifier on the scaled training data
clf = RandomForestClassifier(random_state=seed)
clf.fit(X_transform, y_train)
# Reuse the scaler fitted on the training data for the held-out rows
X_test_X_transform = scaler.transform(X_test)
final_preds = clf.predict(X_test_X_transform)
# `score` is precision_recall_fscore_support; the returned arrays are averaged below
precision, recall, fbeta_score, support = score(y_test, final_preds)


# NOTE(review): the value printed after "Support:" is len(support), i.e. the
# length of the support array, not a sum of supports.
print ("Precision: {:10.3f}\nRecall: {:^10.3f}\nF Score{:^10.3f}\nSupport:{:^10.3f}".format(precision.mean()*100, 
                                                 recall.mean()*100, fbeta_score.mean()*100, len(support)))


Precision:          96.943
Recall:   95.149  
F Score  95.761  
Support: 118.000  


In [143]:
# Confusion matrix of true vs. predicted labels on the test set.
# NOTE(review): C is not used by any later cell shown here.
C = confusion_matrix(y_test, final_preds)

# Result
With the understanding that chance is 1/118 * 100 = 0.8%, my approach achieves:    


|Metric|Value|
|---|---|
|Precision:| 96.943  |
|Recall:| 95.149 |
|F Score: | 95.761  |
|Classes:  |  118  |


In [135]:
# NOTE(review): this loop looks up each category name but never stores or
# prints it; its only lasting effect is the final values of find_name, name
# and the_index. It looks like leftover code - confirm before removing.
for key in category_positive_samples:
        find_name = df_catergory_lookup.category_id == key
        name = df_catergory_lookup[find_name].category_name
        the_index = name.index[0]

        

In [136]:
# Start from the true test labels, then attach predictions and item context
outX = pd.DataFrame(index=y_test.index).join(y_test)
outX['predicted'] = final_preds
outX['predicted_name'] = ''

# Context columns copied from df: the item name, then for each depth 0..5
# the mapped level followed by the level id
context_columns = ['item_name']
for depth in range(6):
    context_columns.append('mapped_level_{}'.format(depth))
    context_columns.append('level_{}'.format(depth))

for column in context_columns:
    outX[column] = df.loc[outX.index, column]

In [137]:
# Build an id -> name table once instead of scanning the lookup frame and
# writing through the deprecated .ix indexer for every prediction.
name_by_id = {}
for cat_id, cat_name in zip(df_catergory_lookup['category_id'],
                            df_catergory_lookup['category_name']):
    # setdefault keeps the first name when an id occurs more than once,
    # matching the former `.values[0]`
    name_by_id.setdefault(cat_id, cat_name)

# final_preds has one entry per outX row, in the same order
outX['predicted_name'] = [name_by_id[i] for i in final_preds]

In [142]:
#score(y_test, final_preds)
#outX.head(3)

In [None]:
# Given a category find ancestors


#### Retrain classifier for category k = 235, chocolate
The value of this method is that I can take a look at the results, determine which individual text-based classifiers have poor results, and optimize and tune that specific classifier. In the case of the chocolate classifier, I see that the minority class is minuscule. To deal with this great imbalance in the data, I synthetically resample it and retrain the classifier.

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE 

# Category id being retrained (chocolate, per the section heading)
key = 235

# Positives then negatives, re-indexed 0..n-1.
# The 'is_category' column read below is written by the training loop, so that
# loop must have processed this key first.
data = category_positive_samples[key].append(category_negative_samples[key], ignore_index=True)
X_235 = data['item_labels']
y_235 = data['is_category']
        
# MeanEmbeddingVectorizer.fit learns nothing; the call is kept for symmetry
vectorizer = MeanEmbeddingVectorizer(w2v)
vectorizer.fit(X_235, y_235)  

sm = SMOTE(random_state=seed)

# Class counts before and after resampling
print('Original dataset shape {}'.format(Counter(y_235)))
X_res, y_res = sm.fit_sample(vectorizer.transform(X_235) , y_235)
print('Resampled dataset shape {}'.format(Counter(y_res)))

In [None]:
print('Training the model....')
clf = AdaBoostClassifier(n_estimators=10)
# NOTE(review): cross-validation runs on the resampled rows, so validation
# folds presumably include synthetic samples - treat this F1 as optimistic.
scores_235 = cross_val_score(clf, X_res, y_res, cv=5, scoring='f1').mean()
clf.fit(X_res, y_res)
# Probabilities for the resampled rows; their count differs from the original
# rows whenever resampling added samples
pred_proba_235 = clf.predict_proba(X_res)
print('Finished training category\n\n')

In [None]:
# Score the original (un-resampled) rows so that there is exactly one
# probability per row of the category's positive and negative samples.
X_ = vectorizer.transform(X_235)
# Fixed: clf is no longer refitted on the imbalanced X_/y_235 here. That refit
# threw away the model trained on the resampled data in the previous cell.
pred_proba_235 = clf.predict_proba(X_)

In [None]:
# Record the cross-validated F1 obtained on the resampled data for this category
scores[key] = scores_235


In [None]:
key = 235
column = 'catg_'+str(key)
# Same row order as the frame the probabilities were computed on
# (positives, then negatives)
df_ = pd.concat([category_positive_samples[key], category_negative_samples[key]])
# Keep the second predict_proba column (formerly named 'catg_<key>');
# the first one (formerly 'Neg') is discarded
scores_ = pd.Series(np.asarray(pred_proba_235)[:, 1], index=df_.index, name=column)
# .loc replaces the deprecated .ix indexer
df.loc[df_.index, column] = scores_