In [1]:
import pickle
import glob
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

import tensorflow as tf
from helpers import *

## New examples

In [2]:
import pandas as pd
from IPython.display import Image, HTML


def _read_text(path):
    """Return the whole content of the text file at `path`, closing the handle afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


# NOTE(review): glob.glob() returns files in filesystem order, which is not guaranteed.
# The hand-written titles/categories below only line up with the images and reviews if
# the listing order matches; the image listing is reversed as in the original cell.
# Verify the alignment (or switch to explicit file names) when the data directory changes.
# The src attribute is quoted so the trailing "/" of the tag is not parsed as part of the URL.
images = ['<img src="' + str(path) + '"/>' for path in reversed(glob.glob('../data/*.jpg'))]
text = [_read_text(path) for path in glob.glob('../data/*.txt')]
title = ['Blue Raspberry Rock Candy Crystals (1 Pound Bag)',
         'Cinnamon toothpicks',
         'Magno shower gel 700 ml']

# Column order is fixed through `columns=` (DataFrame.reindex_axis no longer exists in
# current pandas releases).
df = pd.DataFrame({'Image': images,
                   'Title': title,
                   'Cat1': ['grocery gourmet food', 'health personal care', 'beauty'],
                   'Cat2': ['candy chocolate', 'personal care', 'bath body'],
                   'Text': text},
                  columns=['Image', 'Title', 'Cat1', 'Cat2', 'Text'])

# Disable column-width truncation so the full review text and <img> tags are rendered.
# Newer pandas spells "unlimited" as None, older releases only accept -1.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# escape=False lets the <img> tags be rendered as HTML instead of being shown as text.
HTML(df.to_html(escape=False))

Unnamed: 0,Image,Title,Cat1,Cat2,Text
0,,Blue Raspberry Rock Candy Crystals (1 Pound Bag),grocery gourmet food,candy chocolate,"Good product. Too good. The reason two stars are missing from my rating is because I sold an ounce to a guy named Tat and now he won't leave me alone. He shows up outside my apartment, at my school, and even when I'm out with my family. He's always begging me for ""the good stuff."" Seeing how often I'm having to re-up, I think I'll try to cut a deal with the Superior Nut Company for wholesale shipments. They can ship it to me through the Greeks down at the docks.\n"
1,,Cinnamon toothpicks,health personal care,personal care,"WoW! Are these good and hot! That's OK because I like hot foods, peppers, etc. I am dieting and when I get hungry I get one of these. They will burn your tongue if you leave it on your tongue long. I love them and would highly recommend these over anything\n"
2,,Magno shower gel 700 ml,beauty,bath body,"I have been using this product along with the soap for years and always come back to it. If you love bath and shower gels, and enjoy an occasional bubble bath, then MAGNO is for you. It's fragrance is like no other and it's great on the skin. I've not used bath soap in years - only MAGNO\n"


## Let's transform the sentences so that predictions can be made

In [3]:
# Load the pickled data bundle into `data_dic`.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
with open('../data/prepared_data.pickle', 'rb') as data:
    data_dic = pickle.load(data)

In [4]:
# Pull the vocabulary and the training frame out of the 'full_df' entry of the loaded bundle.
vocab, train = (data_dic['full_df'][key] for key in ('vocab', 'train'))

In [5]:
def replace_words(strg, vocab=vocab):
    """Replace every token of `strg` that is not a key of `vocab` with "rareword".

    Args:
        strg: Input text; it is split on whitespace.
        vocab: Mapping whose keys are the known tokens. Defaults to the
            notebook-level `vocab` as bound when this cell is executed.

    Returns:
        The tokens joined by single spaces, with unknown tokens replaced
        by the literal "rareword".
    """
    # Test membership on the mapping itself: same result as `vocab.keys()`, but it
    # never materialises a key list (which is a linear scan per token on Python 2).
    return ' '.join(token if token in vocab else "rareword" for token in strg.split())

In [6]:
# Show the raw review texts (read from ../data/*.txt earlier) before any cleaning is applied.
text

['Good product. Too good. The reason two stars are missing from my rating is because I sold an ounce to a guy named Tat and now he won\'t leave me alone. He shows up outside my apartment, at my school, and even when I\'m out with my family. He\'s always begging me for "the good stuff." Seeing how often I\'m having to re-up, I think I\'ll try to cut a deal with the Superior Nut Company for wholesale shipments. They can ship it to me through the Greeks down at the docks.\n',
 "WoW! Are these good and hot! That's OK because I like hot foods, peppers, etc. I am dieting and when I get hungry I get one of these. They will burn your tongue if you leave it on your tongue long. I love them and would highly recommend these over anything\n",
 "I have been using this product along with the soap for years and always come back to it. If you love bath and shower gels, and enjoy an occasional bubble bath, then MAGNO is for you. It's fragrance is like no other and it's great on the skin. I've not used 

In [7]:
# Step 1: run every raw review through cleanData, then replace out-of-vocabulary
# tokens via replace_words (two separate passes, as before).
cleaned_input = [replace_words(cleaned) for cleaned in [cleanData(raw) for raw in text]]

# Step 2: bring every review to a 100-token budget. Longer reviews are cut to their
# first 100 tokens; the others go through add_placeholders(…, 100)
# (presumably pads up to 100 tokens - TODO confirm in helpers).
cleaned_input1 = [
    ' '.join(sentence.split()[:100]) if len(sentence.split()) > 100 else add_placeholders(sentence, 100)
    for sentence in cleaned_input
]

# Step 3: turn each prepared review into its vector form and stack the results into one array.
final = np.array([create_sentence_vectors(sentence, vocab) for sentence in cleaned_input1])

In [8]:
# Hand-written reviews used as additional, unlabeled test inputs.
own_text = [
    'I love its pine scent in the morning. I would stay hours under the running water',
    'Thomas loves playing with this new ball. It keeps him busy when he is alone',
    'This is awesome stuff for my summer bbq in the garden! All my friends love it!',
]

# Same preparation as for the product reviews: clean, replace unknown words,
# cut to at most 100 tokens or hand over to add_placeholders, then vectorize.
own_cleaned_input = [replace_words(cleaned) for cleaned in [cleanData(raw) for raw in own_text]]
own_cleaned_input1 = [
    ' '.join(sentence.split()[:100]) if len(sentence.split()) > 100 else add_placeholders(sentence, 100)
    for sentence in own_cleaned_input
]
own_final = np.array([create_sentence_vectors(sentence, vocab) for sentence in own_cleaned_input1])

In [9]:
# Append the hand-written examples below the product reviews.
# Only the first len(text) rows of `final` (one row per product review) are kept before
# appending, so re-running this cell does not stack `own_final` a second time.
final = np.concatenate((final[:len(text)], own_final), axis=0)

## Let's make predictions

In [10]:
# Start from an empty default graph every time this cell runs. Without the reset, a
# second execution fails in tf.get_variable (variables already exist) and the
# tf.Variable calls would create renamed duplicates that the checkpoint does not contain.
tf.reset_default_graph()

SEQ_LENGTH = 100                 # tokens per input sentence
BATCH_SIZE = final.shape[0]      # all prepared inputs are predicted in a single batch

NUM_LABELS_1 = 6                 # size of the first (category 1) output layer
NUM_LABELS_2 = 64                # size of the second (category 2) output layer

VOCAB_SIZE = len(vocab)
EMBED_SIZE = 128

# Heights (in tokens) of the three parallel convolution filters.
FILTER_SIZE_1 = 3
FILTER_SIZE_2 = 4
FILTER_SIZE_3 = 5
NUM_FILTERS = 128

## Must stay at 1.0 for prediction: with keep_prob < 1 dropout randomly drops
## activations, so the model would return a different prediction on every run.
keep_prob = 1.0
l2_reg_lambda = 0.001            # not used by the code in this cell


tf_test_dataset = tf.placeholder(tf.int32, shape=(BATCH_SIZE, SEQ_LENGTH))


# Keeping track of l2 regularization loss (optional; not used by the code in this cell)
l2_loss = tf.constant(0.0)

## Embedding layer
## Created outside the variable scope and without an explicit name, exactly as before,
## so that its variable name keeps matching the one stored in the checkpoint.
embed_weigths = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0))

## Convolutional layers
## The graph is reset at the top of the cell, so `reuse` never has to be edited by hand.
with tf.variable_scope("vars", reuse=None):

    conv1_weights = tf.get_variable("conv1_w", shape=[FILTER_SIZE_1, EMBED_SIZE, 1, NUM_FILTERS],
                                    initializer=tf.contrib.layers.xavier_initializer_conv2d())
    conv1_biases = tf.Variable(tf.constant(0.1, shape=[NUM_FILTERS]), name="conv1_b")

    conv2_weights = tf.get_variable("conv2_w", shape=[FILTER_SIZE_2, EMBED_SIZE, 1, NUM_FILTERS],
                                    initializer=tf.contrib.layers.xavier_initializer_conv2d())
    conv2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_FILTERS]), name="conv2_b")

    conv3_weights = tf.get_variable("conv3_w", shape=[FILTER_SIZE_3, EMBED_SIZE, 1, NUM_FILTERS],
                                    initializer=tf.contrib.layers.xavier_initializer_conv2d())
    conv3_biases = tf.Variable(tf.constant(0.1, shape=[NUM_FILTERS]), name="conv3_b")

    ## Fully connected layers: one output head per category level, both fed by the
    ## concatenated pooled features of the three convolutions (3*NUM_FILTERS values).

    fc1_weights = tf.get_variable("fc1_w", shape=[3*NUM_FILTERS, NUM_LABELS_1],
                                  initializer=tf.contrib.layers.xavier_initializer())
    fc1_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS_1]), name="fc1_b")

    fc2_weights = tf.get_variable("fc2_w", shape=[3*NUM_FILTERS, NUM_LABELS_2],
                                  initializer=tf.contrib.layers.xavier_initializer())
    fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS_2]), name="fc2_b")


def _conv_relu_maxpool(embedded, weights, biases, filter_size, scope_name):
    """Apply one convolution + ReLU + max-over-time pooling stage inside `scope_name`.

    input shape:  [BATCH_SIZE, SEQ_LENGTH, EMBED_SIZE, 1]
    output shape: [BATCH_SIZE, 1, 1, NUM_FILTERS]
    """
    with tf.name_scope(scope_name):
        conv = tf.nn.conv2d(embedded, weights, strides=[1, 1, 1, 1], padding="VALID")
        relu = tf.nn.relu(tf.nn.bias_add(conv, biases))
        # The pooling window spans every position the VALID convolution produced
        # (SEQ_LENGTH - filter_size + 1), leaving a single value per filter.
        return tf.nn.max_pool(relu, ksize=[1, SEQ_LENGTH - filter_size + 1, 1, 1],
                              strides=[1, 1, 1, 1], padding='VALID')


# Model:
def model(data, keep_prob = keep_prob):
    """Build the forward pass of the text CNN and return the logits of both output heads.

    Args:
        data: Integer tensor of token ids, shape [BATCH_SIZE, SEQ_LENGTH].
        keep_prob: Keep probability handed to tf.nn.dropout. Defaults to the
            notebook-level `keep_prob` as bound when this cell is executed.

    Returns:
        Tuple (logits_1, logits_2) with shapes [BATCH_SIZE, NUM_LABELS_1] and
        [BATCH_SIZE, NUM_LABELS_2].
    """

    ## Embedding layer
    ## input shape:[BATCH_SIZE, SEQ_LENGTH]
    ## output shape: [BATCH_SIZE, SEQ_LENGTH, EMBED_SIZE, 1]

    with tf.device('/cpu:0'), tf.name_scope("word2vec_embedding"):
        embed_chars = tf.nn.embedding_lookup(embed_weigths, data)
        # Trailing channel axis of size 1, as required by conv2d.
        embed_chars = tf.expand_dims(embed_chars, -1)

    ## Three parallel Conv+ReLu+max pooling branches, one per filter size.
    pooled_outputs = [
        _conv_relu_maxpool(embed_chars, conv1_weights, conv1_biases, FILTER_SIZE_1, "conv1-maxpool-3"),
        _conv_relu_maxpool(embed_chars, conv2_weights, conv2_biases, FILTER_SIZE_2, "conv2-maxpool-4"),
        _conv_relu_maxpool(embed_chars, conv3_weights, conv3_biases, FILTER_SIZE_3, "conv3-maxpool-5"),
    ]

    ## Combine pooled features + flatten
    ## input shape: 3 x [BATCH_SIZE, 1, 1, NUM_FILTERS]
    ## after concat on the filter axis: [BATCH_SIZE, 1, 1, 3*NUM_FILTERS]
    ## output shape: [BATCH_SIZE, 3*NUM_FILTERS]

    with tf.name_scope("combine_flatten"):
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, NUM_FILTERS*3])

    with tf.name_scope("dropout"):
        h_pool_flat = tf.nn.dropout(h_pool_flat, keep_prob)

    ## Fully connected layer: both heads read the same flattened features.
    with tf.name_scope("output"):
        return tf.matmul(h_pool_flat, fc1_weights) + fc1_biases,\
               tf.matmul(h_pool_flat, fc2_weights) + fc2_biases
  

# Build the forward pass a single time and derive both softmax outputs from its logits.
# Calling model() again for each softmax would add two more copies of the whole network
# to the graph without changing the result.
logits_1, logits_2 = model(tf_test_dataset)

# [category 1 probabilities, category 2 probabilities]
test_prediction = [tf.nn.softmax(logits_1), tf.nn.softmax(logits_2)]

# Saver over all variables created so far; used to restore the trained weights.
saver = tf.train.Saver()

In [11]:
# The fetches are built from the logits instead of reading `test_prediction`, because
# that name is overwritten below with the fetched numpy arrays: fetching it again on a
# second run of this cell would fail.
prediction_ops = [tf.nn.softmax(logits_1), tf.nn.softmax(logits_2)]

with tf.Session() as session:
    # Load the trained weights from the checkpoint into the freshly built graph.
    saver.restore(session, "../models/cat2_cnn_advanced.ckpt")
    # Result: [category 1 probabilities, category 2 probabilities] as numpy arrays.
    test_prediction = session.run(prediction_ops,
                                  feed_dict={tf_test_dataset: final})

INFO:tensorflow:Restoring parameters from ../models/cat2_cnn_advanced.ckpt


## Were the predictions good?

In [12]:
# Lookup table for the first category level: one row per distinct
# (cat1_num, cat1) pair of the training frame, ordered by the numeric id.
cat1_df = (
    train[["cat1_num", "cat1"]]
    .drop_duplicates()
    .sort_values(by="cat1_num")
)

# numeric id -> category name
cat1_dict = dict(zip(cat1_df["cat1_num"], cat1_df["cat1"]))

cat1_df

Unnamed: 0,cat1_num,cat1
1322085,0,baby products
672469,1,beauty
40233,2,grocery gourmet food
998811,3,health personal care
218329,4,pet supplies
448659,5,toys games


In [13]:
# Lookup table for the second category level: one row per distinct
# (cat2_num, cat2) pair of the training frame, ordered by the numeric id.
cat2_df = (
    train[["cat2_num", "cat2"]]
    .drop_duplicates()
    .sort_values(by="cat2_num")
)

# numeric id -> category name
cat2_dict = dict(zip(cat2_df["cat2_num"], cat2_df["cat2"]))

cat2_df

Unnamed: 0,cat2_num,cat2
364078,0,action toy figures
350549,1,arts crafts
1173258,2,baby child care
300981,3,baby toddler toys
14809,4,baby food
705071,5,bath body
1142460,6,bathing skin care
63840,7,beverages
144693,8,birds
89183,9,breads bakery


In [14]:
# Index of the most probable class per input row for each output head. Computed once
# up front instead of twice per loop iteration.
cat1_pred = np.argmax(test_prediction[0], 1)
cat2_pred = np.argmax(test_prediction[1], 1)

# The product reviews occupy the first len(title) rows of the prediction arrays.
for i in range(len(title)):
    print('Predicted category 1 for product %s is %.0f (%s)' %(title[i],
                                                               cat1_pred[i],
                                                               cat1_dict.get(cat1_pred[i])))

    print('Predicted category 2 for product %s is %.0f (%s)' %(title[i],
                                                               cat2_pred[i],
                                                               cat2_dict.get(cat2_pred[i])))

Predicted category 1 for product Blue Raspberry Rock Candy Crystals (1 Pound Bag) is 2 ( grocery  gourmet food)
Predicted category 2 for product Blue Raspberry Rock Candy Crystals (1 Pound Bag) is 45 (nutrition  wellness)
Predicted category 1 for product Cinnamon toothpicks is 3 ( health  personal care)
Predicted category 2 for product Cinnamon toothpicks is 34 (health care)
Predicted category 1 for product Magno shower gel 700 ml is 1 ( beauty)
Predicted category 2 for product Magno shower gel 700 ml is 5 (bath  body)


In [19]:
# Index of the most probable class per input row for each output head. Computed once
# up front instead of twice per loop iteration.
cat1_pred = np.argmax(test_prediction[0], 1)
cat2_pred = np.argmax(test_prediction[1], 1)

# The hand-written examples were appended after the product reviews, so their rows
# start right after the len(text) review rows (replaces the hard-coded offset 3).
own_offset = len(text)

for i in range(len(own_text)):
    print('Predicted category 1 for product \n \t %s \n is %.0f (%s)' %(own_text[i],
                                                               cat1_pred[i + own_offset],
                                                               cat1_dict.get(cat1_pred[i + own_offset])))

    print('Predicted category 2 for product \n \t %s \n is %.0f (%s)' %(own_text[i],
                                                               cat2_pred[i + own_offset],
                                                               cat2_dict.get(cat2_pred[i + own_offset])))

Predicted category 1 for product 
 	 I love its pine scent in the morning. I would stay hours under the running water 
 is 1 ( beauty)
Predicted category 2 for product 
 	 I love its pine scent in the morning. I would stay hours under the running water 
 is 25 (fragrance)
Predicted category 1 for product 
 	 Thomas loves playing with this new ball. It keeps him busy when he is alone 
 is 4 ( pet supplies)
Predicted category 2 for product 
 	 Thomas loves playing with this new ball. It keeps him busy when he is alone 
 is 19 (dogs)
Predicted category 1 for product 
 	 This is awesome stuff for my summer bbq in the garden! All my friends love it! 
 is 5 ( toys  games)
Predicted category 2 for product 
 	 This is awesome stuff for my summer bbq in the garden! All my friends love it! 
 is 25 (fragrance)
