In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
dataset = pd.read_csv('OSM_training_data/ensemble_dataset.csv')

In [3]:
dataset.head()

cat_dict = {0:'ADULT', 1:'COUNTERFEIT', 2:'LEGIT', 3:'PHARMA', 4:'SMOKE', 5:'TMS', 6:'WEAPON'}
dataset['category_number'] = dataset['category_number'].map(cat_dict)

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
import nltk

Using TensorFlow backend.


In [5]:
data = dataset.drop_duplicates(subset='name')
data.head()

Unnamed: 0.1,Unnamed: 0,index,id,name,bodytxt,seller_id,tagtxt,price,category_number,risk,combinedtxt_noNumSWPunc_morethan2char,combinedtxt,seller_id_indices
0,0,0,58d3365f3f6f673be6804786,Fat Burners & Thermogenics BLOODSHR3D (WAR EDI...,BLOODSHR3D (WAR EDITION) Ultra Premium Fat Bur...,53665695,,282.84,LEGIT,1,Fat Burners Thermogenics BLOODSHRD WAR EDITION...,Fat Burners & Thermogenics BLOODSHR3D (WAR EDI...,[526]
1,1,1,58d33f023f6f673be68083ee,Vitamins Hylands Cell Salts #12 Silicea 30X Ta...,"Hyland's Cell Salts #12 Silicea 30X Tablets, N...",53665695,,145.88,LEGIT,1,Vitamins Hylands Cell Salts Silicea Tablets Na...,Vitamins Hylands Cell Salts #12 Silicea 30X Ta...,[526]
2,2,2,58d342283f6f673be680a84e,Vitamins Biotics Research - Detoxification 4oz,Biotics Research - Detoxification 4oz Product ...,53665695,,248.77,LEGIT,1,Vitamins Biotics Research Detoxification Bioti...,Vitamins Biotics Research - Detoxification 4oz...,[526]
3,3,3,58d39b683f6f673be680f255,HEWLETT-PACKARD C3903A Toner 4000 Page-Yield B...,HEWLETT-PACKARD C3903A Toner 4000 Page-Yield B...,53889844,,492.2,LEGIT,1,HEWLETT PACKARD Toner Page Yield Black Clear S...,HEWLETT-PACKARD C3903A Toner 4000 Page-Yield B...,[563]
4,4,4,58d4862b3f6f673be6813014,Herbal Supplements Natural Natural Blood Press...,Natural Blood Pressure Supplement: Blood Press...,53665695,,233.52,LEGIT,1,Herbal Supplements Natural Natural Blood Press...,Herbal Supplements Natural Natural Blood Press...,[526]


In [6]:
#features = ['name', 'bodytxt', 'tagtxt', 'price', 'category_number', 'combinedtxt_noNumSWPunc_morethan2char']
features = ['name', "category_number"]
df = data[features]
df.head()

Unnamed: 0,name,category_number
0,Fat Burners & Thermogenics BLOODSHR3D (WAR EDI...,LEGIT
1,Vitamins Hylands Cell Salts #12 Silicea 30X Ta...,LEGIT
2,Vitamins Biotics Research - Detoxification 4oz,LEGIT
3,HEWLETT-PACKARD C3903A Toner 4000 Page-Yield B...,LEGIT
4,Herbal Supplements Natural Natural Blood Press...,LEGIT


In [7]:
cat_dict_reverse = {'ADULT':0, 'COUNTERFEIT':1, 'LEGIT':2, 'PHARMA':3, 'SMOKE':4,'TMS':5, 'WEAPON':6}
df['category_number'] = df['category_number'].map(cat_dict_reverse)
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,name,category_number
0,Fat Burners & Thermogenics BLOODSHR3D (WAR EDI...,2
1,Vitamins Hylands Cell Salts #12 Silicea 30X Ta...,2
2,Vitamins Biotics Research - Detoxification 4oz,2
3,HEWLETT-PACKARD C3903A Toner 4000 Page-Yield B...,2
4,Herbal Supplements Natural Natural Blood Press...,2


In [8]:
docs = df['name'].values
labels = df['category_number'].values


In [9]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(labels)
encoded_Y = encoder.transform(labels)
# convert integers to dummy variables (i.e. one hot encoded)
y = np_utils.to_categorical(encoded_Y)

y

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [10]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
# pad documents to a max length of 250 words 
max_length = 250
# using Keras's built in pad_sequences 
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')


In [11]:
embeddings_index = dict()
f = open('glove.6B.100d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [12]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_docs, y, test_size = 0.2, random_state = 0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((44379, 250), (11095, 250), (44379, 7), (11095, 7))

In [14]:
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Bidirectional, RepeatVector, Permute, Multiply, Lambda
from keras.models import Model
from keras import backend as K 

"""
keras.layers.Permute(dims)
Permutes the dimensions of the input according to a given pattern.

Example
-------

model.add(Permute((2, 1), input_shape=(10, 64)))
# now: model.output_shape == (None, 64, 10)
# note: `None` is the batch dimension

"""

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'

# Reference
# ---------
# https://www.youtube.com/watch?v=oaV_Fv5DwUM&t=7s

#model = Sequential()
word_dim = 100 

seq_input = Input(shape = (max_length, ), dtype = 'int32')
e = Embedding(vocab_size, word_dim, weights=[embedding_matrix], input_length=250, trainable=False)(seq_input)
activations = Bidirectional(LSTM(764, return_sequences = True))(e)


## --- Attention Mecchanism -----
attention = Dense(1, activation = "tanh")(activations)
attention = Flatten()(attention)
attention = Activation('softmax')(attention)
attention = RepeatVector(764 * 2)(attention)
attention = Permute([2, 1])(attention)

sent_rep = Multiply()([activations, attention])
sent_rep = Lambda(lambda xin: K.sum(xin, axis = -2), output_shape = (764 * 2, ))(sent_rep)

## --- Attention Mecchanism -----

model = Dense(len(y[0]), activation = 'softmax')(sent_rep)

model = Model(inputs = seq_input, outputs = model)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 250, 100)     3529500     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 250, 1528)    5286880     embedding_1[0][0]                
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 250, 1)       1529        bidirectional_1[0][0]            
__________________________________________________________________________________________________
flatten_1 

In [15]:
from keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='val_loss', mode='min', patience = 5, verbose=1)

In [19]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred,axis=1)

In [16]:
hist = model.fit(X_train, y_train, validation_data =(X_test, y_test), epochs=50, verbose=1, callbacks = [es])


Train on 44379 samples, validate on 11095 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 00010: early stopping


In [17]:
from sklearn.metrics import f1_score, classification_report

y_pred = model.predict_classes(X_test)
# y_pred = model.predict(X_test)
y_test_int =np.argmax(y_test, axis=1)

f1_score(y_test_int, y_pred, average = 'micro')

AttributeError: 'Model' object has no attribute 'predict_classes'

In [None]:
target_name = ['ADULT', 'COUNTERFEIT', 'LEGIT', 'PHARMA', 'SMOKE', 'TMS', 'WEAPON']
print(classification_report(y_test_int, y_pred, target_names=target_name))

In [None]:
model.save('glove_50_biLSTM_attention.h5')

In [None]:
from keras.models import load_model
model = load_model('glove_50_biLSTM_attention.h5')


In [None]:
from keras.utils.vis_utils import plot_model

plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)