In [None]:
!pip install -U keras
!pip install -U tensorflow
!pip install transformers
!pip install tensorflow_text
!pip install -q -U "tensorflow-text==2.8.*"

Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 5.1 MB/s 
Installing collected packages: tf-estimator-nightly
Successfully installed tf-estimator-nightly-2.8.0.dev2021122109
Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 34.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 27.5 MB/s 
[?25hCollecting pyyaml>=5.1
  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Input, LSTM, Bidirectional, Dropout, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
import transformers
from keras.layers.embeddings import Embedding

In [None]:
# Read the labelled invoices from disk; the trailing expression renders the
# first five rows as a quick preview.
df = pd.read_csv(filepath_or_buffer='submission.csv')
df.head(n=5)

Unnamed: 0,Inv_Id,Product_Category
0,15003,CLASS-1758
1,15008,CLASS-1522
2,15013,CLASS-1522
3,15019,CLASS-1376
4,15020,CLASS-1758


In [None]:
# Class balance check: number of rows per product category, most frequent first.
df.Product_Category.value_counts()


CLASS-1758    641
CLASS-1274    451
CLASS-1522    396
CLASS-1250    191
CLASS-1376    151
CLASS-1963     80
CLASS-1249     79
CLASS-1721     54
CLASS-1828     47
CLASS-2141     46
CLASS-1567     36
CLASS-1919     34
CLASS-1850     29
CLASS-1477     27
CLASS-1429     18
CLASS-2241     17
CLASS-2112     17
CLASS-1870     17
CLASS-1322     17
CLASS-2003     17
CLASS-1983     14
CLASS-1964     13
CLASS-1309     11
CLASS-1867      8
CLASS-1770      8
CLASS-1805      6
CLASS-1294      4
CLASS-1652      4
CLASS-1957      3
CLASS-2038      3
CLASS-1248      3
CLASS-1688      2
CLASS-2146      1
CLASS-2015      1
Name: Product_Category, dtype: int64

In [None]:
# One-hot encode the target: one indicator column per distinct Product_Category.
label_frame = pd.get_dummies(df['Product_Category'])
y = label_frame.to_numpy()

# Cast the invoice id column to pandas' string dtype; it is used as the feature
# column when the data is split below.
df = df.astype({'Inv_Id': 'string'})

# Sanity checks on the label matrix and the feature element type.
print('Shape of label tensor:', y.shape)
print('Inv_Id type: ', type(df['Inv_Id'][0]))

Shape of label tensor: (2446, 34)
Inv_Id type:  <class 'str'>


In [None]:
# Stage 1: hold out 40% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df['Inv_Id'],
    y,
    test_size=0.4,
    random_state=42,
)

# Stage 2: carve 10% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

# Report the size of each split.
print(f'The train dataset has {y_train.shape[0]} samples.')
print(f'The validation dataset has {y_val.shape[0]} samples.')
print(f'The test dataset has {y_test.shape[0]} samples.')

The train dataset has 1320 samples.
The validation dataset has 147 samples.
The test dataset has 979 samples.


In [None]:
# Peek at the one-hot training labels (one row per sample, one column per class).
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [None]:
# TF Hub identifiers for the uncased English BERT encoder (L-12, H-768, A-12)
# and the text-preprocessing model that goes with it.
bert_model_name = "bert_en_uncased_L-12_H-768_A-12"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

# Echo the selection so the run log records which models were used.
print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [None]:
# Load the preprocessing model and the encoder as standalone Keras layers
# (preprocessing first, then the encoder).
# NOTE(review): the model-building cell creates its own hub.KerasLayer instances,
# so these two handles look unused in the visible cells — confirm before removing.
bert_preprocess, bert_encoder = (
    hub.KerasLayer(handle)
    for handle in (tfhub_handle_preprocess, tfhub_handle_encoder)
)

In [None]:
# Layer-size hyperparameters.
num_neurons, num_embeddings = 64, 128

# Width of the output layer: one unit per distinct product category.
num_classes = len(pd.unique(df['Product_Category']))

print(f'num_classes: {num_classes}')

num_classes: 34


In [None]:
# BERT feature extractor: raw string -> preprocessing -> encoder -> pooled vector.
text_input = Input(name='text', dtype=tf.string, shape=())

# Both stages come from TF Hub; only the encoder is marked trainable (fine-tuned).
preprocessing_layer = hub.KerasLayer(handle=tfhub_handle_preprocess, name='preprocessing')
encoder = hub.KerasLayer(handle=tfhub_handle_encoder, name='BERT_encoder', trainable=True)

# Wire the stages together and keep the encoder's pooled output.
preprocessed_inputs = preprocessing_layer(text_input)
encoded_inputs = encoder(preprocessed_inputs)
bert_output = encoded_inputs['pooled_output']

model = Model(inputs=text_input, outputs=bert_output)
model.summary()

In [None]:
# Reuse the BERT model's inputs (at most the first two) for the model built on top of it.
inputs = model.inputs[:2]

# Reshape the BERT model's output to (768, 1), adding a trailing axis of size 1.
reshape_layer = tf.keras.layers.Reshape(target_shape=(768, 1), name='newReshape')
newReshape = reshape_layer(model.output)

In [None]:
# LSTM classification head on top of the reshaped BERT output.
lstm = LSTM(256)(newReshape)
lstm_output = Dense(768, activation='relu')(lstm)

# The targets are one-hot (exactly one class per sample) and the model is trained
# with categorical cross-entropy, so the output layer must produce a probability
# distribution over the classes: softmax, not independent per-class sigmoids.
preds = Dense(num_classes, activation='softmax')(lstm_output)

# Rebind `model` to the full text -> class-probabilities network.
model = Model(inputs, preds)

model.summary()

In [None]:
# Adam with a small learning rate and a slight learning-rate decay.
adam_optmizer = Adam(learning_rate=1e-5, decay=1e-6)

model.compile(
    optimizer=adam_optmizer,
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

# Train for 10 epochs, evaluating on the validation split after each one;
# the returned History object is kept for plotting.
history_fine = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
)

In [None]:
# Per-epoch loss curves recorded during training.
loss = history_fine.history['loss']
val_loss = history_fine.history['val_loss']

# Training loss is drawn first, then validation loss, matching the legend order.
plt.plot(loss)
plt.plot(val_loss)
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['training set', 'validation set'], loc='upper left')
plt.show()

Performance evaluation

In [None]:
# Score the held-out test set, then take the highest-scoring class index per sample.
y_probabilities = model.predict(x=X_test)
y_pred = y_probabilities.argmax(axis=1)

In [None]:
# y_test is one-hot encoded (n_samples, n_classes) while y_pred holds class indices
# (n_samples,). classification_report needs both in the same format, so collapse the
# one-hot rows to class indices first; passing the one-hot matrix directly makes
# sklearn raise on the mixed multilabel-indicator / multiclass targets.
y_true = np.argmax(y_test, axis=1)
print(classification_report(y_true, y_pred))