In [1]:
import pandas as pd
import tensorflow as tf

from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Read the labelled transaction descriptions and preview the first five rows.
dataset = pd.read_csv('transactions_dataset.csv')
dataset.head(n=5)

Unnamed: 0,Text,CategoryId
0,Lidl )))),2
1,Deposit Rent,1
2,McDonalds Banegaards )))),3
3,McDonalds Banegaards )))),3
4,Burger Shack Horsens )))),3


In [3]:
import nltk
# Fetch NLTK's stop-word corpus; the call returns True, which is shown as the cell output.
# NOTE(review): no visible cell uses the stop-word list at runtime — confirm it is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Split the labelled transactions into a training set (80%) and a test set (20%).
from sklearn.model_selection import train_test_split
# Select the columns by name: positional `iloc` lookups silently pick the wrong
# columns if the CSV layout changes (e.g. an extra saved-index column).
X = dataset['Text']        # free-text transaction description (model input)
y = dataset['CategoryId']  # integer category label (model target)
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
# Sanity check: shapes of the four splits, in the order they were returned.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((505,), (127,), (505,), (127,))

In [6]:
# Text clean-up callback handed to the TextVectorization layer.
def custom_standardization(input_data):
  """Lower-case the text and replace every non-alphanumeric character with a space.

  Args:
    input_data: string tensor (or value accepted by the `tf.strings` ops).

  Returns:
    A string tensor in which every character outside a-z, A-Z and 0-9 has been
    replaced by a single space. Non-ASCII letters are therefore blanked out too.
  """
  lowered = tf.strings.lower(input_data)
  # Stemming and English/Danish stop-word removal are intentionally not applied here.
  return tf.strings.regex_replace(lowered, '[^a-zA-Z0-9]', ' ')

# Upper bound on the vocabulary, and the fixed number of tokens kept per sample.
vocab_size = 200
sequence_length = 30

# The vectorization layer standardizes each string with the function above,
# splits it into tokens and maps every token to an integer id. Because the
# descriptions differ in length, output_sequence_length forces every sample
# to the same number of ids.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int')

# Build the vocabulary from the training texts only (labels are not needed).
vectorize_layer.adapt(X_train.tolist())

In [7]:
# Size of the vector learned for each token id.
embedding_dim = 5

# Assemble the classifier layer by layer:
# raw strings -> token ids -> embeddings -> average over the sequence
# -> hidden layer -> 10 output scores.
model = tf.keras.models.Sequential()
model.add(vectorize_layer)
# Embed the vocab_size token ids into embedding_dim dimensions.
model.add(Embedding(vocab_size, embedding_dim, name="embedding"))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
# No activation on the last layer: its 10 outputs are raw logits.
model.add(Dense(10))

In [8]:
# Smoke test: run the still-untrained model on the first three training texts
# and inspect the raw logits it produces.
predictions = model(X_train.iloc[:3].to_list()).numpy()
predictions

Consider rewriting this model with the Functional API.


array([[-0.01182186, -0.02875652,  0.0252192 , -0.0032307 ,  0.00304107,
         0.02126542, -0.02294359, -0.01695609,  0.00255844, -0.01028241],
       [-0.01177343, -0.02928187,  0.02556214, -0.00449242,  0.00301963,
         0.02134806, -0.02292697, -0.01704218,  0.00263753, -0.01110735],
       [-0.01201282, -0.02799431,  0.02609291, -0.00387859,  0.00263937,
         0.02072632, -0.0211834 , -0.01794324,  0.00072025, -0.01075994]],
      dtype=float32)

In [9]:
# Turn the logits into per-class probabilities (softmax over the last axis).
tf.nn.softmax(predictions, axis=-1).numpy()

array([[0.09922574, 0.09755953, 0.10297009, 0.10008188, 0.10071154,
        0.10256377, 0.09812829, 0.0987176 , 0.10066295, 0.09937862],
       [0.09925158, 0.09752896, 0.10302723, 0.09997686, 0.10073072,
        0.10259398, 0.09815072, 0.09873001, 0.10069224, 0.09931771],
       [0.09922385, 0.09765071, 0.10307781, 0.10003425, 0.10068841,
        0.10252612, 0.09831807, 0.09863716, 0.10049535, 0.09934825]],
      dtype=float32)

In [10]:
# The model's final Dense layer has no activation, so the loss has to be told
# that it receives logits rather than probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
)

In [11]:
# Loss of the untrained model on the same three samples; with ten near-uniform
# class probabilities it should sit close to ln(10) ≈ 2.30.
loss_fn(y_train.iloc[:3].to_list(), predictions).numpy()

2.2822666

In [12]:
# Configure training: Adam optimizer, the logits-aware loss defined earlier,
# and accuracy as the reported metric.
model.compile(
    loss=loss_fn,
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
# Train on the training split for 250 passes over the data.
model.fit(x=X_train, y=y_train, epochs=250)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fe57f29f940>

In [14]:
# Measure loss and accuracy on the held-out test split (one summary line of output).
model.evaluate(x=X_test, y=y_test, verbose=2)

4/4 - 0s - loss: 0.3042 - accuracy: 0.9291


[0.30415064096450806, 0.9291338324546814]

In [18]:
# Human-readable name for each integer category id predicted by the model.
categories = {
    0: 'Automobile and Transport',
    1: 'Housing and Real-Estate',
    2: 'Groceries',
    3: 'Recreation and Leisure',
    4: 'Health and Well Being',
    5: 'Hobby and Knowledge',
    6: 'Clothes and Equipment',
    7: 'Cash and Credit',
    8: 'Financial Services',
    9: 'Other',
}


def get_category_by_id(id):
    """Return the category name registered under ``id`` in ``categories``.

    Raises:
        KeyError: if ``id`` is not a key of ``categories``.
    """
    return categories[id]

# Classify a few hand-written transaction descriptions.
inputs = ['lidl', 'netto ))', 'udemy', 'kfc', 'rent']
# `Sequential.predict_classes` is deprecated and was removed in TF 2.6. Taking the
# arg-max over the class logits is what it did for multi-class outputs.
predictions = model.predict(inputs, verbose=0).argmax(axis=-1)
# Pair each input text with the name of its predicted category.
{text: get_category_by_id(int(class_id)) for text, class_id in zip(inputs, predictions)}



{'kfc': 'Recreation and Leisure',
 'lidl': 'Groceries',
 'netflix': 'Other',
 'netto ))': 'Groceries',
 'rent': 'Housing and Real-Estate',
 'udemy': 'Other'}