In [None]:
import os
from google.colab import auth
import gspread
from google.auth import default
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import TextVectorization
from keras.layers import Embedding
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive so the notebook can read files stored under MyDrive.
from google.colab import drive # Colab helper used to mount Google Drive

ROOT = "/content/drive"     # mount point handed to drive.mount below
print(ROOT)                 # optional: echo the mount point

drive.mount(ROOT)           # mounts Google Drive at ROOT (/content/drive)

Preparing Data

In [None]:
# Load the expense records from the mounted Drive and preview the first rows.
expenses_csv_path = '/content/drive/MyDrive/GitHub/Home/Financial/Data/pos_expenses_data.csv'
df = pd.read_csv(expenses_csv_path)
df.head()

In [None]:
# Give every column an explicit type: 'Cost' becomes numeric (unparseable
# values are coerced to NaN), every other column is converted to str.
schema_columns = ['Location', 'Description', 'Cost', 'Category', 'Day', 'Month', 'Year']
new_df_schema = {
    column: (
        pd.to_numeric(df[column], errors='coerce')
        if column == 'Cost'
        else df[column].astype(str)
    )
    for column in schema_columns
}

# Rebuild the frame from the converted columns only, in the order listed above.
df = pd.DataFrame(new_df_schema)

In [None]:
# Print the column names, non-null counts and dtypes after the type conversion.
df.info()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
df.describe(include='all')

In [None]:
# Target column first; every remaining column is kept as a candidate feature.
labels = df['Category']
features = df.drop(columns='Category')

In [None]:
# Distinct category names, sorted. pd.unique returns values in order of first
# appearance, whereas the one-hot label frame built later with pd.get_dummies
# has its columns in sorted order. The model's output index is looked up in
# this array when classifying new text, so both orderings must agree.
class_names = np.sort(pd.unique(labels))
print("Classes:", class_names)
print("Number of samples:", len(features))

In [None]:
# The free-text description column is the only model input passed to the split below.
descriptions = df['Description']

In [None]:
# One-hot encode the categories: `labels` is rebound from the category Series
# to a DataFrame with one indicator column per distinct category.
labels = pd.get_dummies(labels)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_samples, test_samples, train_labels, test_labels = train_test_split(
    descriptions,
    labels,
    test_size=0.2,
    random_state=0,
)

Create Vocabulary Index

In [None]:
# Learn the token vocabulary from the training descriptions only, feeding the
# text to the vectorizer in batches of 128.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples)
text_ds = text_ds.batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Peek at the first ten entries of the learned vocabulary.
vectorizer.get_vocabulary()[:10]

In [None]:
# Map every vocabulary token to its position in the vectorizer's vocabulary.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

Load pre-trained GloVe

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
# Location of the 100-dimensional GloVe vectors unpacked by the unzip step.
# The path is absolute, so it is used directly: joining it onto the home
# directory with os.path.join would discard the home directory anyway.
path_to_glove_file = "/content/glove.6B.100d.txt"

# Parse the file into {word: float32 coefficient vector}. Each line holds a
# word followed by its space-separated coefficients.
embeddings_index = {}
# Decode as UTF-8 explicitly instead of relying on the platform default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
embedding_dim = 100
# Number of rows in the embedding table: the vocabulary size plus two spare rows.
num_tokens = len(voc) + 2
hits = misses = 0

# Fill the embedding matrix row by row from the GloVe lookup. Tokens that have
# no GloVe vector (this includes the "padding" and "OOV" entries) keep their
# all-zero row and are counted as misses.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        misses += 1
        continue
    embedding_matrix[row] = glove_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Embedding layer initialised from the GloVe matrix and frozen (trainable=False),
# so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [None]:
# Symbolic input for variable-length sequences of int64 token ids, passed
# through the embedding layer.
# NOTE(review): the model cell that follows rebuilds both tensors, so this cell
# appears to exist only to print the input tensor — confirm it is still needed.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
print(int_sequences_input)

Model

In [None]:
# 1D convolutional classifier over the embedded token sequence.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two Conv1D -> MaxPooling1D stages ...
x = embedded_sequences
for _ in range(2):
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.MaxPooling1D(5)(x)

# ... a third convolution reduced over the sequence axis by global max pooling ...
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)

# ... and a dense head with dropout, ending in one softmax unit per class.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(class_names), activation="softmax")(x)

model = keras.Model(int_sequences_input, preds)
model.summary()

Training

In [None]:
# Vectorize the raw descriptions into integer token-id matrices. Each sample is
# wrapped in its own list so the vectorizer receives a (num_samples, 1) array
# of strings.
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_test = vectorizer(np.array([[s] for s in test_samples])).numpy()

# One-hot targets as float32. pd.get_dummies produces bool columns on
# pandas >= 2.0 (uint8 before that), so cast explicitly to hand Keras a numeric
# array regardless of the installed pandas version.
y_train = np.asarray(train_labels, dtype="float32")
y_test = np.asarray(test_labels, dtype="float32")

In [None]:
# Categorical cross-entropy pairs with the one-hot targets; accuracy is reported as "acc".
# NOTE(review): validation_data is the test split, which the evaluation cell
# scores again afterwards, so there is no separate validation set here.
model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["acc"])
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(x_test, y_test),
)

Evaluation

In [None]:
# Loss and accuracy on the test split; evaluate() returns them in that order
# (the loss, followed by the single compiled metric).
score = model.evaluate(x_test, y_test, verbose=0)
for caption, value in zip(('Test loss:', 'Test accuracy:'), score):
    print(caption, value)

In [None]:
# Spot check: class indices of the first five predictions next to the first
# five true labels.
class_probabilities = model.predict(x_test)
pred = class_probabilities.argmax(axis=1)[:5]
label = y_test.argmax(axis=1)[:5]

print(pred)
print(label)

Display Results

In [None]:
# Side-by-side table of description, predicted category and actual category
# for the first rows of the test split.
n_preview = 50

# Take the category names from the one-hot label frame itself, so they always
# match the column order the model was trained on and do not have to be
# maintained by hand when the set of categories in the data changes.
category_columns = list(test_labels.columns)

pred = model.predict(x_test)
# idxmax(axis=1) turns each row of scores / indicators back into a category name.
pred_df = pd.DataFrame(pred, columns=category_columns).idxmax(axis=1)[:n_preview]
label_df = pd.DataFrame(y_test, columns=category_columns).idxmax(axis=1)[:n_preview]

# Reset to a positional index so the three series line up when concatenated.
test_df = test_samples.reset_index(drop=True)[:n_preview]
compare_df = pd.concat([test_df, pred_df, label_df], axis=1)
compare_df.columns = ['Description', 'Predicted', 'Actual']

compare_df

Classify text input

In [None]:
# Chain the vectorizer and the trained classifier into a single model that
# accepts raw strings.
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

# Pass an explicit (1, 1) string tensor so the input does not depend on how
# predict() converts nested Python lists.
probabilities = end_to_end_model.predict(tf.constant([["bagels"]]))

# Decode the winning output index with the one-hot label columns the model was
# trained on. class_names comes from pd.unique (first-appearance order) and is
# not guaranteed to be in the same order as those columns.
train_labels.columns[np.argmax(probabilities[0])]

Questions:
Which categories had the worst accuracy? The best?