In [None]:
import os
from google.colab import auth
import gspread
from google.auth import default
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import TextVectorization
from keras.layers import Embedding
from sklearn.model_selection import train_test_split

Preprocessing

In [None]:
# authenticate the Colab user, then build a gspread client from the default credentials
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# open the first sheet of the 'Combined_19-21' spreadsheet
worksheet = gc.open('Combined_19-21').sheet1

# get_all_values gives a list of rows
rows = worksheet.get_all_values()

# convert to a DataFrame and render
# The first row holds the header: promote it to the column labels, then drop it
# from the data. The remaining rows keep their positional labels, so the index
# starts at 1 rather than 0.
df = pd.DataFrame(rows)
df.columns = df.iloc[0]
df = df.iloc[1:]
df.head()

Unnamed: 0,Location,Date,Description,Cost,Category
1,Burger King,2019-05-15,Dinner,$2.19,restaurant
2,Dennys,2019-05-17,Late night/early morning food for both Sam and I,$24.76,restaurant
3,Dennys,2019-05-17,Tip,$4.95,restaurant
4,Wawa,2019-05-17,Moving Truck fuel,$27.50,transportation
5,Exxon,2019-05-17,Moving Truck fuel,$78.25,transportation


In [None]:
# create a new dataframe
# (work on a copy so the raw sheet data in `df` stays untouched by the cleaning below)
expenses_df = df.copy()

In [None]:
# get rid of ',' and '$' so that Cost can later be parsed as a number.
# regex=False makes both patterns literal text: read as a regular expression,
# '$' is the end-of-string anchor rather than a dollar sign, and relying on the
# default is what triggers the pandas FutureWarning.
expenses_df['Cost'] = (
    expenses_df['Cost']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Build one correctly-typed Series per column; the DataFrame is reassembled
# from this mapping in the next cell.
# Cost uses errors='coerce', so values that cannot be parsed become NaN.
new_df_schema = dict(
    Location=expenses_df['Location'].astype(str),
    Date=pd.to_datetime(expenses_df['Date']),
    Description=expenses_df['Description'].astype(str),
    Cost=pd.to_numeric(expenses_df['Cost'], errors='coerce'),
    Category=expenses_df['Category'].astype(str),
)

In [None]:
# update the data types
# Rebuild the frame from the typed Series in new_df_schema; only the five
# columns listed there are kept.
expenses_df = pd.DataFrame(new_df_schema)

In [None]:
# change NaN to 0
# (NaN comes from the errors='coerce' parse of Cost; rows with a cost of 0 are
# removed a few cells further down)
expenses_df['Cost'] = expenses_df['Cost'].fillna(0)

In [None]:
# new dataframe without negatives or zeros
# (this cell only takes the copy; the filtering itself happens in the next cell)
edit_df = expenses_df.copy()

In [None]:
# Collect the index labels of every row whose Cost is zero or negative ...
nonpositive_cost = edit_df['Cost'] <= 0
indexNames = edit_df.index[nonpositive_cost]
# ... and remove those rows from edit_df in place.
edit_df.drop(index=indexNames, inplace=True)

In [None]:
# Sanity check: list every row whose Category mentions 'clothing'.
is_clothing = edit_df['Category'].str.contains('clothing')
clothing = edit_df.loc[is_clothing]
print(clothing)

Empty DataFrame
Columns: [Location, Date, Description, Cost, Category]
Index: []


In [None]:
# Relabel 'tax' as 'misc'. This is a substring replacement on the category
# text, with the pattern taken literally.
edit_df['Category'] = edit_df['Category'].str.replace(
    'tax', 'misc', regex=False
)

In [None]:
# Normalise the 'clothing' spelling to 'clothes'. This is a substring
# replacement on the category text, with the pattern taken literally.
edit_df['Category'] = edit_df['Category'].str.replace(
    'clothing', 'clothes', regex=False
)

In [None]:
# Fold the fine-grained food/household labels into a single 'grocery' class.
# Series.replace with a mapping only rewrites values that match a key exactly.
grocery_subcategories = [
    'alcohol', 'bread', 'breakfast', 'canned', 'condiments', 'dairy',
    'grains', 'hygiene', 'meat', 'pasta', 'produce', 'snacks',
]
edit_df['Category'] = edit_df['Category'].replace(
    {subcategory: 'grocery' for subcategory in grocery_subcategories}
)

Preparing Data

In [None]:
# Separate the target column from the remaining columns.
# The column is dropped by keyword: passing `axis` positionally
# (drop('Category', 1)) is deprecated in pandas 1.x and rejected by pandas 2.x.
features = edit_df.drop(columns='Category')
labels = edit_df['Category']

  """Entry point for launching an IPython kernel.


In [None]:
# Distinct categories, in order of first appearance in the data.
class_names = labels.unique()
print("Classes:", class_names)
print("Number of samples:", features.shape[0])

Classes: ['restaurant' 'transportation' 'improvement' 'misc' 'grocery' 'business'
 'clothes' 'rent' 'utilities' 'supplies' 'entertainment' 'education'
 'health']
Number of samples: 2839


In [None]:
# The free-text Description column is the text that gets vectorized and fed to the model.
descriptions = edit_df.loc[:, 'Description']
print(descriptions)

1                                                 Dinner
2       Late night/early morning food for both Sam and I
3                                                    Tip
4                                      Moving Truck fuel
5                                      Moving Truck fuel
                              ...                       
2925                                            cheerios
2926                                                 tax
2927                                                 tax
2928                                                 gas
2929                                        subscription
Name: Description, Length: 2839, dtype: object


In [None]:
# One-hot encode the category labels: one indicator column per category.
# From here on `labels` is a DataFrame, no longer the Category Series.
labels = pd.get_dummies(labels)

In [None]:
# Display the one-hot label frame to check its columns and values.
labels

Unnamed: 0,business,clothes,education,entertainment,grocery,health,improvement,misc,rent,restaurant,supplies,transportation,utilities
1,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,0,0,0,0,1,0,0,0,0,0,0,0,0
2926,0,0,0,0,0,0,0,1,0,0,0,0,0
2927,0,0,0,0,0,0,0,1,0,0,0,0,0
2928,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
(
    train_samples,
    test_samples,
    train_labels,
    test_labels,
) = train_test_split(
    descriptions,
    labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Display the held-out descriptions (they keep their original row labels).
test_samples

1116                  tax
2673                salsa
330                   Tip
1985    honey wheat bread
2357            tortillas
              ...        
1527            tortillas
587                   tip
1473       Lance crackers
2333    popcorn and drink
642     LA transportation
Name: Description, Length: 568, dtype: object

Create Vocabulary Index

In [None]:
# Text -> integer-sequence layer: at most 20000 vocabulary entries, and every
# output sequence is 200 tokens long.
vectorizer = TextVectorization(
    max_tokens=20000,
    output_sequence_length=200,
)

# Learn the vocabulary from the training descriptions only, fed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(train_samples)
text_ds = text_ds.batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Peek at the first 12 entries of the learned vocabulary.
vectorizer.get_vocabulary()[:12]

['',
 '[UNK]',
 'tax',
 'tip',
 'cheese',
 'cream',
 'dinner',
 'bagels',
 'mix',
 'and',
 'bread',
 'ice']

In [None]:
# Map each vocabulary token to its integer index (its position in the vocabulary list).
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

Load pre-trained GloVe word embeddings

In [None]:
# Fetch and unpack the pre-trained GloVe vectors.
# -nc (no-clobber) skips the ~822 MB download when glove.6B.zip already exists.
# -n never overwrites already-extracted files, so re-running this cell does not
# stop at an interactive overwrite prompt.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q -n glove.6B.zip

--2022-05-12 20:29:21--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-05-12 20:29:21--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-05-12 20:29:22--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [None]:
# Location of the extracted 100-dimensional GloVe vectors.
# The original expression joined the home directory with this absolute path,
# but os.path.join discards earlier components when a later one is absolute,
# so the effective path was always this one.
path_to_glove_file = "/content/glove.6B.100d.txt"

# Each line is "<word> <coef> <coef> ..."; build a word -> vector lookup.
embeddings_index = {}
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (assumes the GloVe text file is UTF-8 encoded).
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Split off the word only; the rest of the line is the coefficient text.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix: row i holds the GloVe vector of the token whose
# vocabulary index is i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
        continue
    embedding_matrix[row] = glove_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 820 words (69 misses)


In [None]:
# Initialise the embedding weights from the GloVe matrix and freeze them
# (trainable=False), so training does not update the pre-trained vectors.
pretrained_weights = keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=pretrained_weights,
    trainable=False,
)

In [None]:
# Symbolic int64 input of variable sequence length, passed through the frozen
# embedding layer; printed only to inspect the resulting KerasTensor.
# NOTE(review): the "Model" cell re-creates both int_sequences_input and
# embedded_sequences, so this cell looks like a leftover inspection step.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
print(int_sequences_input)

KerasTensor(type_spec=TensorSpec(shape=(None, None), dtype=tf.int64, name='input_8'), name='input_8', description="created by layer 'input_8'")


Model

In [None]:
# 1D-convolutional text classifier on top of the frozen embeddings.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two identical conv + max-pool stages ...
x = embedded_sequences
for _ in range(2):
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.MaxPooling1D(5)(x)

# ... then a final convolution reduced to one vector per sample.
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)

# Dense head with dropout; one softmax unit per category.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(class_names), activation="softmax")(x)

model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_4 (Embedding)     (None, None, 100)         89100     
                                                                 
 conv1d_9 (Conv1D)           (None, None, 128)         64128     
                                                                 
 max_pooling1d_6 (MaxPooling  (None, None, 128)        0         
 1D)                                                             
                                                                 
 conv1d_10 (Conv1D)          (None, None, 128)         82048     
                                                                 
 max_pooling1d_7 (MaxPooling  (None, None, 128)        0         
 1D)                                                       

Training

In [None]:
def _to_token_ids(samples):
    """Vectorize an iterable of description strings into a NumPy array of token ids."""
    # One single-element row per description, then run it through the vectorizer.
    text_column = np.array([[text] for text in samples])
    return vectorizer(text_column).numpy()


x_train = _to_token_ids(train_samples)
x_test = _to_token_ids(test_samples)

# One-hot label frames as plain arrays for Keras.
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [None]:
# Display the vectorized test descriptions (integer token ids, zero-padded).
x_test

array([[  2,   0,   0, ...,   0,   0,   0],
       [ 26,   0,   0, ...,   0,   0,   0],
       [  3,   0,   0, ...,   0,   0,   0],
       ...,
       [  1,  32,   0, ...,   0,   0,   0],
       [ 87,   9, 156, ...,   0,   0,   0],
       [ 88, 168,   0, ...,   0,   0,   0]])

In [None]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# accuracy is tracked under the metric name "acc".
model.compile(
    loss="categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)
# Train for 20 epochs in batches of 128.
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics and the final test score come from the same rows.
model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd7b0fed490>

Evaluation

In [None]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.6140957474708557
Test accuracy: 0.8908450603485107


In [None]:
# Compare predicted and true class indices for the first five test samples.
pred = model.predict(x_test).argmax(axis=1)[:5]
label = y_test.argmax(axis=1)[:5]

for class_indices in (pred, label):
    print(class_indices)

[7 4 9 4 4]
[7 4 9 4 4]


Display Results

In [None]:
# Side-by-side table of description, predicted category and actual category
# for the first rows of the test split.

# Category names come from the one-hot test labels that y_test was built from,
# so the column order always matches the model's output units. Previously the
# 13 names were hard-coded twice and had to match that order by hand.
category_names = test_labels.columns
n_preview = 50

# For each row, idxmax picks the category with the highest probability
# (predictions) or the category flagged in the one-hot row (true labels).
pred = model.predict(x_test)
pred_df = pd.DataFrame(pred, columns=category_names).idxmax(axis=1)[:n_preview]
label_df = pd.DataFrame(y_test, columns=category_names).idxmax(axis=1)[:n_preview]

# Reset the index so the descriptions align positionally with the two frames above.
test_df = test_samples.reset_index(drop=True)[:n_preview]
compare_df = pd.concat([test_df, pred_df, label_df], axis=1)
compare_df.columns = ['Description', 'Predicted', 'Actual']

print(compare_df)

                                          Description      Predicted  \
0                                                 tax           misc   
1                                               salsa        grocery   
2                                                 Tip     restaurant   
3                                   honey wheat bread        grocery   
4                                           tortillas        grocery   
5                                              Tomato        grocery   
6                                       candle for us  entertainment   
7                                                 tax           misc   
8                                           cheez-its        grocery   
9                                                 tax           misc   
10                                                tax           misc   
11                                     diced tomatoes        grocery   
12                                                tip     restau

Classify text input

In [None]:
# Wrap the vectorizer and the trained classifier into one model that accepts raw strings.
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    [["candle"]]
)

# Decode the winning output unit with the column order of the one-hot labels
# the model was trained on. `class_names` is in order of first appearance in
# the data, which differs from that column order and therefore yields the
# wrong category name.
train_labels.columns[np.argmax(probabilities[0])]

'clothes'

Questions:
Which categories had the worst accuracy? The best?