In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle
import pandas as pd

# Load the transaction dataset; the path is relative to the notebook's
# working directory.
transactions = pd.read_csv('synthetic_tax_transactions.csv')

In [2]:
# Preview the first rows to see the available columns and sample values.
transactions.head()

Unnamed: 0,transaction_id,user_id,date,amount,currency,category,description,tax_deductible,merchant,payment_method,receipt_available,country
0,f56cb713-700c-4b83-b45c-e23f5588350e,1181,2024-08-02,167.33,USD,Marketing & Advertising,Billboard advertisement for company launch,1,Google,Crypto,1,France
1,39afe434-64fb-43b5-a720-d70e6c07c37e,1494,2024-10-01,10.15,EUR,Education & Training,Online course on machine learning,1,Microsoft,Credit Card,0,France
2,0cca614a-e7f7-42ab-9129-c6ced844af4d,1398,2024-06-15,56.26,USD,Rent & Utilities,Monthly office rent payment,1,Uber,Bank Transfer,0,USA
3,91a47d48-f6d7-4a14-b7a0-f01cfd2c53fa,1112,2025-01-01,177.22,EUR,Software & Tools,Annual license for antivirus software,1,Netflix,Bank Transfer,0,France
4,ca0c21c2-e50a-4cf1-8ba1-80e449cb34c9,1570,2024-05-16,496.48,EUR,Education & Training,Online course on machine learning,1,Google,Bank Transfer,1,Canada


In [3]:
# (rows, columns) of the loaded frame.
transactions.shape

(1000, 12)

In [4]:
# Column dtypes and non-null counts.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   transaction_id     1000 non-null   object 
 1   user_id            1000 non-null   int64  
 2   date               1000 non-null   object 
 3   amount             1000 non-null   float64
 4   currency           1000 non-null   object 
 5   category           1000 non-null   object 
 6   description        1000 non-null   object 
 7   tax_deductible     1000 non-null   int64  
 8   merchant           1000 non-null   object 
 9   payment_method     1000 non-null   object 
 10  receipt_available  1000 non-null   int64  
 11  country            1000 non-null   object 
dtypes: float64(1), int64(3), object(8)
memory usage: 93.9+ KB


In [5]:
# Summary statistics for the numeric columns.
transactions.describe()

Unnamed: 0,user_id,amount,tax_deductible,receipt_available
count,1000.0,1000.0,1000.0,1000.0
mean,1499.897,253.64969,0.643,0.709
std,295.222022,143.15279,0.479355,0.454451
min,1000.0,5.32,0.0,0.0
25%,1238.5,127.405,0.0,0.0
50%,1493.5,255.09,1.0,1.0
75%,1756.0,377.4375,1.0,1.0
max,1999.0,499.4,1.0,1.0


In [6]:
# Summary (count / unique / top / freq) for the object-dtype columns.
transactions.describe(include='object')

Unnamed: 0,transaction_id,date,currency,category,description,merchant,payment_method,country
count,1000,1000,1000,1000,1000,1000,1000,1000
unique,1000,349,3,11,55,9,5,6
top,4c503d1b-8145-4d11-bd45-6ca30eebd1a6,2024-11-18,GBP,Education & Training,New smartphone purchase,Airbnb,Crypto,Germany
freq,1,8,345,100,26,126,221,183


In [23]:
# Count missing values per column.
transactions.isnull().sum()

transaction_id       0
user_id              0
date                 0
amount               0
currency             0
category             0
description          0
tax_deductible       0
merchant             0
payment_method       0
receipt_available    0
country              0
dtype: int64

In [8]:
# Train a small LSTM that classifies a transaction description as tax
# deductible (1) or not (0), then persist the model and the fitted tokenizer.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable on Restart & Run All (some GPU kernels may still
# be non-deterministic).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# The prediction code at the end of this notebook pads every request with
# maxlen=10. The network has no masking, so the number of pad steps changes
# the LSTM output; train on the same fixed length instead of "longest
# description in the dataset". Descriptions longer than this are truncated,
# exactly as they will be at inference time.
MAX_SEQUENCE_LENGTH = 10

texts = transactions["description"].astype(str).tolist()
labels = transactions["tax_deductible"].astype(int).values  # 1 = Deductible, 0 = Non-Deductible

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Keep the ragged sequences under their own name so re-running this cell
# never feeds an already padded array back into the tokenizer pipeline.
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

model = Sequential([
    # +1 because index 0 is reserved for padding and never assigned to a word.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=16),
    LSTM(32, return_sequences=True),
    LSTM(16),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train Model
# validation_split holds out the last 20% of the rows (taken before shuffling).
model.fit(X, labels, epochs=10, batch_size=32, validation_split=0.2)

# Save Model
model.save("tax_classifier.h5")

# Save tokenizer for later use (pickle is imported in the first cell).
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 42ms/step - accuracy: 0.5981 - loss: 0.6828 - val_accuracy: 0.6600 - val_loss: 0.6152
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.6402 - loss: 0.5634 - val_accuracy: 0.9750 - val_loss: 0.2039
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9837 - loss: 0.1604 - val_accuracy: 0.9850 - val_loss: 0.0991
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9892 - loss: 0.0862 - val_accuracy: 0.9850 - val_loss: 0.0513
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 1.0000 - loss: 0.0372 - val_accuracy: 1.0000 - val_loss: 0.0225
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 1.0000 - loss: 0.0204 - val_accuracy: 1.0000 - val_loss: 0.0127
Epoch 7/10
[1m25/25[0m [32m━━━━



In [None]:
import tensorflow as tf
import pickle
import numpy as np
import functions_framework

# Restore the trained classifier and the fitted tokenizer once at import
# time so every request reuses them.
MODEL_PATH = "tax_classifier.h5"
TOKENIZER_PATH = "tokenizer.pkl"

model = tf.keras.models.load_model(MODEL_PATH)
# NOTE: unpickling executes arbitrary code; only ship a tokenizer.pkl that
# was produced by a trusted training run.
with open(TOKENIZER_PATH, "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Fixed input length fed to the model for every request.
MAX_SEQUENCE_LENGTH = 10


@functions_framework.http
def predict(request):
    """Predict if a transaction is deductible.

    Expects a JSON body of the form ``{"description": "<text>"}``.

    Returns:
        ``{"deductible": bool}`` on success, or ``({"error": str}, 400)``
        when the body is not a JSON object with a string ``description``.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # body, so bad requests get a 400 rather than an unhandled 500.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        return {"error": "Request body must be a JSON object."}, 400

    description = request_json.get("description")
    if not isinstance(description, str):
        return {"error": "Field 'description' is required and must be a string."}, 400

    sequence = tokenizer.texts_to_sequences([description])
    # The training cell pads with padding='post'; the default here is 'pre',
    # which would present the tokens to the LSTM at different positions than
    # it was trained on.
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        sequence, maxlen=MAX_SEQUENCE_LENGTH, padding="post"
    )
    # verbose=0 keeps the per-call progress bar out of the function logs.
    prediction = model.predict(padded_sequence, verbose=0)[0][0]

    # Cast the NumPy boolean to a plain bool so the response is JSON-serialisable.
    return {"deductible": bool(prediction > 0.5)}