In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle
import pandas as pd
import functions_framework

transactions = pd.read_csv('../../data/freelancer_tax_deductions.csv')

In [3]:
transactions.head()

Unnamed: 0,transaction_id,user_id,date,amount,currency,category,description,deduction_rate,max_limit,merchant,payment_method,country,tax_deductible
0,5280833a-4da9-416e-92f3-713ae32f63b2,101,2024-10-21,7923.67,EUR,Marketing & Advertising,Google Ads campaign,1.0,10000,Uber,Bank Transfer,Germany,True
1,23d051ef-6839-40f6-bf22-e3b381043aa3,101,2024-09-09,6105.9,EUR,Work Equipment,Ergonomic keyboard and mouse,0.5,5000,Microsoft,Bank Transfer,USA,False
2,7f48d640-6680-4b2f-bae5-a5c827644d72,101,2024-08-17,686.59,USD,Internet & Phone,Mobile data plan for work calls,0.4,2500,Amazon,PayPal,Australia,True
3,1ebba401-ff6a-4112-9703-a06e6a98d851,101,2024-10-10,1095.94,USD,Client Entertainment,Dinner with potential client,0.2,1500,Netflix,Bank Transfer,Canada,True
4,dbcff318-0298-4ed3-9b6f-2d73adbecf50,101,2025-02-15,3664.11,EUR,Internet & Phone,Business internet subscription,0.4,2500,Microsoft,Cash,France,False


In [4]:
transactions.shape

(200, 13)

In [5]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transaction_id  200 non-null    object 
 1   user_id         200 non-null    int64  
 2   date            200 non-null    object 
 3   amount          200 non-null    float64
 4   currency        200 non-null    object 
 5   category        200 non-null    object 
 6   description     200 non-null    object 
 7   deduction_rate  200 non-null    float64
 8   max_limit       200 non-null    int64  
 9   merchant        200 non-null    object 
 10  payment_method  200 non-null    object 
 11  country         200 non-null    object 
 12  tax_deductible  200 non-null    bool   
dtypes: bool(1), float64(2), int64(2), object(8)
memory usage: 19.1+ KB


In [6]:
transactions.describe()

Unnamed: 0,user_id,amount,deduction_rate,max_limit
count,200.0,200.0,200.0,200.0
mean,101.0,4489.02755,0.58475,5360.0
std,0.0,4531.86891,0.310341,3834.89663
min,101.0,24.52,0.2,1500.0
25%,101.0,1448.53,0.3,3000.0
50%,101.0,2907.72,0.5,4000.0
75%,101.0,5955.4725,1.0,5000.0
max,101.0,22002.24,1.0,15000.0


In [7]:
transactions.describe(include='object')

Unnamed: 0,transaction_id,date,currency,category,description,merchant,payment_method,country
count,200,200,200,200,200,200,200,200
unique,200,156,3,11,36,9,5,6
top,5280833a-4da9-416e-92f3-713ae32f63b2,2024-04-28,EUR,Transportation,SEO service for website optimization,Microsoft,Bank Transfer,Canada
freq,1,3,75,24,12,31,44,47


In [8]:
transactions.isnull().sum()

transaction_id    0
user_id           0
date              0
amount            0
currency          0
category          0
description       0
deduction_rate    0
max_limit         0
merchant          0
payment_method    0
country           0
tax_deductible    0
dtype: int64

In [9]:
texts = transactions["description"].astype(str).tolist()
labels = transactions["tax_deductible"].astype(int).values  # 1 = Deductible, 0 = Non-Deductible

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, padding='post')


model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=16),
    LSTM(32, return_sequences=True),
    LSTM(16),
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train Model
model.fit(X, labels, epochs=10, batch_size=32, validation_split=0.2)

# Save Model
model.save("tax_classifier.h5")

# Save tokenizer for later use
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 190ms/step - accuracy: 0.5535 - loss: 0.6926 - val_accuracy: 0.5500 - val_loss: 0.6921
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5811 - loss: 0.6902 - val_accuracy: 0.5500 - val_loss: 0.6913
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5516 - loss: 0.6892 - val_accuracy: 0.5500 - val_loss: 0.6907
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.5750 - loss: 0.6855 - val_accuracy: 0.5500 - val_loss: 0.6904
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5733 - loss: 0.6827 - val_accuracy: 0.5500 - val_loss: 0.6909
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5793 - loss: 0.6779 - val_accuracy: 0.5500 - val_loss: 0.6925
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━



# Used for deploying

In [None]:
# Load Model and Tokenizer
model = tf.keras.models.load_model("tax_classifier.h5")
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

@functions_framework.http
def predict(request):
    request_json = request.get_json()
    description = request_json["description"]
    
    sequence = tokenizer.texts_to_sequences([description])
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=10)
    prediction = model.predict(padded_sequence)[0][0]
    
    return {"deductible": bool(prediction > 0.5)}



# Used locally

In [15]:
def predict_deductible(description):
    """Predict if a transaction is deductible."""
    sequence = tokenizer.texts_to_sequences([description])
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=10)
    prediction = model.predict(padded_sequence)[0][0]
    return bool(prediction > 0.5)

In [None]:
result = predict_deductible("Business lunch with a client")
print(result)  


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 464ms/step
True
