# Toto Expenses: Category Predictor

In [346]:
import sys
import pickle

In [347]:
import json
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings

nltk.download('stopwords')
nltk.download('punkt')

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---
## Data Analysis

In [348]:
# Ingestion of the Expenses in dict objects
def transform_line(line): 
    expense = json.loads(line)
    return expense

In [349]:
# Read the data from the backup file
# Save it into a list
def ingest_data(filename):
    data = []
    
    with open(filename, "r") as file: 
        for idx, line in enumerate(file):
            expense = transform_line(line)
            
            data.append(expense)
    
    return data
        

In [350]:
def to_dataframe(data: list):
    return pd.DataFrame(data)
    

In [351]:
df = to_dataframe(ingest_data("expenses.json"))

In [352]:
df.head(2)

Unnamed: 0,_id,amount,date,category,description,creditMom,creditOther,yearMonth,currency,amountInEuro,user,monthly,subscriptionId,consolidated,cardId,cardMonth,weekendId,cardYear,additionalData,tags
0,583dd4f72ab79c00017d9895,14.35,20161128,SUPERMERCATO,Supermercato (Esselunga),,,201612,EUR,14.35,nicolas.matteazzi@gmail.com,False,,,,,,,,
1,583dd5382ab79c00017d9897,15.0,20161128,VIAGGI,Ricarica Prepagata (Boston hotel),,,201612,EUR,15.0,nicolas.matteazzi@gmail.com,False,,,,,,,,


In [256]:
features = df[df["user"] == "nicolas.matteazzi@gmail.com"][["description", "category"]]
features.tail(5)

Unnamed: 0,description,category
5776,Google Photos,SVAGO
5777,Adobe,SVAGO
5778,Car insurance,AUTO
5779,Apple Tv+,SVAGO
5780,Mc Donald,FOOD


### Vectorizing the Descriptions
The vectorization process basically:
 * splits all descriptions into tokens and 
 * perform **stemming**


In [353]:
stopwords_vocab = set(stopwords.words("english")).union(set(stopwords.words("italian")))
# stopwords_vocab = set(stopwords.words("danish"))
punctuation_symbols = list(string.punctuation)

In [354]:
stemmer = PorterStemmer()

In [355]:
def tokenize(desc): 
    '''
    Tokenizes a description and applies the following:
     - removes stop words
     - removes useless characters (e.g. '-')
     - stems the word
    '''
    # Split descriptions into tokens
    tokens = desc.split()
    
    # Prepare to remove any punctuation in the word
    translation_table = str.maketrans('', '', ''.join(punctuation_symbols))
    
    # Filter out stopwords, remove numeric characters from the word
    tokens = [re.sub(r'\d', '', word.lower().translate(translation_table)) for word in tokens if word.lower() not in stopwords_vocab]
    tokens = [word for word in tokens if word not in punctuation_symbols and not word.isdigit()]
    
    
    # Perform stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join the tokens, as needed by TF-IDF
    tokens = " ".join(tokens)
    
    return tokens

In [356]:
# Create a new column "tokens" that contains the processed descriptions
features.loc[:,"tokens"] = features["description"].apply(tokenize)

In [357]:
range_low = np.random.randint(0, len(features))
features.iloc[range_low:range_low + 10]

Unnamed: 0,description,category,tokens
4594,Gjensidige,AUTO,gjensidig
4595,A-kasse,VARIE,akass
4596,Vuggestue 09.2023,VARIE,vuggestu
4597,Fine hospital,AUTO,fine hospit
4598,Mc Donald,FOOD,mc donald
4599,Supermarket,SUPERMERCATO,supermarket
4600,Benzina,AUTO,benzina
4604,Hovmand heatpump tranche 1,HOME,hovmand heatpump tranch
4605,Hovmand service,HOME,hovmand servic
4606,Panodil pharmacy,SALUTE,panodil pharmaci


In [358]:
# Split the dataset between test and train
train_df, test_df = train_test_split(features, test_size=0.2)

In [359]:
# Vectorize the tokens both for test and train datasets
tfidf_vectorizer = TfidfVectorizer()

X_train = tfidf_vectorizer.fit_transform(train_df["tokens"])

# Here we only transform because we need to consider the test set as if it was 
# a new set of data, not seen by the model
X_test = tfidf_vectorizer.transform(test_df["tokens"])


In [360]:
model = MultinomialNB()
model.fit(X_train, train_df["category"])

In [361]:
# Evaluate the accuracy of the model
# 1. Predict 
Y_test = model.predict(X_test)

# 2. Evaluate
accuracy = accuracy_score(test_df["category"], Y_test)

print(accuracy)

0.7424942263279446


### Trying a different vectorizer

In [362]:
binary_vectorizer = CountVectorizer(binary=True)

X_train = binary_vectorizer.fit_transform(train_df["tokens"])
X_test = binary_vectorizer.transform(test_df["tokens"])


In [363]:
train_df["tokens"]

1630         supermarket føtex
2217                       coc
2005    starbuck coffe hous mo
528                    fastweb
3168                    milano
                 ...          
5008             present peter
1155            parcheggio cph
4721               supermarket
3476               supermarket
3299                       car
Name: tokens, Length: 3464, dtype: object

In [364]:
model = MultinomialNB()
model.fit(X_train, train_df["category"])

In [365]:
# Evaluate the accuracy of the model
# 1. Predict 
Y_test = model.predict(X_test)
Y_train = model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(test_df["category"], Y_test)
train_accuracy = accuracy_score(train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Training accuracy: 0.8571
Test accuracy: 0.7575


In [366]:
comparison_df = pd.DataFrame(data={"Tokens": test_df["tokens"], "Expected Category": test_df["category"], "Predicted Category": Y_test})

In [367]:
comparison_df[comparison_df["Expected Category"] != comparison_df["Predicted Category"]].head(50)

Unnamed: 0,Tokens,Expected Category,Predicted Category
1037,paludan,FOOD,AUTO
5515,deliveroo pari,VARIE,VIAGGI
5216,paypal courserainc z,SVAGO,VARIE
4558,movi rental,SVAGO,AUTO
2285,bed,FURNITURE,SUPERMERCATO
5081,insur mobil pay caro,HOME,AUTO
3946,dog food,PET,VARIE
2012,royal arena,USCITE,AUTO
5702,googl cloud sccxg milan,SVAGO,VARIE
300,pranzo ikea,FOOD,FURNITURE


### Trying One-hot encoding

In [368]:
oh_train_df = train_df.copy()

oh_train_df["split_tokens"] = oh_train_df["tokens"].apply(str.split)
oh_train_df["split_tokens"]

1630           [supermarket, føtex]
2217                          [coc]
2005    [starbuck, coffe, hous, mo]
528                       [fastweb]
3168                       [milano]
                   ...             
5008               [present, peter]
1155              [parcheggio, cph]
4721                  [supermarket]
3476                  [supermarket]
3299                          [car]
Name: split_tokens, Length: 3464, dtype: object

In [369]:
def create_vocab(list_of_tokens): 
    """Creates a vocabulary dict of the list of tokens used in the training data. 
    
    The dictionnary is an index dictionnary, that assign to each word an index. 
    The dictionnary is meant to be used for "one-hot encoding" or similar
    
    Returns
     - (dict) where the key is the word and the value is the index
    """
    
    vocab = {}
    
    idx = 1
    for l in list_of_tokens:
        for token in l: 
            if token not in vocab.keys(): 
                vocab[token] = idx
                idx += 1
                
    # Add the UNK word
    vocab["UNK"] = 0
    
    return vocab

In [370]:
vocab = create_vocab(oh_train_df["split_tokens"].values)

In [371]:
def custom_encode(words: list, vocab: dict, unknown_word: str = "UNK"): 
    
    encoded_words = np.zeros(len(vocab))
    
    # Create an encoding of the words
    for word in words: 
        if word in vocab.keys():
            idx = vocab[word]
        else:
            idx = vocab[unknown_word]
        
        encoded_words[idx] = 1
    
    return encoded_words.tolist()
    
    

In [372]:
# Quick test
# custom_encode("parcheggio lampugnano".split(), vocab)

In [373]:
# Apply to training dataframe
oh_train_df["encoded_tokens"] = oh_train_df["split_tokens"].apply(lambda x: custom_encode(x, vocab))

In [374]:
X_train = oh_train_df["encoded_tokens"].values.tolist()
Y_train = oh_train_df["category"].values.tolist()

In [375]:
model = MultinomialNB()
model.fit(X_train, Y_train)

In [376]:
# Evaluate the accuracy of the model
oh_test_df = test_df.copy()
oh_test_df["split_tokens"] = oh_test_df["tokens"].apply(str.split)
oh_test_df["encoded_tokens"] = oh_test_df["split_tokens"].apply(lambda x: custom_encode(x, vocab))

X_test = oh_test_df["encoded_tokens"].values.tolist()

# 1. Predict 
Y_test = model.predict(X_test)
Y_train_pred = model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(oh_test_df["category"], Y_test)
train_accuracy = accuracy_score(oh_train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Training accuracy: 1.0000
Test accuracy: 0.7552


### Trying other models (RF, MLP, etc..)

This still uses the custom vectorization

In [380]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [381]:
rf_model = RandomForestClassifier()

rf_model.fit(X_train, Y_train)

In [379]:
# 1. Predict 
Y_test = rf_model.predict(X_test)
Y_train_pred = rf_model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(oh_test_df["category"], Y_test)
train_accuracy = accuracy_score(oh_train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Training accuracy: 1.0000
Test accuracy: 0.7829


In [382]:
mlp_model = MLPClassifier(hidden_layer_sizes=(30, 10))

mlp_model.fit(X_train, Y_train)

# 1. Predict 
Y_test = mlp_model.predict(X_test)
Y_train_pred = mlp_model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(oh_test_df["category"], Y_test)
train_accuracy = accuracy_score(oh_train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Training accuracy: 1.0000
Test accuracy: 0.8210


It really looks like MLPClassifier has by far the **highest test accuracy** of all the models. <br>
Even though it clearly overfits the training set, it seems to be performing quite ok on the test set. 

We can try to run Grid Search on MLP

In [383]:
from sklearn.model_selection import GridSearchCV

In [None]:
mlp_grid_model = GridSearchCV(MLPClassifier(), param_grid={
    "hidden_layer_sizes": [(30, 10), (50, 20), (100, 50)], 
    "learning_rate_init": [0.001, 0.01], 
    "alpha": [0.0001, 0.01]
})

mlp_grid_model.fit(X_train, Y_train)

# 1. Predict 
Y_test = mlp_grid_model.predict(X_test)
Y_train_pred = mlp_grid_model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(oh_test_df["category"], Y_test)
train_accuracy = accuracy_score(oh_train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")


# Saving the model

In [288]:
my_X = binary_vectorizer.transform(["benzina"])
model.predict(my_X)

array(['AUTO'], dtype='<U12')

In [90]:
with open("expcat-model-cbm", "wb") as f: 
    pickle.dump(model, f)

In [91]:
with open("expcat-vect-cbm", "wb") as  f:
    pickle.dump(binary_vectorizer, f)

In [94]:
stopwords_vocab = set(stopwords.words("danish"))

In [95]:
with open("expcat-stopwords-cbm", "wb") as f: 
    pickle.dump(stopwords_vocab, f)