# Toto Expenses: Category Predictor

In [389]:
import sys
import pickle

In [390]:
import json
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings

nltk.download('stopwords')
nltk.download('punkt')

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---
## Data Analysis

In [391]:
# Ingestion of the Expenses in dict objects
def transform_line(line): 
    expense = json.loads(line)
    return expense

In [392]:
# Read the data from the backup file
# Save it into a list
def ingest_data(filename):
    data = []
    
    with open(filename, "r") as file: 
        for idx, line in enumerate(file):
            expense = transform_line(line)
            
            data.append(expense)
    
    return data
        

In [393]:
def to_dataframe(data: list):
    return pd.DataFrame(data)
    

In [394]:
df = to_dataframe(ingest_data("expenses.json"))

In [395]:
df.head(2)

Unnamed: 0,_id,amount,date,category,description,creditMom,creditOther,yearMonth,currency,amountInEuro,user,monthly,subscriptionId,consolidated,cardId,cardMonth,weekendId,cardYear,additionalData,tags
0,583dd4f72ab79c00017d9895,14.35,20161128,SUPERMERCATO,Supermercato (Esselunga),,,201612,EUR,14.35,nicolas.matteazzi@gmail.com,False,,,,,,,,
1,583dd5382ab79c00017d9897,15.0,20161128,VIAGGI,Ricarica Prepagata (Boston hotel),,,201612,EUR,15.0,nicolas.matteazzi@gmail.com,False,,,,,,,,


In [396]:
features = df[df["user"] == "nicolas.matteazzi@gmail.com"][["description", "category"]]
features.tail(5)

Unnamed: 0,description,category
5776,Google Photos,SVAGO
5777,Adobe,SVAGO
5778,Car insurance,AUTO
5779,Apple Tv+,SVAGO
5780,Mc Donald,FOOD


### Vectorizing the Descriptions
The vectorization process basically:
 * splits all descriptions into tokens and 
 * perform **stemming**


In [397]:
stopwords_vocab = set(stopwords.words("english")).union(set(stopwords.words("italian")))
# stopwords_vocab = set(stopwords.words("danish"))
punctuation_symbols = list(string.punctuation)

In [398]:
stemmer = PorterStemmer()

In [399]:
def tokenize(desc): 
    '''
    Tokenizes a description and applies the following:
     - removes stop words
     - removes useless characters (e.g. '-')
     - stems the word
    '''
    # Split descriptions into tokens
    tokens = desc.split()
    
    # Prepare to remove any punctuation in the word
    translation_table = str.maketrans('', '', ''.join(punctuation_symbols))
    
    # Filter out stopwords, remove numeric characters from the word
    tokens = [re.sub(r'\d', '', word.lower().translate(translation_table)) for word in tokens if word.lower() not in stopwords_vocab]
    tokens = [word for word in tokens if word not in punctuation_symbols and not word.isdigit()]
    
    
    # Perform stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join the tokens, as needed by TF-IDF
    tokens = " ".join(tokens)
    
    return tokens

In [400]:
# Create a new column "tokens" that contains the processed descriptions
features.loc[:,"tokens"] = features["description"].apply(tokenize)

In [401]:
range_low = np.random.randint(0, len(features))
features.iloc[range_low:range_low + 10]

Unnamed: 0,description,category,tokens
3628,Vitamins,SALUTE,vitamin
3629,Benzina,AUTO,benzina
3630,B Skat,VARIE,b skat
3632,Books Amazon,SVAGO,book amazon
3633,Vacations Cannes Hot,VIAGGI,vacat cann hot
3634,Supermarket,SUPERMERCATO,supermarket
3639,Supermarket,SUPERMERCATO,supermarket
3640,Parking,AUTO,park
3641,Bastard cafe,USCITE,bastard cafe
3654,Insurances,HOME,insur


In [402]:
# Split the dataset between test and train
train_df, test_df = train_test_split(features, test_size=0.2)

In [403]:
# Vectorize the tokens both for test and train datasets
tfidf_vectorizer = TfidfVectorizer()

X_train = tfidf_vectorizer.fit_transform(train_df["tokens"])

# Here we only transform because we need to consider the test set as if it was 
# a new set of data, not seen by the model
X_test = tfidf_vectorizer.transform(test_df["tokens"])


In [404]:
model = MultinomialNB()
model.fit(X_train, train_df["category"])

In [405]:
# Evaluate the accuracy of the model
# 1. Predict 
Y_test = model.predict(X_test)

# 2. Evaluate
accuracy = accuracy_score(test_df["category"], Y_test)

print(accuracy)

0.733256351039261


### Trying a different vectorizer

In [406]:
binary_vectorizer = CountVectorizer(binary=True)

X_train = binary_vectorizer.fit_transform(train_df["tokens"])
X_test = binary_vectorizer.transform(test_df["tokens"])


In [407]:
train_df["tokens"]

4385                  autorout
3155                 bridg fyn
4990                   currenc
1463                       dvd
709      parcheggio lampugnano
                 ...          
5139                          
4439             lunett soleil
1351                     fotex
4045                photo caro
128     supermercato esselunga
Name: tokens, Length: 3464, dtype: object

In [408]:
model = MultinomialNB()
model.fit(X_train, train_df["category"])

In [409]:
# Evaluate the accuracy of the model
# 1. Predict 
Y_test = model.predict(X_test)
Y_train = model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(test_df["category"], Y_test)
train_accuracy = accuracy_score(train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Training accuracy: 0.8591
Test accuracy: 0.7517


In [460]:
def check_misclassifications(test_df, Y_test): 
    comparison_df = pd.DataFrame(data={"Description": test_df["description"], "Tokens": test_df["tokens"], "Expected Category": test_df["category"], "Predicted Category": Y_test})
    return comparison_df[comparison_df["Expected Category"] != comparison_df["Predicted Category"]]

### Trying One-hot encoding

In [412]:
oh_train_df = train_df.copy()

oh_train_df["split_tokens"] = oh_train_df["tokens"].apply(str.split)
oh_train_df["split_tokens"]

4385                   [autorout]
3155                 [bridg, fyn]
4990                    [currenc]
1463                        [dvd]
709      [parcheggio, lampugnano]
                  ...            
5139                           []
4439             [lunett, soleil]
1351                      [fotex]
4045                [photo, caro]
128     [supermercato, esselunga]
Name: split_tokens, Length: 3464, dtype: object

In [413]:
def create_vocab(list_of_tokens): 
    """Creates a vocabulary dict of the list of tokens used in the training data. 
    
    The dictionnary is an index dictionnary, that assign to each word an index. 
    The dictionnary is meant to be used for "one-hot encoding" or similar
    
    Returns
     - (dict) where the key is the word and the value is the index
    """
    
    vocab = {}
    
    idx = 1
    for l in list_of_tokens:
        for token in l: 
            if token not in vocab.keys(): 
                vocab[token] = idx
                idx += 1
                
    # Add the UNK word
    vocab["UNK"] = 0
    
    return vocab

In [414]:
vocab = create_vocab(oh_train_df["split_tokens"].values)

In [415]:
def custom_encode(words: list, vocab: dict, unknown_word: str = "UNK"): 
    
    encoded_words = np.zeros(len(vocab))
    
    # Create an encoding of the words
    for word in words: 
        if word in vocab.keys():
            idx = vocab[word]
        else:
            idx = vocab[unknown_word]
        
        encoded_words[idx] = 1
    
    return encoded_words.tolist()
    
    

In [416]:
# Quick test
# custom_encode("parcheggio lampugnano".split(), vocab)

In [417]:
# Apply to training dataframe
oh_train_df["encoded_tokens"] = oh_train_df["split_tokens"].apply(lambda x: custom_encode(x, vocab))

In [418]:
X_train = oh_train_df["encoded_tokens"].values.tolist()
Y_train = oh_train_df["category"].values.tolist()

In [419]:
model = MultinomialNB()
model.fit(X_train, Y_train)

In [420]:
# Evaluate the accuracy of the model
oh_test_df = test_df.copy()
oh_test_df["split_tokens"] = oh_test_df["tokens"].apply(str.split)
oh_test_df["encoded_tokens"] = oh_test_df["split_tokens"].apply(lambda x: custom_encode(x, vocab))

X_test = oh_test_df["encoded_tokens"].values.tolist()

# 1. Predict 
Y_test = model.predict(X_test)
Y_train_pred = model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(oh_test_df["category"], Y_test)
train_accuracy = accuracy_score(oh_train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Training accuracy: 1.0000
Test accuracy: 0.7529


### Trying other models (RF, MLP, etc..)

This still uses the custom vectorization

In [421]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [453]:
rf_model = RandomForestClassifier(n_estimators=100)

rf_model.fit(X_train, Y_train)

# 1. Predict 
Y_test = rf_model.predict(X_test)
Y_train_pred = rf_model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(oh_test_df["category"], Y_test)
train_accuracy = accuracy_score(oh_train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Training accuracy: 1.0000
Test accuracy: 0.8106


In [450]:
mlp_model = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=23, alpha=0.1)

mlp_model.fit(X_train, Y_train)

# 1. Predict 
Y_test = mlp_model.predict(X_test)
Y_train_pred = mlp_model.predict(X_train)

# 2. Evaluate
accuracy = accuracy_score(oh_test_df["category"], Y_test)
train_accuracy = accuracy_score(oh_train_df["category"], Y_train)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Training accuracy: 1.0000
Test accuracy: 0.8187


It really looks like MLPClassifier has by far the **highest test accuracy** of all the models. <br>
Even though it clearly overfits the training set, it seems to be performing quite ok on the test set. 

We can try to run Grid Search on MLP

In [425]:
from sklearn.model_selection import GridSearchCV

In [442]:
def fine_tune_mlp(): 
    mlp_grid_model = GridSearchCV(
        MLPClassifier(
            max_iter=500, 
            hidden_layer_sizes=(50, 25), 
            alpha=0.0001, 
            random_state=23
        ), 
        param_grid={
            #"hidden_layer_sizes": [(50, 20), (50, 25), (60, 25)],
            "alpha": [0.0001, 0.001, 0.01]
        }
    )

    mlp_grid_model.fit(X_train, Y_train)

    # 1. Predict 
    Y_test = mlp_grid_model.predict(X_test)
    Y_train_pred = mlp_grid_model.predict(X_train)

    # 2. Evaluate
    accuracy = accuracy_score(oh_test_df["category"], Y_test)
    train_accuracy = accuracy_score(oh_train_df["category"], Y_train)

    print(f"Training accuracy: {train_accuracy:.4f}")
    print(f"Test accuracy: {accuracy:.4f}")
    
    print(f"Best Parameters: {mlp_grid_model.best_params_}")

    return mlp_grid_model.best_estimator_

In [441]:
# fine_tune_mlp()

Training accuracy: 1.0000
Test accuracy: 0.8141
Best Parameters: {'alpha': 0.0001}


In [462]:
comparison = check_misclassifications(oh_test_df, Y_test)
comparison.iloc[0:50]

Unnamed: 0,Description,Tokens,Expected Category,Predicted Category
4583,H&M,hm,CLOTHES,VARIE
3782,Caroline house expenses,carolin hous expens,HOME,XMAS
3147,Pesi fitness,pesi fit,PALESTRA,VARIE
5529,LA HUNESTGERMAIN PARIS 6,hunestgermain pari,VIAGGI,VARIE
4915,Baby car seat,babi car seat,CHILD,AUTO
359,Prelievo Bancomat Orio,prelievo bancomat orio,VIAGGI,VARIE
2389,BURGER KING �09866 Q07\ \MI...,burger king � q mi,VIAGGI,FOOD
4757,Thermometer,thermomet,SALUTE,VARIE
3203,Raccomandata per grondaia,raccomandata grondaia,HOME,VARIE
5500,LIM RIDE COST DUBLIN,lim ride cost dublin,VARIE,VIAGGI


# Saving the model

In [288]:
my_X = binary_vectorizer.transform(["benzina"])
model.predict(my_X)

array(['AUTO'], dtype='<U12')

In [90]:
with open("expcat-model-cbm", "wb") as f: 
    pickle.dump(model, f)

In [91]:
with open("expcat-vect-cbm", "wb") as  f:
    pickle.dump(binary_vectorizer, f)

In [94]:
stopwords_vocab = set(stopwords.words("danish"))

In [95]:
with open("expcat-stopwords-cbm", "wb") as f: 
    pickle.dump(stopwords_vocab, f)