# Toto Expenses: Category Predictor

In [9]:
import sys

In [77]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

nltk.download('stopwords')
nltk.download('punkt')

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---
## Data Analysisstopwords

In [14]:
with open("ml-notebooks/toto/expenses.json") as f: 
    data = json.load(f)

In [15]:
df = pd.json_normalize(data["expenses"])

In [16]:
df.head(2)

Unnamed: 0,id,amount,category,date,description,yearMonth,consolidated,cardId,cardMonth,cardYear,currency,amountInEuro,additionalData,user,monthly,additionalData.supermarketListId,additionalData.source,additionalData.monthId,weekendId
0,64cf2dcde8c58d629bc89789,208.59,VIAGGI,20230827,Restaurant,202306,True,,,,DKK,27.1167,,nicolas.matteazzi@gmail.com,,,,,
1,64eae160abac381fca5d45bf,49.0,SVAGO,20230827,Movie rental,202308,False,,,,DKK,6.37,,nicolas.matteazzi@gmail.com,,,,,


In [17]:
features = df[["description", "category"]]
features.head(20)

Unnamed: 0,description,category
0,Restaurant,VIAGGI
1,Movie rental,SVAGO
2,Supermarket,SUPERMERCATO
3,Supermarket,SUPERMERCATO
4,Supermarket,SUPERMERCATO
5,Mc donald,FOOD
6,B-SKAT 08.2023,VARIE
7,Car tax,AUTO
8,Benzina,AUTO
9,Mc donald,FOOD


### Vectorizing the Descriptions
The vectorization process basically:
 * splits all descriptions into tokens and 
 * perform **stemming**
 
An example:
```
corpus = [
...     'This is the first document.',
...     'This document is the second document.',
...     'And this is the third one.',
...     'Is this the first document?',
... ]
```


In [50]:
stopwords_vocab = set(stopwords.words("english")).union(set(stopwords.words("italian"))).union(set(stopwords.words("french"))).union(set(stopwords.words("danish")))
punctuation_symbols = list(string.punctuation)

In [58]:
stemmer = PorterStemmer()

In [69]:
def tokenize(desc): 
    '''
    Tokenizes a description and applies the following:
     - removes stop words
     - removes useless characters (e.g. '-')
     - stems the word
    '''
    # Split descriptions into tokens
    tokens = desc.split()
    
    # Filter out stopwords
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stopwords_vocab]
    filtered_tokens = [word for word in filtered_tokens if word not in punctuation_symbols]
    
    # Perform stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    # Join the tokens, as needed by TF-IDF
    joined_tokens = " ".join(stemmed_tokens)
    
    return joined_tokens

In [71]:
features["tokens"] = features["description"].apply(tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["tokens"] = features["description"].apply(tokenize)


In [80]:
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(features["tokens"])

In [81]:
X_train

<3272x1321 sparse matrix of type '<class 'numpy.float64'>'
	with 5875 stored elements in Compressed Sparse Row format>

In [83]:
model = MultinomialNB()
model.fit(X_train, features["category"])

In [103]:
X_test = tfidf_vectorizer.transform(["super", "vacanza lido barcellona", "supermarket alghero", "electricity bill", "hotel", "pappa loki", "food loki", "skat"])
model.predict(X_test)

array(['SUPERMERCATO', 'VIAGGI', 'SUPERMERCATO', 'HOME', 'VIAGGI',
       'VARIE', 'VARIE', 'VARIE'], dtype='<U12')