In [1]:
import pandas as pd
import numpy as np
# Load the labelled transactions; the path is relative to the notebook's working directory.
DATA_PATH = 'transactions_dataset.csv'
dataset = pd.read_csv(DATA_PATH)
# Preview the first rows to check how the columns were parsed.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a saved
# row index - confirm, and consider reading it with index_col=0.
dataset.head()

Unnamed: 0,Text,CategoryId
0,Lidl )))),2
1,Deposit Rent,1
2,McDonalds Banegaards )))),3
3,McDonalds Banegaards )))),3
4,Burger Shack Horsens )))),3


In [2]:
# Fetch the NLTK stop-word lists; the text-cleaning cell reads the 'english' and
# 'danish' lists from this package.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the combined stop-word set and the stemmer once, up front: both are
# loop-invariant, so there is no reason to rebuild them per token / per row.
stop_words = set(stopwords.words('english')) | set(stopwords.words('danish'))
ps = PorterStemmer()


def clean_text(raw_text):
    """Normalise one transaction description into a space-joined string of stems.

    Steps, in order:
      1. replace every character that is not an ASCII letter or digit with a space
         (digits are kept; non-ASCII letters such as ae/oe/aa ligatures are dropped);
      2. lower-case and split on whitespace;
      3. drop English and Danish stop words;
      4. Porter-stem each remaining token exactly once.

    Stop words are removed before stemming so that tokens are compared against
    the stop-word lists in their original (unstemmed) form, and each token is
    stemmed a single time rather than once per stop-word language.

    Parameters
    ----------
    raw_text : str
        The raw transaction text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    tokens = re.sub('[^a-zA-Z0-9]', ' ', raw_text).lower().split()
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)


number_of_rows = dataset.shape[0]
# Iterate over the column values directly instead of looking rows up by label,
# so the loop does not depend on the frame having a default 0..n-1 index.
corpus = [clean_text(raw_text) for raw_text in dataset['Text']]

# MORE OPTIONS:
# 1) remove parts of the string that may contain sensitive information (e.g. contract number) or
#    that are already described by other features (e.g. date, transaction amount)

In [4]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Cap the vocabulary at 200 terms.
cv = CountVectorizer(max_features = 200)
# Dense document-term count matrix built from the cleaned corpus.
# NOTE(review): the vectorizer is fitted on the whole corpus before the split below,
# so vocabulary selection also sees the test rows - consider fitting on the
# training split only.
X = cv.fit_transform(corpus).toarray()
# Select the label by column name rather than by position: with the columns shown
# by dataset.head() ('Unnamed: 0', 'Text', 'CategoryId'), position 1 is the text
# column, not the category id.
y = dataset['CategoryId']

# Splitting the dataset into the Training set and Test set
# (80/20 split; random_state fixed so the split is reproducible)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [5]:
# Gaussian Naive Bayes: train on the training split
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB().fit(X_train, y_train)

# Score the held-out test split
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
# The confusion matrix is computed and kept in `cm` but not displayed by this cell.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
acc

0.905511811023622

In [6]:
## Logistic regression classification
# Train with the library's default settings on the training split
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression().fit(X_train, y_train)

# Score the held-out test split
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
# The confusion matrix is computed and kept in `cm` but not displayed by this cell.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
acc

0.9212598425196851

In [7]:
## Random forest tree classification
# Train a 10-tree forest (entropy split criterion, fixed seed) on the training split
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Score the held-out test split
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
# The confusion matrix is computed and kept in `cm` but not displayed by this cell.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
acc

0.937007874015748

In [8]:
# Category names listed in id order: the position in this tuple is the category id,
# so the resulting dict maps 0..9 to the names below.
categories = dict(enumerate((
    'Automobile and Transport',
    'Housing and Real-Estate',
    'Groceries',
    'Recreation and Leisure',
    'Health and Well Being',
    'Hobby and Knowledge',
    'Clothes and Equipment',
    'Cash and Credit',
    'Financial Services',
    'Other',
)))


def get_category_by_id(id):
    """Return the category name stored in `categories` for the given id.

    Raises KeyError if the id is not a key of `categories`.
    """
    return categories[id]

In [9]:
# Spot-check a few hand-written descriptions with `classifier`, i.e. whichever model
# was fitted last (the random forest when the cells are run in order).
# NOTE(review): these strings go straight into cv.transform without the stop-word
# removal / stemming applied to the training corpus - confirm that is intended.
inputs = ['lidl', 'netto ))', 'netflix', 'kfc', 'rent']
input_features = cv.transform(inputs)
predictions = classifier.predict(input_features)
# Map each input description to the name of its predicted category.
{description: get_category_by_id(category_id) for description, category_id in zip(inputs, predictions)}

{'kfc': 'Other',
 'lidl': 'Groceries',
 'netflix': 'Other',
 'netto ))': 'Groceries',
 'rent': 'Housing and Real-Estate'}