In [1]:
import re
from statistics import mean

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# evaluation
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the semicolon-delimited CSV file into the DataFrame `df`.
csv_data_file = 'Exercise 1 - Transaction Classification - Data Set.csv'
df = pd.read_csv(filepath_or_buffer=csv_data_file, delimiter=';')

In [3]:
# Show the first 5 rows (the bare last expression is rendered as the cell output).
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Auftragskonto,Buchungstag,Valutadatum,Buchungstext,Verwendungszweck,Beguenstigter/Zahlungspflichtiger,Kontonummer,BLZ,Betrag,Waehrung,label
0,0,89990201.0,28.07.2016,28.07.2016,Lohn / Gehalt,Gehalt Adorsys GmbH & Co. KG End-To-End-Ref.: ...,Adorsys GmbH & Co. KG,7807800780,25190001,2000.0,EUR,income
1,1,89990201.0,27.07.2016,27.07.2016,Miete,Byladem1Sbt De12773501123456789889 Miete Beuth...,Georg Tasche,DE31251900019123456780,VOHADE2HXXX,-670.0,EUR,living
2,2,89990201.0,21.07.2016,21.07.2016,Bargeld,"21.07/16.34Uhr Nuernberg All Eur 70,00 Geb.Eur...",Bargeld,9999900780,25190001,-70.0,EUR,private
3,3,89990201.0,20.07.2016,20.07.2016,Lebensmittel / Getraenke,2831 Edeka Neubauer Nuernb.//Nuernb 2016-07-20...,Kartenzahlung,9736000780,25190001,-73.21,EUR,standardOfLiving
4,4,89990201.0,18.07.2016,18.07.2016,Spontanausgabe,Amazon,neue Playstation,9988776655,25125100,-363.0,EUR,leisure


In [4]:
# Number of rows per category/label, preceded by a hand-written two-line header.
label_counts = df['label'].value_counts()
print("label" + " " * 15 + "count")
print("-" * 25)
print(label_counts)


label               count
-------------------------
leisure             65
standardOfLiving    47
finance             33
living              26
private             21
income              17
Name: label, dtype: int64


In [5]:
# Build the two text columns used for feature extraction:
#   df['accumulated_data']  => the columns 'Buchungstext', 'Verwendungszweck' and
#                              'Beguenstigter/Zahlungspflichtiger', joined with single spaces
#   df['preprocessed_data'] => df['accumulated_data'] with every digit character removed
text_columns = ['Buchungstext', 'Verwendungszweck', 'Beguenstigter/Zahlungspflichtiger']
# fillna/astype guard the join: an empty CSV cell is loaded as float NaN, and
# ' '.join() raises a TypeError on any non-string item.
df['accumulated_data'] = df[text_columns].fillna('').astype(str).apply(' '.join, axis=1)
# Raw string literal: '\d' in a plain string is an invalid escape sequence and
# triggers a warning on current Python versions.
pattern = re.compile(r'\d')
df['preprocessed_data'] = df['accumulated_data'].apply(lambda text: pattern.sub('', text))

In [6]:
# CountVectorizer with df['accumulated_data'] (no preprocessing)
# no words to be left out

count_vectorizer = CountVectorizer(lowercase=True)
count_vectorizer.fit(df['accumulated_data'])
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The fitted `vocabulary_` mapping (term -> column index) carries the same
# information, so the feature list is rebuilt from it in column order.
vocabulary = count_vectorizer.vocabulary_
feature_names = sorted(vocabulary, key=vocabulary.get)
# 423 features
print(len(feature_names))
# features (the corresponding words)
print(feature_names)

423
['00', '000', '0000000000', '001', '0084632', '00eur0', '01', '016', '0164378887929309', '01e', '01t06', '01t11', '01t17', '02', '02t13', '02t14', '02t17', '02t22', '03', '03t12', '03t16', '04', '04032016', '04t17', '05', '0580631', '05t09', '05t23', '06', '07', '07folgenr', '08', '09', '09t07', '09t17', '10', '100', '10103084736793zalando', '10316011', '10355311', '10355499', '10355735', '10355942', '10355969', '10355999', '10t17', '11', '1137', '116', '12', '12272140', '122721405', '13', '13t', '14', '140', '14t12', '15', '150244534', '150251073', '150257651', '15t17', '16', '161080109883418', '1612', '162146', '1637782719640441', '16t17', '17', '1704', '18', '18t21', '19', '19t15', '19t17', '1u1', '20', '2001136', '201', '2016', '2020', '2099', '20t15', '20t17', '21', '2103', '22', '2202002154', '220200215414', '23', '230320211004393201252017040elv6520', '23t11', '23t13', '24', '2410573888643087', '2411354884929964', '24t12', '25', '26', '26folgenr', '26t15', '27', '279', '27fol

In [7]:
# CountVectorizer with df['preprocessed_data'] (numbers eliminated)
# 16 stop words to be left out
stop_words = ['aeu', 'all', 'bylademsbt', 'blz', 'bp', 'ccbadexxx', 'ga', 'gaa',
              'geb', 'rfalld', 'spdudexxx', 'sskndexxx', 'to', 'ts', 'ue', 'vag']
count_vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words)
count_vectorizer.fit(df['preprocessed_data'])
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The fitted `vocabulary_` mapping (term -> column index) carries the same
# information, so the feature list is rebuilt from it in column order.
vocabulary = count_vectorizer.vocabulary_
feature_names = sorted(vocabulary, key=vocabulary.get)
# 221 features
print(len(feature_names))
# features (the corresponding words)
print(feature_names)

221
['abonnement', 'abschlag', 'abschluss', 'adorsys', 'aenderungen', 'ag', 'alld', 'amazon', 'ankenvers', 'anna', 'apotheke', 'aramark', 'ard', 'atm', 'august', 'aus', 'ausland', 'auslandsrei', 'auszahlung', 'bank', 'bankhaus', 'bar', 'bargeld', 'bears', 'beitrags', 'beitragsservice', 'belastung', 'bequ', 'beuthener', 'bil', 'billpay', 'bistro', 'breuninger', 'cafe', 'california', 'ccb', 'celona', 'christina', 'city', 'classic', 'co', 'comix', 'company', 'dank', 'danke', 'dankt', 'dauerauftrag', 'de', 'deco', 'deutsche', 'deutschland', 'dietrich', 'digital', 'dm', 'doris', 'dradio', 'drogerie', 'drogeriemarkt', 'edeka', 'eg', 'einfach', 'einmalsparen', 'einzahlung', 'einzugsermächtigung', 'elv', 'em', 'end', 'erg', 'esso', 'eu', 'eur', 'euro', 'europe', 'euroscheck', 'fein', 'fil', 'folgenr', 'foodora', 'friends', 'fuerth', 'fuerthermare', 'ganz', 'gas', 'gastro', 'gastronomie', 'gastst', 'gaststaette', 'gehalt', 'geldanlage', 'georg', 'geschenk', 'getraenke', 'glas', 'gmb', 'gmbh', '

In [8]:
# Turn df['preprocessed_data'] into a dense array of token counts using the
# already fitted vectorizer (this array is what the classifier is trained on).
sparse_counts = count_vectorizer.transform(df['preprocessed_data'])
vectorized_messages = sparse_counts.toarray()

In [9]:
# Fit a Gaussian Naive Bayes classifier on the complete data set.
# fit() returns the estimator itself, so construction and training can be chained.
gnb = GaussianNB().fit(vectorized_messages, df['label'])

In [10]:
# Basic sanity check: predict on the very same rows the model was trained on.
# Note that, despite its name, y_test holds the *predicted* labels here.
full_set_counts = count_vectorizer.transform(df['preprocessed_data']).toarray()
y_test = gnb.predict(full_set_counts)
y_real = df['label'].tolist()
full_set_accuracy = accuracy_score(y_true=y_real, y_pred=y_test)
print(f"accuracy_score: {full_set_accuracy}")

accuracy_score: 1.0


In [11]:
# [evalA]: preprocessed, omitting stop words
# evaluation using k-fold with k=10
X = df['preprocessed_data']
y = df['label']
# NOTE(review): with shuffle=False every fold is a contiguous block of rows in file
# order. If the CSV is ordered in any way, the folds will not be representative;
# consider StratifiedKFold(shuffle=True, random_state=...) - to be confirmed.
kf = KFold(n_splits=10, shuffle=False)

# 16 stop words to omit
stop_words = ['aeu', 'all', 'bylademsbt', 'blz', 'bp', 'ccbadexxx', 'ga', 'gaa',
              'geb', 'rfalld', 'spdudexxx', 'sskndexxx', 'to', 'ts', 'ue', 'vag']

accuracy_scores = []

for fold_no, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # KFold yields positional indices, so rows are selected with .iloc. Plain
    # X[...] is a label lookup and is only right for the default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The vocabulary is learned from the training fold only.
    count_vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words)
    vectorized_messages = count_vectorizer.fit_transform(X_train).toarray()
    gnb = GaussianNB().fit(vectorized_messages, y_train)
    result = gnb.predict(count_vectorizer.transform(X_test).toarray())
    # accuracy_score expects (y_true, y_pred).
    acc_score = accuracy_score(y_test, result)
    print(f"{fold_no:>2}/{kf.n_splits}:\t{acc_score}")
    accuracy_scores.append(acc_score)

print(f"mean:\t{mean(accuracy_scores)}")

 1/10:	1.0
 2/10:	1.0
 3/10:	0.8095238095238095
 4/10:	0.8095238095238095
 5/10:	0.7619047619047619
 6/10:	0.8095238095238095
 7/10:	0.9047619047619048
 8/10:	0.9047619047619048
 9/10:	1.0
10/10:	1.0
mean:	0.9


In [12]:
# [evalB]: no-preprocessing, no stop words
# evaluation using k-fold with k=10
X = df['accumulated_data']
y = df['label']
# NOTE(review): with shuffle=False every fold is a contiguous block of rows in file
# order. If the CSV is ordered in any way, the folds will not be representative;
# consider StratifiedKFold(shuffle=True, random_state=...) - to be confirmed.
kf = KFold(n_splits=10, shuffle=False)

accuracy_scores = []

for fold_no, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # KFold yields positional indices, so rows are selected with .iloc. Plain
    # X[...] is a label lookup and is only right for the default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The vocabulary is learned from the training fold only.
    count_vectorizer = CountVectorizer(lowercase=True)
    vectorized_messages = count_vectorizer.fit_transform(X_train).toarray()
    gnb = GaussianNB().fit(vectorized_messages, y_train)
    result = gnb.predict(count_vectorizer.transform(X_test).toarray())
    # accuracy_score expects (y_true, y_pred).
    acc_score = accuracy_score(y_test, result)
    print(f"{fold_no:>2}/{kf.n_splits}:\t{acc_score}")
    accuracy_scores.append(acc_score)

print(f"mean:\t{mean(accuracy_scores)}")

 1/10:	1.0
 2/10:	1.0
 3/10:	0.8095238095238095
 4/10:	0.8571428571428571
 5/10:	0.8095238095238095
 6/10:	0.9047619047619048
 7/10:	0.8571428571428571
 8/10:	1.0
 9/10:	1.0
10/10:	1.0
mean:	0.9238095238095239


In [13]:
# [evalC]: preprocessing, omitting stop words (including 'eu', 'eur' and 'euro')
# evaluation using k-fold with k=10
X = df['preprocessed_data']
y = df['label']
# NOTE(review): with shuffle=False every fold is a contiguous block of rows in file
# order. If the CSV is ordered in any way, the folds will not be representative;
# consider StratifiedKFold(shuffle=True, random_state=...) - to be confirmed.
kf = KFold(n_splits=10, shuffle=False)

# 19 stop words to omit
stop_words = ['aeu', 'all', 'bylademsbt', 'blz', 'bp', 'ccbadexxx', 'eu', 'eur', 'euro',
              'ga', 'gaa', 'geb', 'rfalld', 'spdudexxx', 'sskndexxx', 'to', 'ts', 'ue', 'vag']

accuracy_scores = []

for fold_no, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # KFold yields positional indices, so rows are selected with .iloc. Plain
    # X[...] is a label lookup and is only right for the default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The vocabulary is learned from the training fold only.
    count_vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words)
    vectorized_messages = count_vectorizer.fit_transform(X_train).toarray()
    gnb = GaussianNB().fit(vectorized_messages, y_train)
    result = gnb.predict(count_vectorizer.transform(X_test).toarray())
    # accuracy_score expects (y_true, y_pred).
    acc_score = accuracy_score(y_test, result)
    print(f"{fold_no:>2}/{kf.n_splits}:\t{acc_score}")
    accuracy_scores.append(acc_score)

print(f"mean:\t{mean(accuracy_scores)}")

 1/10:	1.0
 2/10:	1.0
 3/10:	0.7619047619047619
 4/10:	0.8095238095238095
 5/10:	0.7619047619047619
 6/10:	0.7142857142857143
 7/10:	0.9047619047619048
 8/10:	0.8095238095238095
 9/10:	1.0
10/10:	1.0
mean:	0.8761904761904762


In [14]:
# investigating the worst split of [evalC]
# Relies on X, y, kf, stop_words and accuracy_scores as left behind by the [evalC] cell.
# The lowest-scoring fold is looked up from the recorded scores instead of being
# hard-coded (it is position 5, i.e. fold 6/10, in the recorded run).
worst_fold = accuracy_scores.index(min(accuracy_scores))
train_index, test_index = list(kf.split(X))[worst_fold]
# KFold yields positional indices, so rows are selected with .iloc. Plain
# X[...] is a label lookup and is only right for the default 0..n-1 index.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
count_vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words)
vectorized_messages = count_vectorizer.fit_transform(X_train).toarray()
gnb = GaussianNB().fit(vectorized_messages, y_train)
result = gnb.predict(count_vectorizer.transform(X_test).toarray())
# accuracy_score expects (y_true, y_pred).
acc_score = accuracy_score(y_test, result)
print(acc_score)
# Rows of the confusion matrix are the true labels, columns the predicted labels,
# both in the order printed on the line before it.
labels = df.label.unique()
print(labels)
print(confusion_matrix(y_test, result, labels=labels))

0.7142857142857143
['income' 'living' 'private' 'standardOfLiving' 'leisure' 'finance']
[[1 0 0 0 0 0]
 [0 3 0 1 0 0]
 [0 0 1 0 0 0]
 [0 0 0 3 0 0]
 [0 1 0 4 7 0]
 [0 0 0 0 0 0]]
