In [91]:
import pandas as pd

#### Data sample: _Banca Transilvania transaction report_

In [92]:
# Load the sample transaction report; the path is relative to the notebook's
# working directory.
df = pd.read_csv(
    filepath_or_buffer=r"../data/sample_transactions.csv",
)

In [None]:
# Preview the first five rows of the raw report.
df.head(n=5)

#### Data processing

In [94]:
# "Balance" is handled as text here: commas (presumably thousands separators --
# TODO confirm against the CSV) are dropped before casting to float.
# NOTE: this cell overwrites the column in place, so re-running it without
# reloading `df` fails because `.str` no longer applies to a float column.
df["Balance"] = (
    df["Balance"]
    .str.replace(",", "")
    .astype(float)
)

# Parse day-month-year strings into proper datetimes.
df["Transaction date"] = pd.to_datetime(
    df["Transaction date"],
    format="%d-%m-%Y",
)

In the transaction statement, multiple types of transactions are specified (e.g., incoming, Round-Up, etc.). We keep only the transactions that represent payments from the current account.

In [97]:
# Keep only the rows whose description mentions "Plata"; the original row
# labels are preserved and `.copy()` detaches the result from `df`.
df_payments = df[
    df["Description"].str.contains("Plata")
].copy()

In [None]:
# Preview the first five payment rows.
df_payments.head(n=5)

In [None]:
print("Sample description 1")
# Label-based lookup: `df_payments` keeps the row labels of `df`, so this is
# the row labelled 0, split into its semicolon-separated sections.
df_payments.loc[0, "Description"].split(";")

In [None]:
print("Sample description 2")
# Label-based lookup: `df_payments` keeps the row labels of `df`, so this is
# the row labelled 4, split into its semicolon-separated sections.
df_payments.loc[4, "Description"].split(";")

In [101]:
def keep_first_occurence(string: str) -> str:
    """Drop repeated words, keeping each word at its first position.

    The text is split on whitespace, later duplicates of a word are removed,
    and the surviving words are joined with single spaces in their original
    order. Comparison is case-sensitive.

    Args:
        string: Text to deduplicate.

    Returns:
        The deduplicated text; an empty or whitespace-only input yields "".
    """
    # dict.fromkeys keeps insertion order, unlike set(), whose iteration order
    # for strings is arbitrary and can change between interpreter runs.
    return " ".join(dict.fromkeys(string.split()))

In [102]:
# Tokens removed from the cleaned descriptions by the processing step below.
redundant_words = [
    "RON",
    "RO",
    "RRN",
    "TID",
    "valoare",
    "tranzactie",
    "comision",
]

The transaction description contains different specifications separated by semicolons. We'll keep the second section for now, since it contains the merchant name and some additional, related details. <br>
After this processing step, we'll keep only letters, keep only the first occurrence of each word, remove the redundant words, replace multiple spaces with a single one, and finally strip the text.

In [103]:
# Match redundant tokens as whole words only. Plain substring replacement would
# also cut "RO"/"RON" out of longer words (e.g. "PETROM" -> "PETM").
# Entries are still interpreted as regex patterns, as before.
redundant_pattern = r"\b(?:" + "|".join(redundant_words) + r")\b"

df_payments["Description clean"] = (
    df_payments["Description"]
    # Second semicolon-separated section of the description.
    .str.split(";", expand=True)[1]
    # Anything that is not an ASCII letter becomes a space.
    .str.replace("[^a-zA-Z]", " ", regex=True)
    # Collapse repeated words.
    .apply(keep_first_occurence)
    # Drop the redundant tokens (case-sensitive, whole words).
    .str.replace(redundant_pattern, "", regex=True)
    # Removing tokens leaves gaps; squeeze runs of spaces and trim the ends.
    .str.replace(" +", " ", regex=True)
    .str.strip()
)

In [None]:
# Show every cleaned description as a plain Python list.
df_payments["Description clean"].to_list()