In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the declarations workbook into the notebook-wide DataFrame `df`
df = pd.read_excel('dataset.xlsx')

# Normalize the text columns used below to lower case
# (declared cost category, rejection reason, declaration description).
for text_column in (
    'KOSTENRUBRIEK - declared',
    'FLC: REDEN VERWERPING',
    'BESCHRIJVING DECLARATIE',
):
    df[text_column] = df[text_column].str.lower()

Unique flags

In [25]:
# A single "FLC: REDEN VERWERPING" cell may hold several reasons separated by '|'.
# Split into one column per reason, flatten to a single Series (stack drops the
# missing entries), trim surrounding whitespace and keep each distinct value once.
flag_table = df["FLC: REDEN VERWERPING"].str.split('|', expand=True)
all_flags = flag_table.stack().str.strip().unique()

# Header followed by one flag per line
print("\n".join(["Unique Flags:", *map(str, all_flags)]))

Unique Flags:
the cost was declared in the wrong category
double declaration
please specify the correct invoiced amount
costs that are declared too late, can only be reimbursed for 50%
purchase not eligible for subsidy
please specify the correct supplier


In [26]:
# Mapping: source column -> {phrase to search for -> name of the 0/1 column to create}
phrases = {
    "FLC: REDEN VERWERPING": {
        "double declaration": "double",
        "too late": "late",
        "correct invoiced amount": "amount",
        "supplier": "supplier",
        "wrong category": "category",
        "eligible": "eligible"
    }
}

# Build one indicator column per phrase: 1 when the source text contains the
# phrase (case-insensitive), 0 otherwise. Missing source text counts as 0.
# Note: str.contains interprets each phrase as a regular expression by default.
for source_column, phrase_to_flag in phrases.items():
    source_text = df[source_column]
    for needle, flag_column in phrase_to_flag.items():
        matches = source_text.str.contains(needle, case=False, na=False)
        df[flag_column] = matches.astype(int)

Too late

In [37]:
# Parse the declared invoice date; values that cannot be parsed become NaT
invoice_date = pd.to_datetime(df['DATUM FACTUUR - DECLARED'], errors='coerce')
df['DATUM FACTUUR - DECLARED'] = invoice_date

# "latest" = invoice date shifted six calendar months forward
# (already a datetime column, so no further conversion is needed)
df['latest'] = invoice_date + pd.DateOffset(months=6)

# Parse the declaration date (no coercion here: unparseable values raise)
declaration_col = 'DECLARATIEDATUM (can be assumed to be close to payment date)'
df[declaration_col] = pd.to_datetime(df[declaration_col])

# 'pred.late' is 1 when the declaration date lies at least one whole day after
# "latest", else 0. A missing date on either side yields 0.
days_past_deadline = (df[declaration_col] - df['latest']).dt.days
df['pred.late'] = (days_past_deadline > 0).astype(int)

Wrong amount (only works for amounts in EUR)

In [41]:
# 'pred.amount' is 1 when the amount extracted from the invoice is lower than the
# declared amount, else 0 (a missing amount on either side yields 0).
# NOTE(review): only over-declaration is flagged; an extracted amount that is
# HIGHER than the declared one passes. Confirm this is intended.
amount_gap = df['BETAALD BEDRAG - extracted from invoice'] - df['BETAALD BEDRAG - declared ']
df['pred.amount'] = (amount_gap < 0).astype(int)

Wrong supplier

In [42]:
def _normalize_supplier(value):
    """Return a supplier name trimmed and case-folded for comparison.

    Non-string values (e.g. NaN for a missing supplier) are returned unchanged,
    so they compare exactly as they did before normalization.
    """
    return value.strip().casefold() if isinstance(value, str) else value


# Compare supplier names ignoring letter case and surrounding whitespace, so that
# e.g. "ACME BV" and "Acme bv " are not reported as different suppliers.
supplier_extracted = df['LEVERANCIER - EXTRACTED'].map(_normalize_supplier)
supplier_declared = df['LEVERANCIER - DECLARED'].map(_normalize_supplier)

# 'pred.supplier' is 1 when the normalized names differ, else 0.
# A missing supplier never equals anything (NaN != NaN), so such rows stay flagged.
df['pred.supplier'] = (supplier_extracted != supplier_declared).astype(int)

Wrong category

In [43]:
# Training data: rows whose 'category' flag is 0. Their declared cost category
# is used as the target label.
df2 = df.loc[df['category'] == 0]

# Keep a CSV copy of this subset (no index column)
df2.to_csv('cat_data.csv', index=False)

# Feature: free-text declaration description; target: declared cost category
X, y = df2['BESCHRIJVING DECLARATIE'], df2['KOSTENRUBRIEK - declared']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary (English stop words removed) from the training text
# only, then apply that same vocabulary to the held-out text
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a Multinomial Naive Bayes classifier on the TF-IDF features
classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the classifier on the held-out rows
predictions = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(
    f'Accuracy: {accuracy:.2f}',
    '\nClassification Report:',
    classification_rep,
    sep='\n',
)

Accuracy: 0.92

Classification Report:
                             precision    recall  f1-score   support

                  equipment       0.95      0.95      0.95        19
          external services       0.87      0.91      0.89        22
infrastructure and building       0.94      0.94      0.94        16
          preparation costs       0.90      0.82      0.86        22
               travel costs       0.94      1.00      0.97        16

                   accuracy                           0.92        95
                  macro avg       0.92      0.92      0.92        95
               weighted avg       0.92      0.92      0.92        95



In [46]:
# Rows whose 'category' flag is 1, i.e. whose rejection reason mentions
# "wrong category". For these rows the declared category is the one that was
# rejected, so the model is useful when its prediction DISAGREES with it.
df_category_1 = df[df['category'] == 1]

# Text to classify and the declared (rejected) category to compare against
X_category_1 = df_category_1['BESCHRIJVING DECLARATIE']
y_category_1 = df_category_1['KOSTENRUBRIEK - declared']

# Reuse the already-fitted TF-IDF vectorizer and classifier
X_category_1_tfidf = vectorizer.transform(X_category_1)
predictions_category_1 = classifier.predict(X_category_1_tfidf)

# Agreement between the prediction and the declared category on these rows
accuracy_category_1 = accuracy_score(y_category_1, predictions_category_1)
classification_rep_category_1 = classification_report(y_category_1, predictions_category_1)

# Share of these rows where the prediction differs from the declared category.
# This is 1 - accuracy and was previously printed under the label "Accuracy",
# contradicting the accuracy line of the report printed right below it.
detection_rate_category_1 = 1 - accuracy_category_1

print(f'Detection rate (prediction differs from declared category): {detection_rate_category_1:.2f}')
print(f'Agreement with declared category (accuracy): {accuracy_category_1:.2f}')
# The report is scored against the declared labels, so low scores are expected here
print('\nClassification Report:')
print(classification_rep_category_1)

Accuracy: 0.95

Classification Report:
                             precision    recall  f1-score   support

                  equipment       0.00      0.00      0.00         8
          external services       0.00      0.00      0.00         8
infrastructure and building       0.00      0.00      0.00         3
          preparation costs       0.25      0.50      0.33         2
               travel costs       0.00      0.00      0.00         1

                   accuracy                           0.05        22
                  macro avg       0.05      0.10      0.07        22
               weighted avg       0.02      0.05      0.03        22



In [34]:
# Write the current state of df to a new workbook, without the index column
output_file_path = 'output_file.xlsx'
df.to_excel(output_file_path, index=False)

# Confirm where the file was written
print("Updated Excel file saved to: {}".format(output_file_path))

Updated Excel file saved to: output_file.xlsx
