## Bibliotheken importieren

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import grad

## Code zum Hochladen von Dateien

In [None]:
# Colab-specific step: `files.upload()` is called to bring a local file into the
# runtime. The captured output of this cell shows `new_philldata1.csv` being saved,
# which is the file name the loading cell reads afterwards.
from google.colab import files
# The return value is kept in `uploaded`; it is not used by the other visible cells.
uploaded = files.upload()


Saving new_philldata1.csv to new_philldata1.csv


## Verarbeitung großer Datenmengen mit Dask und Standardisierung numerischer Spalten

In [None]:
import dask.dataframe as dd

# Load the dataset lazily with Dask, forcing string dtype for columns whose
# inferred dtype is problematic.
data_path = 'new_philldata1.csv'
data = dd.read_csv(
    data_path,
    dtype={
        'contract_number': 'object',
        'sub_obj': 'object'
    }
)

# Select only numeric columns for standardization
numeric_cols = data.select_dtypes(include=['number']).columns

# Compute mean / std over the WHOLE dataset. Calling df.mean() / df.std() inside
# map_partitions would use per-partition statistics, so every partition would be
# scaled differently whenever the CSV is split into more than one partition.
col_means = data[numeric_cols].mean().compute()
col_stds = data[numeric_cols].std().compute()

# Constant (or otherwise degenerate) columns have std == 0 or NaN; dividing by that
# turns the entire column into NaN. Use a scale of 1 instead so such a column
# becomes a constant 0 after centering.
col_stds = col_stds.where(col_stds > 0, 1.0)

# Standardize the numeric data with the global statistics (pandas aligns the
# Series of means / stds with the partition's columns by label).
data_numeric = data[numeric_cols].map_partitions(
    lambda df: (df - col_means) / col_stds,
    meta={col: 'float64' for col in numeric_cols}
)

# Write the standardized columns back into the original dataframe as float32
data = data.assign(
    **{col: data_numeric[col].astype('float32') for col in numeric_cols}
)

# Materialize the result as a pandas DataFrame
data = data.compute()

# Check the result
print(data.head())


   fy        fm  check_date   document_no      dept  \
0 NaN  0.941375  2017-04-12  CHEK17119771  1.319343   
1 NaN  1.516865  2017-06-09  ACHD17177233  0.145450   
2 NaN -0.497348  2016-11-25  CHEK17063736  1.466080   
3 NaN -1.648327  2016-07-07  CHEK17000247 -0.955075   
4 NaN -1.648327  2016-07-08  ACHD17000233 -0.074655   

            department_title     char_            character_title sub_obj  \
0                42 COMMERCE -0.672462    02 PURCHASE OF SERVICES     231   
1  26 LICENSES & INSPECTIONS -0.672462    02 PURCHASE OF SERVICES     211   
2                     44 LAW -0.672462    02 PURCHASE OF SERVICES     258   
3                  11 POLICE -0.672462    02 PURCHASE OF SERVICES     260   
4                 23 PRISONS  0.605375  03 MATERIALS AND SUPPLIES     313   

                         sub_obj_title                   vendor_name  \
0                  OVERTIME MEALS 0231                  EAT AT JOE'S   
1                  TRANSPORTATION 0211   L & I Â TRAVEL IMPRES

## Definition von Generator und Kritiker (Critic) für ein GAN-Modell

In [None]:
# Define the Generator and Critic
class Generator(nn.Module):
    """Conditional generator: a three-layer MLP with ReLU activations.

    Args:
        input_dim: Width of the concatenated ``[z, condition]`` input.
        output_dim: Width of the generated sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_width = 128
        stack = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, output_dim),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, z, condition):
        """Concatenate ``z`` and ``condition`` along dim 1 and run the MLP on the result."""
        return self.model(torch.cat((z, condition), dim=1))

class Critic(nn.Module):
    """Conditional critic: a three-layer MLP with LeakyReLU(0.2) producing one score per row.

    Args:
        input_dim: Width of the concatenated ``[x, condition]`` input.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_width = 128
        leak = 0.2
        stack = [
            nn.Linear(input_dim, hidden_width),
            nn.LeakyReLU(leak),
            nn.Linear(hidden_width, hidden_width),
            nn.LeakyReLU(leak),
            nn.Linear(hidden_width, 1),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, x, condition):
        """Concatenate ``x`` and ``condition`` along dim 1 and return the MLP output."""
        return self.model(torch.cat((x, condition), dim=1))

## Berechnung der Gradientenstrafe (Gradient Penalty) für stabileres GAN-Training

In [None]:
# Gradient Penalty
def gradient_penalty(critic, real_data, fake_data, condition):
    """Return the mean squared deviation from 1 of the critic's gradient norm.

    The gradient is taken with respect to random per-row interpolations between
    ``real_data`` and ``fake_data``; ``condition`` is passed to the critic unchanged.

    Args:
        critic: Callable invoked as ``critic(samples, condition)``.
        real_data: Tensor of real samples; its first dimension sets the number of
            interpolation coefficients and its device is used for them.
        fake_data: Tensor of generated samples, combined element-wise with ``real_data``.
        condition: Conditioning tensor forwarded to the critic.

    Returns:
        Scalar tensor; the graph is kept (``create_graph=True``) so the penalty
        can itself be back-propagated.
    """
    n_rows = real_data.size(0)
    # One uniform mixing coefficient per row, broadcast across the feature dimension.
    mix = torch.rand(n_rows, 1).to(real_data.device)
    blended = (mix * real_data + (1 - mix) * fake_data).requires_grad_(True)

    scores = critic(blended, condition)
    (score_grads,) = grad(
        outputs=scores,
        inputs=blended,
        grad_outputs=torch.ones_like(scores),
        create_graph=True,
        retain_graph=True,
    )

    row_norms = score_grads.norm(2, dim=1)
    return ((row_norms - 1) ** 2).mean()


## Trainingsschleife für CTGAN

In [None]:
# Training Loop
def _infer_noise_dim(generator, condition_dim):
    """Derive the noise width from the generator's first ``nn.Linear`` layer.

    The generator input is the concatenation of noise and condition, so the noise
    width is the first linear layer's ``in_features`` minus ``condition_dim``.
    """
    first_linear = next(
        (module for module in generator.modules() if isinstance(module, nn.Linear)),
        None,
    )
    if first_linear is None:
        raise ValueError(
            "noise_dim was not given and cannot be inferred: generator has no nn.Linear layer"
        )
    inferred = first_linear.in_features - condition_dim
    if inferred <= 0:
        raise ValueError(
            "noise_dim was not given and cannot be inferred: generator input width "
            f"{first_linear.in_features} is not larger than condition width {condition_dim}"
        )
    return inferred


def train_ctgan_wgan(generator, critic, data_loader, num_epochs, device,
                     noise_dim=None, n_critic=5, lambda_gp=10):
    """Train a conditional generator / critic pair with the WGAN-GP objective.

    Args:
        generator: Module called as ``generator(z, condition)``.
        critic: Module called as ``critic(samples, condition)``.
        data_loader: Iterable yielding ``(real_data, condition)`` tensor pairs.
        num_epochs: Number of passes over ``data_loader``.
        device: Device the batches and noise are moved to.
        noise_dim: Width of the noise vector ``z``. Previously this was read from
            an undefined-in-function global; when left as ``None`` it is inferred
            from the generator's first linear layer and the condition width.
        n_critic: Critic updates per generator update (default 5, the former
            hard-coded value).
        lambda_gp: Weight of the gradient penalty (default 10, the former
            hard-coded value).

    Returns:
        None. ``generator`` and ``critic`` are updated in place.

    Raises:
        ValueError: If ``noise_dim`` is ``None`` and cannot be inferred.
    """
    optimizer_G = optim.Adam(generator.parameters(), lr=1e-4, betas=(0.5, 0.9))
    optimizer_C = optim.Adam(critic.parameters(), lr=1e-4, betas=(0.5, 0.9))

    for epoch in range(num_epochs):
        for real_data, condition in data_loader:
            real_data, condition = real_data.to(device), condition.to(device)
            batch_size = real_data.size(0)

            if noise_dim is None:
                # Resolved once, on the first batch, then reused.
                noise_dim = _infer_noise_dim(generator, condition.size(1))

            # Train Critic
            for _ in range(n_critic):
                z = torch.randn(batch_size, noise_dim).to(device)
                # detach(): the critic step must not back-propagate into the generator
                fake_data = generator(z, condition).detach()
                critic_real = critic(real_data, condition).mean()
                critic_fake = critic(fake_data, condition).mean()
                gp = gradient_penalty(critic, real_data, fake_data, condition)
                loss_C = critic_fake - critic_real + lambda_gp * gp
                optimizer_C.zero_grad()
                loss_C.backward()
                optimizer_C.step()

            # Train Generator: maximize the critic score of generated samples
            z = torch.randn(batch_size, noise_dim).to(device)
            fake_data = generator(z, condition)
            loss_G = -critic(fake_data, condition).mean()
            optimizer_G.zero_grad()
            loss_G.backward()
            optimizer_G.step()

## Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# `data` must be the preprocessed pandas DataFrame produced by the loading cell.
# Select only numeric features for training
numeric_features = data.select_dtypes(include=['number']).columns
real_data = data[numeric_features].values  # NumPy array holding only the numeric columns

# To evaluate the trained generator, replace the placeholder below with something like:
# num_synthetic_samples = 10000  # Define the number of synthetic samples you want
# noise = torch.randn(num_synthetic_samples, noise_dim).to(device)
# conditions = torch.tensor(conditions_data.values.astype(np.float32)).to(device) # Replace conditions_data with your actual condition data
# synthetic_data = generator(noise, conditions).detach().cpu().numpy()

# Placeholder synthetic dataset for demonstration: uniform noise in [0, 1) with the
# same shape as the real data. It is drawn from a seeded generator so the cell gives
# the same result on every run. Scores obtained with this placeholder say nothing
# about the GAN; they only measure how easily uniform noise is told apart from the
# real data.
num_synthetic_samples = real_data.shape[0]  # Match the number of real samples
num_features = real_data.shape[1]
synthetic_rng = np.random.default_rng(42)
synthetic_data = synthetic_rng.random((num_synthetic_samples, num_features))  # Replace this with your actual synthetic data generation

# Label real rows 0 and synthetic rows 1
real_labels = np.zeros(real_data.shape[0])  # Label for real data
synthetic_labels = np.ones(synthetic_data.shape[0])  # Label for synthetic data

# Combine the data and labels
combined_data = np.vstack((real_data, synthetic_data))
combined_labels = np.hstack((real_labels, synthetic_labels))

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     71695
         1.0       1.00      1.00      1.00     71642

    accuracy                           1.00    143337
   macro avg       1.00      1.00      1.00    143337
weighted avg       1.00      1.00      1.00    143337



## Decision Tree

In [None]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Label real rows 0 and synthetic rows 1
real_labels = np.zeros(real_data.shape[0])  # Label for real data
synthetic_labels = np.ones(synthetic_data.shape[0])  # Label for synthetic data

# Combine the data and labels.
# Deliberately NOT named `data`: that name holds the preprocessed DataFrame, and
# rebinding it to a NumPy array breaks re-running the Random Forest cell
# (`data.select_dtypes` does not exist on an ndarray).
combined_data = np.vstack((real_data, synthetic_data))
combined_labels = np.hstack((real_labels, synthetic_labels))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)

# Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the test set
y_pred = dt_model.predict(X_test)

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print Metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Detailed Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Real", "Synthetic"]))


Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00

Classification Report:
               precision    recall  f1-score   support

        Real       1.00      1.00      1.00     71695
   Synthetic       1.00      1.00      1.00     71642

    accuracy                           1.00    143337
   macro avg       1.00      1.00      1.00    143337
weighted avg       1.00      1.00      1.00    143337



## Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score


# Label real rows 0 (normal) and synthetic rows 1 (anomaly)
real_labels = np.zeros(real_data.shape[0])  # Label for real data (normal)
synthetic_labels = np.ones(synthetic_data.shape[0])  # Label for synthetic data (anomalies)

# Combine data and labels.
# Deliberately NOT named `data`: that name holds the preprocessed DataFrame, and
# rebinding it to a NumPy array breaks re-running the Random Forest cell
# (`data.select_dtypes` does not exist on an ndarray).
combined_data = np.vstack((real_data, synthetic_data))
combined_labels = np.hstack((real_labels, synthetic_labels))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)

# Train Isolation Forest (unsupervised: the labels are not used for fitting)
# NOTE(review): X_train contains both real and synthetic rows in roughly equal
# proportion. If real rows are meant to define "normal", fitting on
# X_train[y_train == 0] may be what is intended - confirm before changing.
iso_forest = IsolationForest(n_estimators=100, contamination="auto", random_state=42)
iso_forest.fit(X_train)

# Predict on the test set
# Isolation Forest predicts -1 for anomalies and 1 for normal points
y_pred_raw = iso_forest.predict(X_test)
y_pred = np.where(y_pred_raw == 1, 0, 1)  # Convert to 0 (normal) and 1 (anomaly)

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print Metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Detailed Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Normal", "Anomaly"]))

Accuracy: 0.56
Precision: 0.63
Recall: 0.28
F1 Score: 0.39

Classification Report:
               precision    recall  f1-score   support

      Normal       0.54      0.84      0.65     71695
     Anomaly       0.63      0.28      0.39     71642

    accuracy                           0.56    143337
   macro avg       0.58      0.56      0.52    143337
weighted avg       0.58      0.56      0.52    143337



## XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score


# Label real rows 0 and synthetic rows 1
real_labels = np.zeros(real_data.shape[0])  # Label for real data
synthetic_labels = np.ones(synthetic_data.shape[0])  # Label for synthetic data

# Combine the data and labels.
# Deliberately NOT named `data`: that name holds the preprocessed DataFrame, and
# rebinding it to a NumPy array breaks re-running the Random Forest cell
# (`data.select_dtypes` does not exist on an ndarray).
combined_data = np.vstack((real_data, synthetic_data))
combined_labels = np.hstack((real_labels, synthetic_labels))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)

# Train XGBoost Classifier.
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as an
# unused parameter, so it only produced a warning.
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    eval_metric='logloss',   # Specify evaluation metric
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print Metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Detailed Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Real", "Synthetic"]))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00

Classification Report:
               precision    recall  f1-score   support

        Real       1.00      1.00      1.00     71695
   Synthetic       1.00      1.00      1.00     71642

    accuracy                           1.00    143337
   macro avg       1.00      1.00      1.00    143337
weighted avg       1.00      1.00      1.00    143337



## LightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score


# Label real rows 0 and synthetic rows 1
real_labels = np.zeros(real_data.shape[0])  # Label for real data
synthetic_labels = np.ones(synthetic_data.shape[0])  # Label for synthetic data

# Combine the data and labels.
# Deliberately NOT named `data`: that name holds the preprocessed DataFrame, and
# rebinding it to a NumPy array breaks re-running the Random Forest cell
# (`data.select_dtypes` does not exist on an ndarray).
combined_data = np.vstack((real_data, synthetic_data))
combined_labels = np.hstack((real_labels, synthetic_labels))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_data, combined_labels, test_size=0.3, random_state=42)

# Convert data into LightGBM dataset format
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Define LightGBM parameters
params = {
    'objective': 'binary',        # Binary classification
    'metric': 'binary_logloss',   # Evaluation metric
    'boosting_type': 'gbdt',      # Gradient Boosting Decision Tree
    'num_leaves': 31,             # Maximum number of leaves in one tree
    'learning_rate': 0.1,         # Step size shrinkage
    'feature_fraction': 0.9       # Randomly select a subset of features
}

# Use a callback for early stopping
# NOTE(review): the test split doubles as the early-stopping validation set, so
# the metrics reported below are not from data the training procedure never saw.
# A separate validation split may be preferable - confirm.
callbacks = [lgb.early_stopping(stopping_rounds=10)]

# Train the LightGBM model
lgb_model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],  # Provide both training and testing data for validation
    callbacks=callbacks                  # Use the early stopping callback
)

# Predict on the test set
y_pred_proba = lgb_model.predict(X_test)  # Probabilities
y_pred = (y_pred_proba > 0.5).astype(int)  # Convert probabilities to binary predictions

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print Metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Detailed Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Real", "Synthetic"]))


[LightGBM] [Info] Number of positive: 167252, number of negative: 167199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027823 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 334451, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500079 -> initscore=0.000317
[LightGBM] [Info] Start training from score 0.000317
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 2.1902e-05	valid_1's binary_logloss: 2.1902e-05
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00

Classification Report:
               precision    recall  f1-score   support

        Real       1.00      1.00      1.00     71695
   Synthetic       1.00      1.00      1.00     71642

    accuracy                           1.00    143337
   macro avg  