In [2]:
import gym
from gym import spaces
import numpy as np
import pandas as pd

# Load dataset
df = pd.read_csv("synthetic_tax_transactions.csv")

# Convert categorical values to numeric.
# Each distinct value gets an integer code in order of first appearance. The
# mappings stay at module level so codes can be translated back to labels.
category_mapping = {cat: idx for idx, cat in enumerate(df["category"].unique())}
merchant_mapping = {mer: idx for idx, mer in enumerate(df["merchant"].unique())}
payment_mapping = {pay: idx for idx, pay in enumerate(df["payment_method"].unique())}

df["category"] = df["category"].map(category_mapping)
df["merchant"] = df["merchant"].map(merchant_mapping)
df["payment_method"] = df["payment_method"].map(payment_mapping)

# Convert date to numeric (days since first transaction)
df["date"] = pd.to_datetime(df["date"])
df["date"] = (df["date"] - df["date"].min()).dt.days

# Min-max normalize amount into [0, 1].
# When every amount is identical the range is zero; dividing by it would turn
# the whole column into NaN, so fall back to a divisor of 1 (all values -> 0.0).
amount_min = df["amount"].min()
amount_range = df["amount"].max() - amount_min
amount_divisor = amount_range if amount_range != 0 else 1
df["amount"] = (df["amount"] - amount_min) / amount_divisor

class TaxOptimizationEnv(gym.Env):
    """Gym environment that presents tax transactions one at a time.

    On every step the agent sees one transaction and chooses an action:
    0 = don't deduct, 1 = deduct. The reward is +1 when the action equals the
    transaction's ``tax_deductible`` value and -1 otherwise. Transactions are
    served in shuffled order and reshuffled after each full pass.

    NOTE(review): the last observation component is the ``tax_deductible``
    label itself, i.e. the agent can read the correct answer directly from its
    input. It is kept here so the observation shape stays (5,); consider
    removing it from the observation before trusting any accuracy figure.

    Args:
        data: DataFrame with the numeric columns listed in ``FEATURE_COLUMNS``.

    Raises:
        ValueError: if ``data`` has no rows.
    """

    # Columns that make up the observation vector, in order.
    FEATURE_COLUMNS = ("amount", "category", "merchant", "payment_method", "tax_deductible")
    # Position of the ground-truth label inside the observation vector.
    LABEL_INDEX = FEATURE_COLUMNS.index("tax_deductible")

    def __init__(self, data):
        super().__init__()
        if len(data) == 0:
            raise ValueError("data must contain at least one transaction")

        self.data = data.sample(frac=1).reset_index(drop=True)  # Shuffle data
        self.current_index = 0  # Track transaction index

        # Action space: 0 = Don't deduct, 1 = Deduct
        self.action_space = spaces.Discrete(2)

        # Observation space: 5 features. The categorical columns may hold
        # values above 1, so a fixed [0, 1] box would not contain the
        # observations actually emitted. Bounds are taken from the data and
        # widened to cover at least [0, 1] in every dimension.
        feature_matrix = data[list(self.FEATURE_COLUMNS)].to_numpy(dtype=np.float32)
        low = np.minimum(feature_matrix.min(axis=0), 0.0).astype(np.float32)
        high = np.maximum(feature_matrix.max(axis=0), 1.0).astype(np.float32)
        self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)

        self.state = self._get_next_transaction()
        self.episode_ended = False

    def _get_next_transaction(self):
        """Fetch the next transaction and convert to a state vector.

        When all rows have been served, the data is reshuffled and iteration
        restarts from the first row.

        Returns:
            float32 array of shape (5,) with the values of ``FEATURE_COLUMNS``.
        """
        if self.current_index >= len(self.data):
            self.current_index = 0  # Restart from beginning
            self.data = self.data.sample(frac=1).reset_index(drop=True)  # Shuffle again

        row = self.data.iloc[self.current_index]
        self.current_index += 1

        return np.array([row[col] for col in self.FEATURE_COLUMNS], dtype=np.float32)

    def reset(self):
        """Advance to the next transaction and return it as the initial observation."""
        self.state = self._get_next_transaction()
        self.episode_ended = False
        return self.state

    def step(self, action):
        """Score ``action`` against the current transaction and move to the next one.

        Args:
            action: 0 (don't deduct) or 1 (deduct).

        Returns:
            Tuple ``(observation, reward, done, info)``. ``done`` is always
            False: episodes never terminate on their own.
        """
        if self.episode_ended:
            # Auto-reset, but still honour the 4-tuple contract of step();
            # returning the bare observation from reset() would break callers
            # that unpack the result.
            return self.reset(), 0, False, {}

        correct_action = self.state[self.LABEL_INDEX]  # Tax deductible (1) or not (0)
        reward = 1 if action == correct_action else -1  # Reward for correct classification

        self.state = self._get_next_transaction()
        done = False  # In this case, episodes can continue indefinitely
        return self.state, reward, done, {}



In [6]:
import torch
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv

# Tunable run lengths
N_TRAIN_TIMESTEPS = 10000
N_EVAL_STEPS = 1000

# Keep the raw environment under its own name: the lambda below must not close
# over `env`, because `env` is rebound to the vectorized wrapper on that line.
base_env = TaxOptimizationEnv(df)
env = DummyVecEnv([lambda: base_env])  # Stable-Baselines3 requires vectorized environments

# Initialize the DQN agent
model = DQN('MlpPolicy', env, learning_rate=1e-3, buffer_size=10000, batch_size=64, verbose=1)

# Train the model
model.learn(total_timesteps=N_TRAIN_TIMESTEPS)

# Evaluate the model.
# NOTE: evaluation runs on the same transactions used for training, and the
# environment's observation includes the tax_deductible label, so these
# numbers are an upper bound rather than a measure of generalization.
obs = env.reset()
total_reward = 0.0
n_correct = 0
for _ in range(N_EVAL_STEPS):
    # deterministic=True: use the greedy policy, without exploration noise.
    action, _states = model.predict(obs, deterministic=True)
    # No manual reset on `done`: vectorized environments reset automatically.
    obs, reward, done, info = env.step(action)
    # The VecEnv returns one reward per wrapped environment; there is only one.
    total_reward += float(reward[0])
    n_correct += int(reward[0] > 0)  # the environment pays +1 only for a correct action

print(f"Mean reward over {N_EVAL_STEPS} steps: {total_reward / N_EVAL_STEPS:.3f}")
print(f"Accuracy: {n_correct / N_EVAL_STEPS:.1%}")




Using cuda device
