In [1]:
import pandas as pd
import numpy as np
import faker
import random as rnd
import datetime
import calendar

In [2]:
# importing decimals just for good practice, as it is best for monetary value handling.
from decimal import Decimal, getcontext
# we will use only two decimal places.
getcontext().prec = 2

In [3]:
# importing my custom functions

# 1 Plan of (totally legal) falsification

First of let's plan this out. We will try and simulate bank extract so we better think about its format first: <br>
['id','value','date'] <br>
This will be our extract values, pretty minimalist.

Beyond that, what type of expenses are we going to make? I'll just decide some easy ones:

- groceries
- streaming
- shopping
- Food Delivery
- travell
- transportation
- pix?
- games
- education
- health

Yes, that order really tracks, doesn't it.

## 1.1 creating our scenary

I will make a list to decide some "non-random" information. For example, let's say our simulated person started paying for their health insurance only in the second year, or that they only pay for their anime streaming service a few months a year. Just to make it more interesting. <br>
Maybe I should automate this to, but I will try this manual method for now.

In [4]:
# Let's create some fixed behavior for our subject
# [id:str, start-date:str, end-date:str, value:float]
# I though of creating another tag for recurrence such as ["monthly", "daily", etc.], but I will just do monthly.

fixed_behavior = [
    ["health insurance", "2025-01-01", "2025-6-01", Decimal(250)],
    ["health insurance", "2025-07-01", "2025-12-01", Decimal(300)],
    ["braces", "2024-06-01", "2025-06-01", Decimal(100)],
    ["gym", "2024-01-25", "2024-07-01", Decimal(70)],
    ["gym", "2024-07-25", "2025-06-01", Decimal(85)],
    ["netflix", "2024-03-10", "2025-04-01", Decimal(25)],
    ["netflix", "2025-04-10", "2025-12-01", Decimal(35)],
    ["crunchyroll", "2024-05-08", "2024-08-01", Decimal(15)],
    ["crunchyroll", "2024-11-08", "2025-03-01", Decimal(15)],
    ["italian course", "2025-06-01", "2025-11-01", Decimal(320)],
]

In [5]:
# possible expenses
# [id:str, possible values: list[Decimal], prob. distribution of values: dict[float]]
[
    ["Lamen", [Decimal(34.99), Decimal(19.99)], [0.3,0.7]],
    ["Pizza", [Decimal(59.99), Decimal(30)], [0.4,0.6], 'wknd'],
    ["Feijoada", [Decimal(32.00), Decimal(24.99)], [0.5,0.5]],
    ["Carangueijo", [Decimal(50.00)], [1], 'quinta'],
    ["supermercado", [Decimal(350)],[1], 'monthly']
]

[['Lamen',
  [Decimal('34.99000000000000198951966012828052043914794921875'),
   Decimal('19.989999999999998436805981327779591083526611328125')],
  [0.3, 0.7]],
 ['Pizza',
  [Decimal('59.99000000000000198951966012828052043914794921875'),
   Decimal('30')],
  [0.4, 0.6],
  'wknd'],
 ['Feijoada',
  [Decimal('32'),
   Decimal('24.989999999999998436805981327779591083526611328125')],
  [0.5, 0.5]],
 ['Carangueijo', [Decimal('50')], [1], 'quinta'],
 ['supermercado', [Decimal('350')], [1], 'monthly']]

In [6]:
# Categories
{
    "groceries"
}

{'groceries'}

# Setting classes

In [47]:
class Expense():
    '''
    Represents a possible expense for an ID and some possible values to be determined by some probability distribution.
    
    Attributes:
        id: name that represents the expense
        possible_values: possible "bundle" value of the type of expense.
        dist_value: probability of each bundle to be consumed.
        dis_week: distribution of the possible consumption throughout the week
    '''
    
    
    def __init__(self, id:str, possible_values:list[Decimal], dist_values:list[float], dist_week:str = 'constant'):
        self.id = id
        self.possible_values = possible_values
        self.dist_values = dist_values
        self.day_distributions = {
        'wknd': [0.1,0.1,0.1,0.1,0.1,0.25,0.25],
        'quinta': [0,0,0,1,0,0,0],
        'constant': [1/7 for x in range(7)],
        'monthly': []
    }
        self.dist_week = self.day_distributions[dist_week]
        
    def generate_expense_test(self, year, month):
        '''
        Generate expense row for id in certain date, with value based on given distribution.
        
        Return:
            dict row with Id,Value and Date.
        '''
        entry_value = rnd.choices(self.possible_values, weights = self.dist_values)[0]
        if len(self.dist_week) == 7:
            # weekly distribution
            week_day = rnd.choices([x for x in range(7)], weights = self.dist_week)[0]
            entry_possibilities = get_weekdays(year, month, week_day)
            entry_date = rnd.choice(entry_possibilities)
        else:
            # month distribution
            entry_day = rnd.choice([x+1 for x in range(calendar.monthrange(year,month)[-1])])
            entry_date = datetime.date(year,month,entry_day)
        
        return [self.id, entry_value, entry_date]
    

In [8]:
# add distribution for day of week -> default:uniform wknd: weekend heavy (50%) and custom(like input "quinta do carangueijo" bias)

def generate_expense(id:str, possible_values:list[Decimal], dist_values:list[float], date:str):
    '''
    Generate expense row for id in certain date, with value based on given distribution.
    
    Return:
        dict row with Id,Value and Date.
    '''
    expense_value = -rnd.choice(possible_values, p = dist_values)
    expense_row = {"id": id, 
                   "value": expense_value,
                   "date": date}
    return expense_row

In [9]:
class Subscription():
    def __init__(self, id:str, start_date:datetime, end_date:datetime, value = Decimal()):
        self.id = id
        self.start_date = start_date
        self.end_date = end_date
    def generate_subscriptions():
        pass

# Making some useful functions


In [13]:
def get_weekdays(year:int, month:int, week_day:int):
    '''
    Get all specific days of week (ex. monday) from certain month and year.
    
    Return:
        list of dates on certain month
    '''
    month_start = datetime.date(year,month,1)
    start_date = month_start
    week_delta = week_day - month_start.weekday()
    if week_delta >= 0:
        start_date = month_start + datetime.timedelta(days=week_delta)
    if week_delta <0:
        start_date = month_start + datetime.timedelta(days=week_delta + 7)
        
    # now we have the first date for that weekday, lets iterate until we are out of bounds on the month.
    iterate_date = start_date
    weekday_list = []
    while month_start.month == iterate_date.month:
        weekday_list.append(iterate_date)
        iterate_date = iterate_date + datetime.timedelta(days= 7)
        print(iterate_date)
    return weekday_list

In [14]:
# add distribution for day of week -> default:uniform wknd: weekend heavy (50%) and custom(like input "quinta do carangueijo" bias)

def generate_expense(id:str, possible_values:list[Decimal], dist_values:list[float], date:str):
    '''
    Generate expense row for id in certain date, with value based on given distribution.
    
    Return:
        dict row with Id,Value and Date.
    '''
    expense_value = -rnd.choice(possible_values, p = dist_values)
    expense_row = {"id": id, 
                   "value": expense_value,
                   "date": date}
    return expense_row

In [15]:
def generate_subscription():
    expense_row = {}
    return expense_row

# Generation


In [16]:
rnd.choice([1,2,3])

3

In [17]:
fake = faker.Faker(["pt_BR"])

In [18]:
fake.name()

'Brenda Gomes'

In [19]:
fake.domain_name()

'da.br'