## Numpy random numbers generation

In [54]:
import numpy as np
import pandas as pd
from numpy.random import seed
#from numpy.random import rand      #Uniform
from numpy.random import randn     #gaussian
#from numpy.random import randint   #integers
import yaml

from dstools.config.baseconfig import YmlConfig
from dstools.sqlutils.sqlconfig import SQLConfig
from dstools.sqlutils.util import DbConnectGenerator

In [54]:
import numpy as np
import pandas as pd
from numpy.random import seed
#from numpy.random import rand      #Uniform
from numpy.random import randn     #gaussian
#from numpy.random import randint   #integers
import yaml

from dstools.config.baseconfig import YmlConfig
from dstools.sqlutils.sqlconfig import SQLConfig
from dstools.sqlutils.util import DbConnectGenerator

In [55]:
def generate_random_numbers(distribution, params, n, seed):
    rng = np.random.default_rng(seed = seed)
    if distribution == 'normal':
        return rng.normal(params['mean'], params['std'], n)
    elif distribution == 'beta':
        return rng.beta(params['a'], params['b'], n)
    elif distribution == 'uniform':
        return rng.uniform(params['low'], params['high'], n)
    elif distribution == 'binomial':
        return rng.binomial(params['n'], params['p'], n)
    else:
        raise ValueError(f"Invalid distribution: {distribution}")

In [56]:
def normalize_rows(matrix):
    
    if len(matrix.shape)==1:
        if matrix.sum()==0:
            pass
        else: matrix = matrix / matrix.sum()
        
    else:    
        for i in range(matrix.shape[0]):
        # Get the sum of the current row
            row_sum = np.sum(matrix[i,:])

            # If the row sum is not zero, normalize the row
            if row_sum != 0:
                matrix[i,:] = matrix[i,:] / row_sum
    return matrix

### Generation of deductions based on yaml distributions
Steps:
For each deduction:
1. Generate random 0s and 1s based on the binomial probability of having the deduction (several have very low probability)
2. Generate random numbers based on the distribution of no-zero values, based on the distribution provided in the yaml file (data_gen_config.yaml)
3. Multiply the vector from step 1 with the vector from step 2
4. Normalize the vector so to sum up to one if there are deductions, or sum up to zero if there are no deductions at all.

In [90]:
class gen_random_proportions():
    """
    Class to generate a random sample of n records of multiple deductions or
    [net income, effective tax rate and aggregate deductions],
    or any set of proportions, normalized (sum up to 1) or not normalized.
    Result is given in a dataframe.
    
    """
    
    def __init__(self, features_p, distribution, parameters,
                  n = 40, seed = 5, normalized = True, export_csv = False, debug=False, agg=True):
        """
        Args
        features_p: is a dictionary which the name of the feature as keys, and 
        the probability of that features to be different than 0 values
            for example, for a case of 3 deductions: {'deduc_1': 0.9, 'deduc_2': 0.001369, 'deduc_3': 0.017539}

        distribution: is a dictionary  with the distributions of each feature
            for example: {'deduc_1': 'uniform', 'deduc_2': 'uniform', 'deduc_3': 'uniform'}, 

        parameters: are the parameters of the distributions above.
            for example: {'deduc_1': {'low': 0, 'high': 1}, 'deduc_2': {'low': 0, 'high': 1}, 'deduc_3': {'low': 0, 'high': 1}}

        Distribution supported and its parameters are defined in the function generate_random_numbers
        """
        self.features_p = features_p
        self.distribution = distribution
        self.parameters = parameters
        self.n = n
        self.seed = seed
        self.normalized = normalized
        self.export_csv  = export_csv
        self.debug=debug
        self.agg=agg

    def get_sample(self):
        """
        Steps:
        For each deduction:
        1. Generate random 0s and 1s based on the binomial probability of having the feature
        2. Generate random numbers based on the distribution of no-zero values provided in the distribution
        3. Multiply the vector from step 1 with the vector from step 2
        4. Normalize the vector so to sum up to one if there are positive values for the features, or sum up to zero if there are not
        """
        is_first_iter = 1

        for feature, p in self.features_p.items():
            rng = np.random.default_rng(seed = self.seed)
            # step 1
            got_feature = rng.binomial(1, p, self.n)
            # step 2
            random_column = generate_random_numbers(self.distribution[feature], self.parameters[feature], self.n, self.seed)
            # step 3
            final_column = got_feature * random_column

            if is_first_iter == False:   
                matrix = np.column_stack((matrix, final_column))

            if is_first_iter:
                matrix = final_column
                is_first_iter = False
            self.seed +=1
        if self.debug:
            print("matrix without normalization: \n",matrix)

        if self.normalized:
            matrix = normalize_rows(matrix)
            if self.debug:
                print("matrix normalized: \n",matrix)
            
        matrix_df = pd.DataFrame(matrix, columns = list(self.distribution.keys()))
        
        if self.agg:
            matrix_df['agg'] = matrix_df.sum(axis=1)
            
        if self.export_csv:
            file_name = list(matrix_df.columns)[0][0:5]
            matrix_df.to_csv(f"{file_name}_seed_{self.seed}.csv")
        
        return matrix_df

In [91]:
class ProportionRecordGenerator(object):
    def __init__(self, config_path, n=40, #_type=TaxPayerType.individual, 
                 seed=None, debug=False, agg=True, export_csv=True) -> None:

        #self._type = _type
        self.debug = debug
        self.seed = (np.random.randint(0,np.iinfo(np.int32).max) if seed is None else seed)
        self.n = n
        self.export_csv=export_csv

        # get the config data
        with open(config_path,'r') as read_file:
            yml = yaml.safe_load(read_file)
        self.features_p = yml['features_p']
        self.distribution = yml['distribution']
        self.parameters = yml['parameters']
        self.agg = agg

    def gen_record(self):
        features_df = gen_random_proportions(features_p=self.features_p,
                                     distribution=self.distribution,
                                     parameters=self.parameters,
                                     n=self.n,
                                     seed=self.seed,
                                     normalized=True,
                                     export_csv=self.export_csv,
                                     agg = self.agg ).get_sample()

        return features_df

In [92]:
deduc_config_path='./deduc_gen_config.yaml'
deduc_rec = ProportionRecordGenerator(deduc_config_path, n=20, seed=5, agg=True)
deduc_rec.gen_record()

Unnamed: 0,deduc_x,deduc_y,deduc_16,deduc_17,agg
0,0.599332,0.400668,0.0,0.0,1.0
1,0.377809,0.16052,0.0,0.46167,1.0
2,0.582689,0.417311,0.0,0.0,1.0
3,0.432837,0.567163,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0
5,0.377285,0.622715,0.0,0.0,1.0
6,0.377239,0.622761,0.0,0.0,1.0
7,0.120657,0.879343,0.0,0.0,1.0
8,0.066913,0.933087,0.0,0.0,1.0
9,0.0,1.0,0.0,0.0,1.0


In [93]:
# Create net_income/effective_tax_rate/
tax_config_path='./income_tax_gen_config.yaml'
rec = RecordGenerator(tax_config_path, n=20, seed=5, agg=False)
rec.gen_record()                 

Unnamed: 0,net_income,effect_tax_rate,agg_deduc
0,0.555497,0.225153,0.21935
1,0.550823,0.298911,0.150266
2,0.952431,-0.152557,0.200126
3,0.645079,0.127148,0.227773
4,0.634729,0.219527,0.145744
5,0.650688,0.274744,0.074568
6,0.523802,0.175625,0.300573
7,0.504224,0.239587,0.256189
8,0.722167,0.182723,0.095111
9,0.645456,0.176125,0.178419
