## NumPy random number generation
This notebook generates random deduction values and, for each sample, their aggregate (the row-wise sum of the deductions).

In [2]:
import numpy as np
import pandas as pd
from numpy.random import seed
#from numpy.random import rand      #Uniform
from numpy.random import randn     #gaussian
#from numpy.random import randint   #integers
import yaml

from dstools.config.baseconfig import YmlConfig
from dstools.sqlutils.sqlconfig import SQLConfig
from dstools.sqlutils.util import DbConnectGenerator

In [3]:
# NOTE(review): this rebinds the name `seed` (imported from numpy.random in the
# imports cell) to the integer 1. It does not seed any random generator; if
# seeding the legacy global generator was intended, that needs confirming.
seed = 1

In [4]:
# Gaussian distribution
#help(randn)

In [5]:
# Parse the generation settings from the YAML file into a plain Python object.
with open('deduc_gen_config.yaml', mode='r') as read_file:
    yml = yaml.safe_load(read_file)

# Echo the parsed settings followed by their container type, one per line.
print(yml, type(yml), sep='\n')

{'naics': 81, 'size': 's', 'gen_sample_size': 1000, 'deduc_prob_usage': {'deduc_x': 0.9, 'deduc_16': 0.001369, 'deduc_17': 0.017539}, 'distribution': {'deduc_x': 'uniform', 'deduc_16': 'uniform', 'deduc_17': 'uniform'}, 'parameters': {'deduc_x': {'low': 0, 'high': 1}, 'deduc_16': {'low': 0, 'high': 1}, 'deduc_17': {'low': 0, 'high': 1}}}
<class 'dict'>


In [6]:
# deduc_dic maps each deduction to the probability that this type of deduction
# appears in the data for the configured industry and size.
# Its length is the number of deductions that will be generated.
deduc_dic = yml['deduc_prob_usage']
len(deduc_dic)

3

In [7]:
def generate_random_numbers(distribution, params, n, seed=1, rng=None):
    """Draw ``n`` random values from a named distribution.

    Parameters
    ----------
    distribution : str
        One of ``'normal'``, ``'beta'``, ``'uniform'`` or ``'binomial'``.
    params : dict
        Parameters of the distribution: ``mean``/``std`` for normal,
        ``a``/``b`` for beta, ``low``/``high`` for uniform and ``n``/``p``
        for binomial.
    n : int
        Number of values to draw.
    seed : int, optional
        Seed used to build a fresh generator when ``rng`` is not supplied.
        The default (1) reproduces the values this function returned when
        the seed was hard-coded.
    rng : numpy.random.Generator, optional
        Generator to draw from. Pass a shared generator when calling this
        function repeatedly: a fresh generator built from the same seed
        returns the same draws on every call with the same distribution
        and parameters.

    Returns
    -------
    numpy.ndarray
        Array of ``n`` draws.

    Raises
    ------
    ValueError
        If ``distribution`` is not one of the supported names.
    KeyError
        If ``params`` lacks a key required by the chosen distribution.
    """
    if rng is None:
        rng = np.random.default_rng(seed=seed)

    if distribution == 'normal':
        return rng.normal(params['mean'], params['std'], n)
    elif distribution == 'beta':
        return rng.beta(params['a'], params['b'], n)
    elif distribution == 'uniform':
        return rng.uniform(params['low'], params['high'], n)
    elif distribution == 'binomial':
        return rng.binomial(params['n'], params['p'], n)
    else:
        raise ValueError(f"Invalid distribution: {distribution}")

In [8]:
def normalize_rows(matrix):
    """Scale each row of ``matrix`` so that it sums to one.

    A 1-D input is treated as a single row. Rows (or a 1-D input) whose sum
    is zero are returned unchanged, so a row with no deductions stays all
    zeros.

    The result is always a new floating-point array: the input is never
    modified, and integer input is not truncated back to integers.

    Parameters
    ----------
    matrix : array_like
        1-D vector or N-D array whose first axis indexes the rows.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``matrix`` with normalised rows.
    """
    values = np.asarray(matrix, dtype=float)

    if values.ndim == 1:
        total = values.sum()
        # Copy in the zero-sum case so the caller never receives an alias
        # of its own input.
        return values / total if total != 0 else values.copy()

    # Sum over every axis except the first so each "row" gets one total.
    reduce_axes = tuple(range(1, values.ndim))
    row_sums = values.sum(axis=reduce_axes, keepdims=True)
    # Dividing zero-sum rows by 1 leaves them unchanged and avoids 0/0.
    divisors = np.where(row_sums != 0, row_sums, 1.0)
    return values / divisors

### Generation of deductions based on yaml distributions
Steps:
For each deduction:
1. Generate random 0s and 1s based on the binomial probability of having the deduction (several have very low probability)
2. Generate random values for the non-zero deductions, drawn from the distribution specified for that deduction in the YAML file (deduc_gen_config.yaml)
3. Multiply the vector from step 1 with the vector from step 2
4. Normalize each row so that it sums to one if it contains any deductions; a row with no deductions at all is left as zeros (sum of zero).

In [9]:
# Generation of deductions based on yaml distributions
n = yml['gen_sample_size']
# Base seed: the k-th deduction draws its presence flags from seed + k.
# The base value itself is left unchanged by the loop.
seed = 5

deduc_columns = []
for column_index, (deduc, p) in enumerate(deduc_dic.items()):
    rng = np.random.default_rng(seed=seed + column_index)
    # step 1: 0/1 flag per sample -- does this sample have the deduction?
    got_deduction = rng.binomial(1, p, n)
    # step 2: magnitude of the deduction for every sample
    # NOTE(review): generate_random_numbers is called here without a generator
    # or seed, so it builds its own generator from a fixed seed on each call;
    # deductions that share a distribution and parameters therefore receive
    # identical magnitudes. Confirm whether that is intended.
    random_column = generate_random_numbers(yml['distribution'][deduc], yml['parameters'][deduc], n)
    # step 3: keep the magnitude only where the deduction is present
    deduc_columns.append(got_deduction * random_column)

if not deduc_columns:
    raise ValueError("No deductions configured under 'deduc_prob_usage'")

# Stack once at the end: this always yields a 2-D (n, n_deductions) matrix,
# even for a single deduction, so normalisation is applied per sample (row).
deduc_matrix = np.column_stack(deduc_columns)

# step 4: each row sums to one, or stays all zeros when no deduction is present
deduc_matrix = normalize_rows(deduc_matrix)
print(deduc_matrix)

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [10]:
# Label the columns with the keys of deduc_dic: the matrix columns were built
# by iterating deduc_dic, so using its key order keeps labels and data aligned
# even if the YAML sections list the deductions in different orders.
deduc_matrix_df = pd.DataFrame(deduc_matrix, columns=list(deduc_dic))
# Aggregate deduction per sample = sum of that sample's deductions.
deduc_matrix_df['agg_deduc'] = deduc_matrix_df.sum(axis=1)

# NOTE(review): the file name hard-codes the seed and sample size; it will be
# misleading if either is changed in the config or the generation cell.
deduc_matrix_df.to_csv("sample_seed5_n=1000.csv")

In [11]:
# Show the generated deductions together with their row-wise aggregate.
deduc_matrix_df

Unnamed: 0,deduc_x,deduc_16,deduc_17,agg_deduc
0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0
...,...,...,...,...
995,1.0,0.0,0.0,1.0
996,1.0,0.0,0.0,1.0
997,1.0,0.0,0.0,1.0
998,1.0,0.0,0.0,1.0
