In [1]:
import random
import numpy as np
from scipy import stats

In [None]:
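# Target support for sampled values: [-1, -0.1] or [0.1, 1.0] (never within 0.1 of zero).
# Trick: draw uniformly from [-0.8, 1.0], then shift every draw <= 0.1 down by 0.2.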

In [10]:
def uniform_sample():
    """Draw one value uniformly from [-1, -0.1] or [0.1, 1], skipping the gap around zero.

    A single draw is taken from the contiguous interval [-0.8, 1]; draws that
    are not above 0.1 are moved down by 0.2, which maps [-0.8, 0.1] onto
    [-1, -0.1] and leaves the positive part untouched.
    """
    draw = np.random.uniform(-0.8, 1)
    if draw > 0.1:
        return draw
    # Shift the lower part of the interval so it lands left of the gap.
    return draw - 0.2

In [48]:
class Node:
    """A variable in a randomly parameterised linear-Gaussian graph.

    A node's value is standard-normal noise. When the node has parents, a
    random intercept and a randomly weighted sum of the parents' values are
    added on top. The intercept and one coefficient per parent are drawn once,
    at construction time, with ``uniform_sample``.
    """

    def __init__(self, name, parents):
        """Create a node.

        Args:
            name: Label used in ``__repr__`` (and by callers, e.g. as a column name).
            parents: Sequence of ``Node`` objects this node depends on
                (empty for a root node).
        """
        self.name = name
        self.parents = parents
        # Draw order matters for reproducibility under a fixed seed:
        # coefficients first, then the intercept.
        self.coefficients = [uniform_sample() for _ in parents]
        self.intercept = uniform_sample()
        # Cached result of the last evaluation; None means "not computed yet".
        self.value = None

    @staticmethod
    def _sample_shape(num_samples):
        """Return the shape ``np.random.normal(0, 1, num_samples)`` produces."""
        if num_samples is None:
            return ()
        if np.ndim(num_samples) == 0:
            return (int(num_samples),)
        return tuple(int(dim) for dim in num_samples)

    def get(self, num_samples):
        """Return this node's sampled values, computing them on first use.

        The result is cached so that every child sharing this node sees the
        same draw. If the cached values do not have the shape implied by
        ``num_samples`` they are recomputed, rather than handing back an array
        of the wrong size. Call ``reset()`` on the node being evaluated to get
        a fresh, mutually consistent draw for it and all of its ancestors.

        Args:
            num_samples: Number of samples (anything accepted as the ``size``
                argument of ``np.random.normal``).

        Returns:
            The cached values for this node.
        """
        stale = (
            self.value is None
            or np.shape(self.value) != self._sample_shape(num_samples)
        )
        if stale:
            self.value = self._calc(num_samples)
        return self.value

    def reset(self):
        """Drop the cached values of this node and, recursively, of all its ancestors."""
        self.value = None
        for p in self.parents:
            p.reset()

    def _calc(self, num_samples):
        """Sample noise and add the intercept and weighted parent values.

        Root nodes (no parents) are pure standard-normal noise; the intercept
        is only applied to nodes that have at least one parent.
        """
        val = np.random.normal(0, 1, num_samples)
        if len(self.parents) > 0:
            val += self.intercept
        for parent, coeff in zip(self.parents, self.coefficients):
            val += parent.get(num_samples) * coeff

        return val

    def __repr__(self):
        """Return a short description listing the names of the direct parents."""
        if len(self.parents) > 0:
            ancestry = ", ".join([p.name for p in self.parents])
        else:
            ancestry = "none"
        return f'Node {self.name} - descends from {ancestry}'

In [49]:
def toposort(variables):
    """Return ``variables`` ordered so that every node comes after all of its parents.

    Implements Kahn's algorithm. The result is deterministic: nodes without
    parents come first in the order they appear in ``variables``, and the
    remaining nodes follow in the order their last parent gets placed.

    Args:
        variables: Iterable of hashable objects exposing a ``parents``
            collection. Duplicates are ignored.

    Returns:
        A list containing every distinct node of ``variables`` exactly once,
        parents before children.

    Raises:
        ValueError: If some nodes cannot be ordered, i.e. they lie on (or
            depend on) a cycle, or depend on a parent that is not part of
            ``variables``.
    """
    nodes = list(dict.fromkeys(variables))  # de-duplicate, keep input order

    # Number of distinct parents each node is still waiting for.
    pending = {node: len(set(node.parents)) for node in nodes}

    # Reverse adjacency, so placing a node only touches its own children
    # instead of rescanning the whole graph.
    children = {node: [] for node in nodes}
    for node in nodes:
        for parent in dict.fromkeys(node.parents):
            if parent in children:
                children[parent].append(node)

    # `sorted_list` doubles as the work queue: everything before `cursor` has
    # been processed, everything after it is ready but not yet expanded.
    sorted_list = [node for node in nodes if pending[node] == 0]
    cursor = 0
    while cursor < len(sorted_list):
        node = sorted_list[cursor]
        cursor += 1
        for child in children[node]:
            pending[child] -= 1
            if pending[child] == 0:
                sorted_list.append(child)

    if len(sorted_list) != len(nodes):
        unresolved = [node for node in nodes if pending[node] > 0]
        raise ValueError(
            f"cannot topologically sort {unresolved!r}: they depend on a cycle "
            "or on a parent that is missing from `variables`"
        )

    return sorted_list

In [50]:
# Build a three-node graph: A and B have no parents, C has both as parents.
a = Node('A', [])
b = Node('B', [])
c = Node('C', [a,b])

In [51]:
# Evaluate C for 100 samples; this also evaluates and caches its parents A and B.
c.get(100)

array([-1.14354762, -1.1808442 , -2.15612981, -1.54237387, -1.99227455,
       -0.71277121, -0.45862334, -1.13007901, -0.17098833,  0.80733743,
       -3.51476953, -0.23604779, -1.67093401,  0.91471831,  1.50678496,
        0.16002049,  1.48924346, -0.77883134,  1.61978097,  1.17019914,
       -0.40439477, -3.20148715, -1.53600654, -0.19507694,  0.98761621,
       -2.80804387, -0.35995236, -1.79169207,  0.31790072, -0.87709724,
       -1.07679657, -0.56644735, -1.81967697, -0.28349953, -0.61510872,
       -1.59233984,  1.05810717, -1.69719084,  0.59369585, -0.1301698 ,
        0.79103481, -1.08517266, -2.29520691, -1.72455794,  1.68745879,
       -0.32304163,  1.68314824, -3.42699345, -0.7298698 , -2.38586044,
       -1.70100038, -2.68157888, -1.3962608 , -1.94862243, -3.42774711,
        1.86695217, -1.00801078,  0.34370335, -3.90365918, -1.95534581,
       -0.97236106, -1.36194245,  2.02423733,  0.68519315,  1.17082893,
       -0.64832675, -1.72607051,  3.00006305, -0.12337808,  1.34

In [53]:
import polars as pl

In [59]:
# Wrap C's values in a polars Series named after the node
# (c.get returns the cached array if C was already evaluated).
t= pl.Series(name=c.name, values=c.get(100))

In [None]:
# TODO: qcut with 2,3, or 4 categories
# TODO: parameterize names and breakpoints
# TODO: sample the breakpoint proportions: give every category at least 15%, then distribute the remainder at random

In [106]:
import random

def get_quantiles(num_cats, min_perc_per_cat):
    """Sample random cumulative breakpoints that split data into ``num_cats`` bins.

    Every category is first granted ``min_perc_per_cat`` of the mass. What is
    left over is cut into whole percentage points, and each point is handed to
    a category picked uniformly at random with ``random.choice``.

    Args:
        num_cats: Number of categories; must be at least 1.
        min_perc_per_cat: Minimum fraction (between 0 and 1) of the mass that
            every category receives.

    Returns:
        A list of ``num_cats - 1`` non-decreasing cumulative proportions (the
        upper edges of all categories but the last, whose edge is implicitly 1).
        An empty list when ``num_cats`` is 1.

    Raises:
        ValueError: If ``num_cats`` is smaller than 1, ``min_perc_per_cat`` is
            negative, or the minima alone add up to more than 100%.
    """
    if num_cats < 1:
        raise ValueError(f"num_cats must be at least 1, got {num_cats}")
    if min_perc_per_cat < 0:
        raise ValueError(
            f"min_perc_per_cat must not be negative, got {min_perc_per_cat}"
        )

    # round() rather than int(): (1.0 - 2 * 0.45) * 100 evaluates to
    # 9.999999999999998, which int() would truncate to 9 and thereby lose a
    # percentage point.
    spare_points = int(round((1.0 - num_cats * min_perc_per_cat) * 100))
    if spare_points < 0:
        raise ValueError(
            f"{num_cats} categories with at least {min_perc_per_cat} each "
            "exceed 100% of the mass"
        )

    # Hand out the spare percentage points one at a time.
    counts = [0 for _ in range(num_cats)]
    for _ in range(spare_points):
        counts[random.choice(range(num_cats))] += 1

    # Accumulate integer counts and convert once per breakpoint to keep
    # floating-point noise out of the result; the final rounding removes
    # artefacts such as 0.33999999999999997.
    quantiles = []
    points_so_far = 0
    for position, count in enumerate(counts[:-1], start=1):
        points_so_far += count
        quantiles.append(
            round(points_so_far / 100 + position * min_perc_per_cat, 12)
        )

    return quantiles

In [111]:
# np.random.randint excludes the upper bound, so this picks 2, 3 or 4 categories.
num_cats = np.random.randint(2,5)
# Fraction of the mass each category receives before the rest is distributed at random.
min_perc_per_cat = 0.15

quantiles = get_quantiles(num_cats, min_perc_per_cat)
quantiles

[0.33999999999999997, 0.6699999999999999]

In [112]:
# Bin the series at the sampled quantiles; len(quantiles) + 1 labels are supplied: "1", "2", ...
t.qcut(quantiles, labels=[str(i) for i in range(1,len(quantiles)+2)])

C
cat
"""2"""
"""2"""
"""1"""
"""1"""
"""1"""
…
"""2"""
"""2"""
"""2"""
"""2"""
