# Matching Distributions

## Probability Distribution

In [1]:
def get_pdf(data):
    """Fit a Gaussian kernel density estimate to ``data``.

    Args:
        data: Samples to estimate the density from; passed unchanged to
            ``scipy.stats.gaussian_kde``.

    Returns:
        The fitted ``gaussian_kde`` object. Call it with an array of points
        to evaluate the estimated density at those points.
    """
    # Import from the public ``scipy.stats`` namespace: the private
    # ``scipy.stats.kde`` module is deprecated and is not importable on
    # newer SciPy releases.
    from scipy.stats import gaussian_kde

    return gaussian_kde(data)

## Plot Probability Distribution

In [2]:
def plot_pdf(data, label='', x_axis=''):
    """Show the estimated density of ``data`` next to its histogram.

    Draws a figure with two side-by-side subplots: on the left a Gaussian
    kernel density estimate evaluated at 100 evenly spaced points between
    ``min(data)`` and ``max(data)``, on the right a histogram of ``data``.
    The figure is displayed with ``plt.show()``; nothing is returned.

    Relies on ``plt`` being available at module level.

    Args:
        data: Samples to plot.
        label: Text appended to the figure title.
        x_axis: Label used for the x-axis of both subplots.
    """
    # Public namespace; the private ``scipy.stats.kde`` module is deprecated.
    from scipy.stats import gaussian_kde
    from numpy import linspace

    fig, axs = plt.subplots(1, 2)
    fig.suptitle("Probability Distribution and Histogram " + label)

    # Kernel that estimates the probability density over the sample values.
    kde = gaussian_kde(data)

    # Points at which the kernel is evaluated.
    dist_space = linspace(min(data), max(data), 100)

    axs[0].plot(dist_space, kde(dist_space))
    axs[0].set_ylabel('probability')
    axs[0].set_xlabel(x_axis)

    axs[1].hist(data)
    axs[1].set_ylabel('histogram')
    # Use the caller's label here too (previously the literal text 'x_axis').
    axs[1].set_xlabel(x_axis)

    plt.show()

## Find Rough Distribution

In [5]:
def get_dist(df, var='age', bucket_size=5, to_print=False):
    """Count the values of ``df[var]`` in fixed-width buckets over [0, 100).

    Bucket ``k`` covers values ``v`` with ``floor(v / bucket_size) == k``.
    There is one bucket for every start in ``range(0, 100, bucket_size)``,
    so the last bucket may extend past 100 when ``bucket_size`` does not
    divide 100.

    Args:
        df: Mapping-like object (e.g. a DataFrame) where ``df[var]`` is an
            iterable of numbers.
        var: Key/column holding the values to bucket.
        bucket_size: Integer width of each bucket.
        to_print: When true, print one ``"lo-hi :  count"`` line per bucket.

    Returns:
        A list of integer counts, one per bucket, in ascending bucket order.

    Raises:
        IndexError: If a value falls outside the bucketed range. Negative
            values are rejected explicitly because a negative list index
            would otherwise be counted silently in a bucket at the wrong
            end of the list.
    """
    import math

    counts = [0] * len(range(0, 100, bucket_size))

    for value in df[var]:
        bucket = math.floor(value / bucket_size)
        if not 0 <= bucket < len(counts):
            raise IndexError(
                f"value {value!r} in {var!r} is outside the supported range "
                f"[0, {len(counts) * bucket_size})"
            )
        counts[bucket] += 1

    if to_print:
        for bucket, count in enumerate(counts):
            lower = bucket * bucket_size
            print(str(lower) + "-" + str(lower + bucket_size), ": ", count)

    return counts

## New Distribution Size

In [6]:
def find_new_len(len_df_one, dist_one, len_df_two, dist_two, min_len=50):
    """Find a sample size at which ``dist_one`` can mimic ``dist_two``.

    For a candidate length, the count expected in bucket ``i`` is
    ``new_len * dist_two[i] / len_df_two``. Starting from ``len_df_one``,
    the buckets are visited in order; whenever ``dist_one[i]`` is below
    ``precision`` times that expected count, the candidate length shrinks:

    * if the bucket is non-empty, to the length at which ``dist_one[i]``
      matches the proportion of ``dist_two[i]`` exactly (floored);
    * if the bucket is empty, by the expected count itself.

    ``precision`` is tried from 1.0 downwards in steps of 0.05, restarting
    from ``len_df_one`` each time. The first precision that yields a length
    of at least ``min_len`` is accepted and reported with ``print``.

    Relies on ``np`` being available at module level.

    Args:
        len_df_one: Number of rows in the dataset being resampled.
        dist_one: Per-bucket counts of the dataset being resampled.
        len_df_two: Number of rows in the dataset being mimicked.
        dist_two: Per-bucket counts of the dataset being mimicked; indexed
            in parallel with ``dist_one``.
        min_len: Smallest acceptable length. Defaults to 50, the value that
            was previously hard-coded.

    Returns:
        The accepted length. It is a float when an empty-bucket adjustment
        was the last one applied. If no precision reaches ``min_len``, the
        length from the last precision tried is returned without printing.
    """
    import math

    new_len = len_df_one

    for precision in np.arange(1, 0, -0.05):
        # Every precision level starts again from the full dataset size.
        new_len = len_df_one
        for i, count_one in enumerate(dist_one):
            expected = new_len * dist_two[i] / len_df_two
            if count_one < expected * precision:
                if count_one != 0:
                    new_len = math.floor(count_one * len_df_two / dist_two[i])
                else:
                    new_len = new_len - expected
        if new_len >= min_len:
            print("New Length: ", new_len, " with Precision: ", precision)
            break

    return new_len

## Match Distributions

In [7]:
def match_dist(df_to, df_from, var='age', var_1 = 'subject', bucket_size=3, new_len=None):
    """Resample ``df_to`` so its ``var`` distribution follows ``df_from``.

    Both frames are bucketed on ``var`` with ``get_dist``. For every
    non-empty bucket of ``df_to``, subjects (``var_1`` values) are drawn at
    random without replacement; the number drawn is the target length times
    that bucket's share of ``df_from``, capped at the number of rows
    ``df_to`` has in the bucket. Every row of ``df_to`` whose ``var_1``
    equals a drawn subject is included in the result.

    Relies on ``np``, ``pd``, ``get_dist`` and ``find_new_len`` being
    available at module level. Sampling uses ``np.random.choice``, so the
    result depends on NumPy's global random state.

    Args:
        df_to: DataFrame to draw rows from.
        df_from: DataFrame whose distribution of ``var`` is mimicked.
        var: Column that is bucketed.
        var_1: Column identifying subjects.
        bucket_size: Bucket width passed to ``get_dist``.
        new_len: Target size of the resampled data. When falsy (``None`` or
            0) it is computed with ``find_new_len``.

    Returns:
        A DataFrame with the selected rows of ``df_to``, in bucket order.
        When nothing is selected, an empty frame with ``df_to``'s columns.
    """
    import math

    # Sorting makes each bucket a contiguous run of rows in ``df_to``.
    df_to = df_to.sort_values(by=var)
    df_from = df_from.sort_values(by=var)

    # Bucket on the requested column (previously always the default 'age').
    dist_to = get_dist(df_to, var=var, bucket_size=bucket_size)
    dist_from = get_dist(df_from, var=var, bucket_size=bucket_size)

    if new_len:
        df_new_length = new_len
    else:
        # Size of the new dataset that allows the same distribution as the
        # dataset being mimicked.
        df_new_length = find_new_len(len(df_to), dist_to, len(df_from), dist_from)

    subjects = df_to[var_1]
    selected = []
    start = 0

    for count_to, count_from in zip(dist_to, dist_from):
        # Positional slicing: after sort_values the index labels no longer
        # match row positions, so label-based lookup would pick wrong rows.
        bucket_subjects = subjects.iloc[start:start + count_to].tolist()
        start += count_to

        # Nothing to draw from; skipping also prevents the previous
        # bucket's picks from being added a second time.
        if not bucket_subjects:
            continue

        # Number of subjects this bucket should contribute.
        num_rand = math.floor(df_new_length * (count_from / len(df_from)))
        chosen = np.random.choice(
            bucket_subjects, size=min(count_to, num_rand), replace=False
        )

        for sub in chosen:
            selected.append(df_to[subjects == sub])

    if not selected:
        return df_to.iloc[0:0]

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(selected)

In [8]:
def sample(df_to, df_from, title_to='', title_from='', var='age', var_1='subject', bucket_size=3, new_len = None):
    """Resample ``df_to`` to follow ``df_from`` and report the outcome.

    Plots the distribution of ``var`` for both inputs, resamples ``df_to``
    with ``match_dist``, prints the resulting length, plots the resampled
    distribution, and prints the two-sample Kolmogorov-Smirnov result
    comparing the resampled data with ``df_from``.

    Relies on the sibling functions ``plot_pdf`` and ``match_dist``.

    Args:
        df_to: DataFrame to draw rows from.
        df_from: DataFrame whose distribution of ``var`` is mimicked.
        title_to: Plot title suffix for ``df_to``.
        title_from: Plot title suffix for ``df_from``.
        var: Column whose distribution is matched and plotted.
        var_1: Column identifying subjects.
        bucket_size: Bucket width used when matching.
        new_len: Optional target size forwarded to ``match_dist``.

    Returns:
        The resampled DataFrame produced by ``match_dist``.
    """
    from scipy import stats

    plot_pdf(df_from[var], title_from, var)
    plot_pdf(df_to[var], title_to, var)

    # Forward var/var_1 so non-default columns are honoured when resampling
    # (they were previously dropped and the defaults used instead).
    df_sample = match_dist(
        df_to,
        df_from,
        var=var,
        var_1=var_1,
        bucket_size=bucket_size,
        new_len=new_len,
    )
    print("Actual Length: ", len(df_sample))

    # Distribution of the resampled data.
    plot_pdf(df_sample[var], title_to + ":" + title_from, var)

    # Test whether the two samples could come from the same distribution.
    print("stats: ", stats.ks_2samp(df_sample[var], df_from[var]))

    return df_sample