## Functions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import random 
from scipy.stats import poisson
from scipy.stats import pareto
from scipy.stats import zipf
import scipy.stats as stats

def create_powerlaw_p(F_dataset, pareto_alpha):
    """Replicate each fact a Pareto-distributed number of times.

    For every entry of ``F_dataset['fact']`` one value is drawn from
    ``scipy.stats.pareto`` (shape ``pareto_alpha``, scale 1), floored to an
    integer, and the fact is repeated that many times.

    Args:
        F_dataset: Object whose ``'fact'`` column supports ``.tolist()``.
        pareto_alpha: Shape parameter ``b`` passed to ``pareto.rvs``.

    Returns:
        A flat list in which the copies of each fact are adjacent and the
        facts keep their dataset order.
    """
    replicated = []
    for item in F_dataset['fact'].tolist():
        # One independent draw per fact decides how often it is repeated.
        draw = pareto.rvs(b=pareto_alpha, scale=1)
        copies = int(np.floor(draw))
        replicated += [item] * copies
    return replicated

def create_zipf_p(F_dataset, zipf_p):
    """Replicate each fact a Zipf-distributed number of times.

    For every entry of ``F_dataset['fact']`` one value is drawn from
    ``scipy.stats.zipf`` with parameter ``a=zipf_p`` and the fact is repeated
    that many times. The value returned by ``mono_calc`` for the result is
    printed before returning.

    Args:
        F_dataset: Object whose ``'fact'`` column supports ``.tolist()``.
        zipf_p: Parameter ``a`` passed to ``zipf.rvs``.

    Returns:
        A flat list in which the copies of each fact are adjacent and the
        facts keep their dataset order.
    """
    replicated = []
    for item in F_dataset['fact'].tolist():
        # One independent draw per fact decides how often it is repeated.
        draw = zipf.rvs(a=zipf_p)
        copies = int(np.floor(draw))
        replicated += [item] * copies
    print(f'Monofact % in p is: {mono_calc(replicated)}')
    return replicated

def create_normal_p(F_dataset, mean, std_dev):
    """Replicate each fact a (clamped, floored) normally distributed number of times.

    For every entry of ``F_dataset['fact']`` one value is drawn with
    ``np.random.normal(mean, std_dev)``, raised to at least 1, floored to an
    integer, and the fact is repeated that many times. The value returned by
    ``mono_calc`` for the result is printed before returning.

    Args:
        F_dataset: Object whose ``'fact'`` column supports ``.tolist()``.
        mean: ``loc`` of the normal distribution.
        std_dev: ``scale`` of the normal distribution.

    Returns:
        A flat list in which the copies of each fact are adjacent and the
        facts keep their dataset order.
    """
    replicated = []
    for item in F_dataset['fact'].tolist():
        draw = np.random.normal(loc=mean, scale=std_dev)
        # Clamp before flooring so every fact appears at least once.
        copies = int(np.floor(max(1, draw)))
        replicated += [item] * copies
    print(f'Monofact % in p is: {mono_calc(replicated)}')
    return replicated

def create_poisson_p(F_dataset, lambda_param):
    """Replicate each fact a (clamped) Poisson-distributed number of times.

    For every entry of ``F_dataset['fact']`` one value is drawn from
    ``scipy.stats.poisson`` with ``mu=lambda_param``, raised to at least 1,
    and the fact is repeated that many times. The value returned by
    ``mono_calc`` for the result is printed before returning.

    Args:
        F_dataset: Object whose ``'fact'`` column supports ``.tolist()``.
        lambda_param: Rate ``mu`` passed to ``poisson.rvs``.

    Returns:
        A flat list in which the copies of each fact are adjacent and the
        facts keep their dataset order.
    """
    replicated = []
    for item in F_dataset['fact'].tolist():
        draw = poisson.rvs(mu=lambda_param)
        # A Poisson draw can be 0; clamp so every fact appears at least once.
        copies = int(np.floor(max(1, draw)))
        replicated += [item] * copies
    print(f'Monofact % in p is: {mono_calc(replicated)}')
    return replicated

def create_uniform_p(F_dataset):
    """Return the ``'fact'`` column of ``F_dataset`` as a plain list.

    No replication is applied: each entry of the column appears in the
    result exactly as often as it appears in the dataset.
    """
    return F_dataset['fact'].tolist()

def sample(new_facts, size):
    """Draw ``size`` items from ``new_facts`` using ``random.sample``.

    ``random.sample`` picks distinct positions, i.e. it samples WITHOUT
    replacement and raises ``ValueError`` when ``size`` exceeds
    ``len(new_facts)``.

    NOTE(review): an earlier comment here said "sample with replacement";
    if that is the intended behaviour, ``random.choices`` would be needed
    instead - confirm with the author.

    Args:
        new_facts: Population to draw from.
        size: Number of items to draw.

    Returns:
        A new list of ``size`` items taken from ``new_facts``.
    """
    return random.sample(new_facts, k=size)

def mono_calc(new_facts):
    """Return the fraction of items in ``new_facts`` that are monofacts.

    A monofact is a fact that occurs exactly once in ``new_facts``. The
    result is the number of such facts divided by the total number of items
    (duplicates included), so it is a fraction in [0, 1], not a percentage.

    Args:
        new_facts: Sized collection of hashable facts.

    Returns:
        The monofact fraction as a float; ``0.0`` for an empty collection.
    """
    total = len(new_facts)
    if total == 0:
        # Nothing to count; avoid dividing by zero on an empty dataset.
        return 0.0
    fact_counts = Counter(new_facts)
    num_mono = sum(1 for count in fact_counts.values() if count == 1)
    return num_mono / total
    

## Charts

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

def _plot_repeat_counts(ax, title, labelled_facts, x_values):
    """Draw, on ``ax``, one line per ``(label, facts)`` pair of repeat-count frequencies.

    For each pair, the y value at ``k`` is the number of distinct facts that
    appear exactly ``k`` times in ``facts``. ``labelled_facts`` may be a lazy
    iterable; it is consumed once, in order.
    """
    for label, facts in labelled_facts:
        appearances = Counter(facts)  # fact -> number of appearances
        facts_per_count = Counter(appearances.values())  # k -> number of facts seen k times
        y_values = [facts_per_count.get(k, 0) for k in x_values]
        ax.plot(x_values, y_values, marker='o', label=label,
                linewidth=0.5, markersize=3)

    ax.set_title(title)
    # symlog keeps zero counts drawable while compressing the large ones.
    ax.set_yscale('symlog', linthresh=1.0)
    ax.set_ylim(0, 100000)
    ax.grid(True, linestyle='--', alpha=0.3)
    ax.legend()
    ax.set_xlabel('Number of Appearances')
    ax.set_ylabel('Count of Statements')


def plot_distributions(uniform_p, save_path="path"):
    """Plot repeat-count curves for four replication schemes in a 2x2 grid.

    For each of the Pareto, Zipf, Normal and Poisson generators
    (``create_powerlaw_p``, ``create_zipf_p``, ``create_normal_p``,
    ``create_poisson_p``) the facts in ``uniform_p`` are replicated once per
    parameter setting, and the number of facts appearing exactly k times
    (k = 1..15) is drawn as one line per setting. The figure is saved and
    then shown.

    Side effect: font settings are written to the global ``plt.rcParams``
    and remain in force after the call.

    Args:
        uniform_p: Dataset handed unchanged to the ``create_*_p`` generators.
        save_path: Destination passed to ``plt.savefig``. Defaults to the
            literal ``"path"`` that was previously hard-coded.
    """
    # Global plot parameters per ICML requirements.
    plt.rcParams.update({
        'font.family': 'Times New Roman',
        'font.size': 11,
        'axes.titlesize': 11,
        'axes.labelsize': 11,
        'legend.fontsize': 11,
    })

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(11, 6))

    x = range(1, 16, 1)

    # The generator expressions below are consumed immediately by the helper,
    # so datasets are created one at a time and in the same order as before
    # (Pareto, Zipf, Normal, Poisson).
    alphas = [1.5, 2, 2.5, 3]
    _plot_repeat_counts(
        ax1,
        'Pareto Distribution',
        ((f'γ={alpha}', create_powerlaw_p(uniform_p, alpha)) for alpha in alphas),
        x,
    )

    zipf_params = [1.5, 2, 2.5, 3]
    _plot_repeat_counts(
        ax2,
        'Zipf Distribution',
        ((f'p={p}', create_zipf_p(uniform_p, p)) for p in zipf_params),
        x,
    )

    # Means and standard deviations are paired element-wise.
    means = [2, 3, 4, 5]
    stds = [1, 1.5, 2, 2.5]
    _plot_repeat_counts(
        ax3,
        'Normal Distribution',
        ((f'μ={mu}, σ={sigma}', create_normal_p(uniform_p, mu, sigma))
         for mu, sigma in zip(means, stds)),
        x,
    )

    lambdas = [1.5, 2, 2.5, 3]
    _plot_repeat_counts(
        ax4,
        'Poisson Distribution',
        ((f'λ={lambda_param}', create_poisson_p(uniform_p, lambda_param))
         for lambda_param in lambdas),
        x,
    )

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# Usage: load the fact records, keep positional rows 1..19999, and plot them.
# NOTE(review): the slice starts at 1, so the first record (row 0) is
# skipped - confirm this is intended.
uniform_p = pd.read_json('path', orient='records').iloc[1:20000]
plot_distributions(uniform_p)