# Prepare datasets <a class="tocSkip">

In this notebook, we create three datasets: $\mathcal{E}$ and $\mathcal{D}$, which contain samples from various normal and non-normal distributions, and $\mathcal{C}$, which contains samples from the non-normal distributions on which statisticians usually estimate the power of standard statistical tests of normality.

## Set up the environment

In [None]:
import os
import pathlib
import pickle

import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split

import util

Set the data directory path.

In [None]:
# Directory in which the generated datasets are stored
data_directory_path = "data"

## Set $\mathcal{E}$

The samples consist of $10, 20, \ldots, 100$ elements. 

The normal samples are drawn from normal distributions $N(\mu,\sigma^2)$ whose location parameter ($\mu$) is randomly selected from the range $[-100,100]$ and whose standard deviation ($\sigma$) is randomly drawn from the range $[1, 20]$. For each $n$, a total of $6525$ normal distributions are defined and a sample of size $n$ is drawn from each of them.

The non-normal samples are drawn from the Pearson family of distributions. Each distribution is specified by its first four moments. The mean and standard deviation are determined in the same way as for the normal samples. They are combined with the skewness ($s$) and kurtosis ($k$) that range over $\{-30, -29.5, \ldots, 30\}$ and $\{0, 0.5, \ldots, 40\}$, respectively, and fulfill the following two conditions: (1) $k - s^2 - 1 \geq 0$ and (2) $\neg(s=0 \land k=3)$. The first condition is a limitation known from theory. The second requirement is there to ensure that those non-normal distributions are sufficiently different from the normal ones, since for normal distributions it holds that $s = 0$ and $k = 3$. A sample is drawn from each such distribution.

The set is balanced. It contains $65250$ normal and $65250$ non-normal samples.

In [None]:
# Grids used to build the set E: sample sizes, and the skewness and
# kurtosis values of the non-normal distributions.
n_range = range(10, 101, 10)  # sample sizes 10, 20, ..., 100

# Skewness grid -30.0, -29.5, ..., 30.0.
# Other range() arguments noted alongside the original definition:
# (-150, 151, 5) and (-805, 810, 5).
s_range = [tenths / 10.0 for tenths in range(-300, 301, 5)]

# Kurtosis grid 0.0, 0.5, ..., 40.0.
# Other range() arguments noted alongside the original definition:
# (0, 201, 5) and (0, 1610, 5).
k_range = [tenths / 10.0 for tenths in range(0, 401, 5)]

# M is the number of non-normal samples drawn from the same distribution.
# Because the set is built to be balanced, M also determines how many
# normal samples end up in the set (see generate_dataset).
M = 5

def generate_dataset(n_range, s_range, k_range, M, verbose=True):
    """Build a balanced, labeled set of normal and non-normal samples.

    The non-normal samples are produced by
    ``util.generate_pearson_nonnormal_samples(s_range, k_range, n_range, M)``.
    The number of normal samples per sample size is then derived from how
    many non-normal samples were obtained, so that both classes have
    (up to integer division) the same number of samples.

    Parameters
    ----------
    n_range : sized iterable
        Sample sizes; its length is used to split the non-normal count
        evenly across the sizes.
    s_range, k_range : iterable
        Skewness and kurtosis values passed through to the Pearson
        sample generator.
    M : int
        Number of non-normal samples drawn from the same distribution
        (passed through to the Pearson sample generator).
    verbose : bool, optional
        When true, print how many samples of each class were generated.

    Returns
    -------
    The labeled normal samples (label 1) concatenated with the labeled
    non-normal samples (label 0), in that order.
    """
    nonnormal = util.generate_pearson_nonnormal_samples(s_range, k_range, n_range, M)

    # Number of normal samples to draw for every sample size n
    normal_per_size = len(nonnormal) // len(n_range)
    normal = util.generate_normal_samples(n_range, normal_per_size)

    if verbose:
        print("Normal samples: ", len(normal))
        print("Non-normal samples: ", len(nonnormal))

    # Label normal samples with 1 and non-normal samples with 0, then merge
    return util.label_samples(normal, 1) + util.label_samples(nonnormal, 0)

In [None]:
# Generate the set E from the ranges and M defined above
set_E = generate_dataset(n_range=n_range, s_range=s_range, k_range=k_range, M=M)

Save the set.

In [None]:
# Save the set E to '<data directory>/E.data'.
# The samples live in `set_E`; the local name `all_samples` used inside
# generate_dataset is not visible at this level.
path = os.path.join(data_directory_path, 'E.data')
util.save_to_file(set_E, path)
print("Saved to the file", path)

## Set $\mathcal{C}$

This set contains non-normal samples whose sizes are $10, 20, \ldots, 100$.

The non-normal distributions from which the samples are drawn are hand-picked and are usually used to assess the empirical power of normality tests. They are classified into four groups: $G_1, G_2, G_3$, and $G_4$. See the paper for more details. For each sample size $n \in \left\{10, 20, \ldots, 100\right\}$, $L=10000$ samples are drawn from each group.

Define the distributions.

In [None]:
# Each sampler below takes a sample size n and returns one sample of n
# draws. Samplers that wrap the result in list() return a Python list;
# the others return whatever the underlying SciPy/NumPy call returns.

# --- Group G1: logistic, Laplace, and Student's t (1 and 3 d.o.f.) ---
def logistic(n):
    return stats.logistic.rvs(size=n)


def laplace(n):
    return stats.laplace.rvs(size=n)


def t1(n):
    return stats.t.rvs(1, size=n)


def t3(n):
    return stats.t.rvs(3, size=n)


# --- Group G2: Gumbel with location 0 and scales 1, 2, and 0.5 ---
def gumbel1(n):
    return list(np.random.gumbel(loc=0, scale=1, size=n))


def gumbel2(n):
    return list(np.random.gumbel(loc=0, scale=2, size=n))


def gumbel3(n):
    return list(np.random.gumbel(loc=0, scale=0.5, size=n))


# --- Group G3: exponential, gamma, lognormal, and Weibull ---
def expon(n):
    # NOTE(review): loc=1 shifts the distribution rather than rescaling it;
    # confirm that a shift (and not scale=1) is what was intended.
    return stats.expon.rvs(loc=1, size=n)


def gamma1(n):
    return list(np.random.gamma(2, scale=1, size=n))


def gamma2(n):
    return list(np.random.gamma(0.5, scale=1, size=n))


def lognormal1(n):
    return list(np.random.lognormal(mean=0, sigma=1, size=n))


def lognormal2(n):
    return list(np.random.lognormal(mean=0, sigma=2, size=n))


def lognormal3(n):
    return list(np.random.lognormal(mean=0, sigma=0.5, size=n))


def weibull1(n):
    return stats.weibull_min.rvs(0.5, scale=1, size=n)


def weibull2(n):
    return stats.weibull_min.rvs(2, scale=1, size=n)


# --- Group G4: uniform on [0, 1) and four beta distributions ---
def uniform(n):
    return list(np.random.uniform(low=0, high=1, size=n))


def beta1(n):
    return np.random.beta(2, 2, size=n)


def beta2(n):
    return np.random.beta(0.5, 0.5, size=n)


def beta3(n):
    return np.random.beta(3, 1.5, size=n)


def beta4(n):
    return np.random.beta(2, 1, size=n)


# Samplers of the four groups G1-G4, keyed by the group number
groups = {
    1: [logistic, laplace, t1, t3],
    2: [gumbel1, gumbel2, gumbel3],
    3: [
        expon,
        gamma1,
        gamma2,
        lognormal1,
        lognormal2,
        lognormal3,
        weibull1,
        weibull2,
    ],
    4: [uniform, beta1, beta2, beta3, beta4],
}

Generate and save the samples from the groups.

In [None]:
# Define the range of sample sizes
n_range = range(10, 101, 10)

# L is the number of samples of each size n drawn from each group.
L = 10000

# Maps each set name ('C-G1', 'C-G2', ...) to the labeled samples of that group
set_C = {}

for g, group in groups.items():
    # Storage for the samples of all sizes drawn from this group
    samples = []

    for n in n_range:
        # Draw exactly L samples of size n. The distributions of the group
        # take turns in round-robin order, starting again from the first
        # distribution for every sample size. Iterating over range(L)
        # guarantees L draws (a `so_far <= L` loop would produce L + 1).
        for i in range(L):
            dist = group[i % len(group)]
            samples.append(dist(n))

    # Label the samples as non-normal
    samples = util.label_samples(samples, 0)

    # Name the set after its group and derive the file it is stored in
    set_name = 'C-G{}'.format(g)
    filename = '{}.data'.format(set_name)
    path = os.path.join(data_directory_path, filename)

    # Save the samples
    util.save_to_file(samples, path)
    print("Saved {} to the file {}".format(set_name, path))

    set_C[set_name] = samples

## Set $\mathcal{D}$

Same as $\mathcal{E}$, but with fewer samples: it is generated with $M = 1$ instead of $M = 5$.

In [None]:
# Define the ranges for the sample sizes
# and non-normal skewness and kurtosis in the set D
n_range = range(10, 101, 10)  # sample sizes 10, 20, ..., 100

# Skewness grid -30.0, -29.5, ..., 30.0.
# Other range() arguments noted alongside the original definition:
# (-150, 151, 5) and (-805, 810, 5).
s_range = [x/10.0 for x in range(-300, 301, 5)]

# Kurtosis grid 0.0, 0.5, ..., 40.0.
# Other range() arguments noted alongside the original definition:
# (0, 201, 5) and (0, 1610, 5).
k_range = [x/10.0 for x in range(0, 401, 5)]

# Let M denote the number of non-normal samples drawn from the same distribution.
# Since the set is created as balanced, M will influence the number of normal samples
# in the set. See the function generate_dataset for details.
M = 1

# Generate the set
set_D = generate_dataset(n_range, s_range, k_range, M)

# Save it to the configured data directory (rather than a hard-coded 'data'),
# so that changing data_directory_path relocates every dataset consistently.
path = os.path.join(data_directory_path, 'D.data')
util.save_to_file(set_D, path)
print("Saved to the file", path)