# Periodogram for degen data

In [1]:
from scipy import signal
import numpy as np
import pandas as pd
import tqdm

In [2]:
def _read_data(data_file, N=np.inf):
    data = []
    with open(data_file, 'r') as f:
        count = 0
        for line in f:
            line = line.strip()
            num = list(map(float, line.split()))
            data.append(num)
            count += 1
            if count >= N:
                break
    return data

def compute_freqs_powers(data, scaling: str = 'density'):
    """
    Run a periodogram over every series in ``data``, showing a tqdm progress bar.

    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.periodogram.html
    :param data: indexable collection of series; item ``i`` is passed to
        ``signal.periodogram`` unchanged
    :param scaling: forwarded to ``signal.periodogram``
    :return: tuple ``(freqs, powers)`` of two lists, each holding one array
        per series, in the same order as ``data``
    """
    # Each entry is the (frequencies, power) pair for one series.
    spectra = [
        signal.periodogram(data[idx], scaling=scaling)
        for idx in tqdm.tqdm(range(len(data)))
    ]
    freqs = [freq for freq, _ in spectra]
    powers = [power for _, power in spectra]
    return freqs, powers


def fp_pipeline(data_file, N=np.inf) -> pd.DataFrame:
    """
    Load sequences from ``data_file``, standardise them and compute periodograms.

    The mean and standard deviation are taken over every value in the file,
    even when ``N`` restricts which sequences are transformed.

    :param data_file: path handed to ``_read_data``
    :param N: if finite, only ``data_list[:N]`` is normalised and transformed;
        the default ``np.inf`` keeps every sequence
    :return: DataFrame with columns ``freq`` and ``power`` holding the
        concatenated periodogram output of all selected sequences
    """
    # Always read the whole file: the statistics below are global.
    data_list = _read_data(data_file)
    all_values = np.concatenate([np.asarray(seq) for seq in data_list])
    mean_data = all_values.mean()
    sd_data = all_values.std()

    # Restrict to the first N sequences only when a finite limit was given.
    selected = data_list[:N] if N < np.inf else data_list
    data_norm = [(np.asarray(seq) - mean_data) / sd_data for seq in selected]

    freqs, powers = compute_freqs_powers(data_norm)
    columns = {
        'freq': np.concatenate(freqs),
        'power': np.concatenate(powers),
    }
    return pd.DataFrame.from_dict(columns)

## Unconditional outputs

In [3]:
# Unconditional output: input directory and the directory the CSVs go to
input_folder = 'data/data_degen/unconditional/'
output_folder = 'plot/degen/'

# gold
input_file = 'unconditional_gold.model=gpt2.nll'
source_path = input_folder + input_file
# Drop the 4-character '.nll' suffix before appending the new extension.
target_path = output_folder + input_file[:-4] + '.density.csv'
df = fp_pipeline(source_path)
df.to_csv(target_path, index=False)

100%|██████████| 5000/5000 [00:00<00:00, 24813.17it/s]


In [4]:
# Pure sampling (reuses input_folder / output_folder set in the earlier cell)
input_file = 'unconditional_puresampling_large.model=gpt2.nll'
source_path = input_folder + input_file
# Drop the 4-character '.nll' suffix before appending the new extension.
target_path = output_folder + input_file[:-4] + '.density.csv'
df = fp_pipeline(source_path)
df.to_csv(target_path, index=False)

100%|██████████| 5000/5000 [00:00<00:00, 25879.52it/s]
