# Periodogram for degen data

In [5]:
from scipy import signal
import numpy as np
import pandas as pd
import tqdm
import os

In [2]:
def _read_data(data_file, N=np.inf):
    data = []
    with open(data_file, 'r') as f:
        count = 0
        for line in f:
            line = line.strip()
            num = list(map(float, line.split()))
            data.append(num)
            count += 1
            if count >= N:
                break
    return data

def compute_freqs_powers(data, scaling: str = 'density'):
    """
    Compute a periodogram for every series in ``data``.

    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.periodogram.html
    :param data: iterable of series; each element is passed unchanged to
        ``scipy.signal.periodogram``. A progress bar is shown while iterating.
    :param scaling: forwarded as the ``scaling`` argument of
        ``scipy.signal.periodogram``.
    :return: tuple ``(freqs, powers)`` of two lists holding, per input series
        and in input order, the two arrays returned by the periodogram call.
    """
    freqs, powers = [], []
    # Iterate over the series directly instead of indexing by position.
    for series in tqdm.tqdm(data):
        f, p = signal.periodogram(series, scaling=scaling)
        freqs.append(f)
        powers.append(p)
    return freqs, powers


def fp_pipeline(data_file, N=np.inf) -> pd.DataFrame:
    """
    Read a data file, standardise it and return its stacked periodograms.

    The whole file is always read, and the mean and standard deviation are
    taken over all values of all rows. Only the first ``N`` rows are then
    standardised with those global statistics and passed to
    ``compute_freqs_powers``.

    If every value in the file is identical, the standard deviation is zero
    and the standardised values are non-finite.

    :param data_file: path of the file handed to ``_read_data``.
    :param N: number of leading rows to analyse. The default ``np.inf`` keeps
        every row; a finite value is truncated to ``int`` before slicing.
    :return: DataFrame with columns ``freq`` and ``power``, holding the
        concatenated periodogram outputs of the selected rows.
    :raises ValueError: if the file contains no rows, or if no rows are
        selected (nothing to concatenate).
    """
    # Read all data and convert each row once.
    series_list = [np.asarray(row) for row in _read_data(data_file)]
    if not series_list:
        raise ValueError(f'no data rows found in {data_file!r}')

    all_values = np.concatenate(series_list)
    mean_data = np.mean(all_values)
    sd_data = np.std(all_values)

    # Slice bounds must be integers; a finite float such as 10.0 would raise.
    selected = series_list[:int(N)] if N < np.inf else series_list
    data_norm = [(series - mean_data) / sd_data for series in selected]

    freqs, powers = compute_freqs_powers(data_norm)
    return pd.DataFrame.from_dict({
        'freq': np.concatenate(freqs),
        'power': np.concatenate(powers),
    })

## Unconditional outputs

In [6]:
# Unconditional outputs: files are read from `input_folder`, and the
# periodogram CSVs are written to `output_folder`.
output_folder = 'plot/degen/'
input_folder = 'data/data_degen/unconditional/'

# Create the output directory up front; an existing directory is not an error.
os.makedirs(output_folder, exist_ok=True)

In [7]:
# Gold data: compute the periodograms and save them as CSV, with the input
# file's extension replaced by '.density.csv'.
input_file = 'unconditional_gold.model=gpt2.nll'
df = fp_pipeline(f'{input_folder}{input_file}')
df.to_csv(
    f'{output_folder}{os.path.splitext(input_file)[0]}.density.csv',
    index=False,
)

100%|██████████| 5000/5000 [00:00<00:00, 5819.98it/s]


In [8]:
# Pure sampling: compute the periodograms and save them as CSV, with the
# input file's extension replaced by '.density.csv'.
input_file = 'unconditional_puresampling_large.model=gpt2.nll'
df = fp_pipeline(f'{input_folder}{input_file}')
df.to_csv(
    f'{output_folder}{os.path.splitext(input_file)[0]}.density.csv',
    index=False,
)

100%|██████████| 5000/5000 [00:00<00:00, 5675.18it/s]


In [9]:
# Top-k with k=40 (input file name also carries t=0.7): compute the
# periodograms and save them as CSV.
input_file = 'unconditional_topk_k=40_t=0.7_large.model=gpt2.nll'
df = fp_pipeline(f'{input_folder}{input_file}')
df.to_csv(
    f'{output_folder}{os.path.splitext(input_file)[0]}.density.csv',
    index=False,
)

100%|██████████| 5000/5000 [00:00<00:00, 6347.54it/s]


In [10]:
# Top-k with k=640: compute the periodograms and save them as CSV.
input_file = 'unconditional_topk_k=640_large.model=gpt2.nll'
df = fp_pipeline(f'{input_folder}{input_file}')
df.to_csv(
    f'{output_folder}{os.path.splitext(input_file)[0]}.density.csv',
    index=False,
)

100%|██████████| 5000/5000 [00:00<00:00, 5200.96it/s]


In [11]:
# Sampling with different temperatures.
# The loop digit t = 1..9 selects the files for t = 0.1, 0.2, ..., 0.9;
# one CSV is written per temperature.
for t in range(1, 10):
    input_file = f'unconditional_sampling_t=0.{t}_large.model=gpt2.nll'
    df = fp_pipeline(f'{input_folder}{input_file}')
    df.to_csv(
        f'{output_folder}{os.path.splitext(input_file)[0]}.density.csv',
        index=False,
    )

100%|██████████| 5000/5000 [00:00<00:00, 5748.90it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5889.59it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6819.76it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6392.82it/s]
100%|██████████| 5000/5000 [00:00<00:00, 7260.61it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6986.65it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6395.91it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6950.18it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6554.53it/s]


In [12]:
# Top-k with k doubling from 5 up to 640; one CSV is written per k.
for k in [5, 10, 20, 40, 80, 160, 320, 640]:
    input_file = f'unconditional_topk_k={k}_large.model=gpt2.nll'
    df = fp_pipeline(f'{input_folder}{input_file}')
    df.to_csv(
        f'{output_folder}{os.path.splitext(input_file)[0]}.density.csv',
        index=False,
    )

100%|██████████| 5000/5000 [00:00<00:00, 5174.84it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5397.63it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5729.89it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5570.28it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5528.92it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5910.05it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5736.04it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5470.23it/s]


In [14]:
# Top-p with p in [0.1, 0.2, ..., 0.9].
# The loop variable is the digit after the decimal point in the file name;
# one CSV is written per p.
for p in range(1, 10):
    input_file = f'unconditional_topp_p=0.{p}_large.model=gpt2.nll'
    df = fp_pipeline(f'{input_folder}{input_file}')
    df.to_csv(
        f'{output_folder}{os.path.splitext(input_file)[0]}.density.csv',
        index=False,
    )

100%|██████████| 5000/5000 [00:00<00:00, 5507.41it/s]
100%|██████████| 5000/5000 [00:01<00:00, 4967.91it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5524.70it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5649.74it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5148.59it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5600.00it/s]
100%|██████████| 5000/5000 [00:01<00:00, 3379.87it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5013.83it/s]
100%|██████████| 5000/5000 [00:01<00:00, 4769.77it/s]


In [16]:
# Top-p with p in [0.95, 0.975, 0.9875, 0.99375].
# The loop variable holds the digits after "0." in the file name;
# one CSV is written per p.
for p in [95, 975, 9875, 99375]:
    input_file = f'unconditional_topp_p=0.{p}_large.model=gpt2.nll'
    df = fp_pipeline(f'{input_folder}{input_file}')
    df.to_csv(
        f'{output_folder}{os.path.splitext(input_file)[0]}.density.csv',
        index=False,
    )

100%|██████████| 5000/5000 [00:00<00:00, 5475.07it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5534.99it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5760.35it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5810.39it/s]
