# Import libraries

In [None]:
import math
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Process, Queue
import numpy as np
import pandas as pd
import sklearn.preprocessing as skp
import time
import torch
import torch.nn as nn
from torch.nn import functional as F
from imblearn.over_sampling import SMOTENC
import threading
import json

# Global variables

In [None]:
# JSON file holding the model hyperparameters; read in the "Load model config" cell.
MODEL_CONFIG = 'model/config.json'

# Presumably the raw, un-tokenized input data; not read in this part of the notebook.
DATA_PATH = 'data/raw_data.csv'

# ------------- Train, Test, Validation Sets ----------- #
TEST_DF_PATH = 'data/test_df.csv'
TRAIN_DF_PATH = 'data/train_df.csv'
VALID_DF_PATH = 'data/valid_df.csv'

# Tokenized table loaded later as the checklist of valid tokens per column.
H_POP_PATH = 'data/h_population.csv'

# ------------------- Model Training ----------------- #
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The same seed is applied to both torch and numpy.
random_state = 2023
torch.manual_seed(random_state)
np.random.seed(random_state)
# When True, get_batch() and estimate_loss() print their intermediate values.
debug = False
MODEL_PATH = 'model/tfm.pt'

# TOKEN_PATH: generated tokens that are loaded and validated below.
# SAVING_PATH: destination of the validated synthetic records.
TOKEN_PATH = 'generated_tokens/tfm.csv'
SAVING_PATH = 'results/tfm.csv'

In [None]:
# Presumably the number of records to generate (TODO confirm); it is not
# referenced anywhere in the visible part of this notebook.
GENERATED_SIZE = 1000 # @param {type:"slider", min:1000, max:100000, step:500}


In [None]:
# Load model config
# The JSON file must contain every key read below; a missing key raises KeyError.
with open(MODEL_CONFIG, 'r') as openfile:
    prm_json = json.load(openfile)

# Expose each hyperparameter as a notebook-level global.
# batch_size, block_size and eval_iters are read by get_batch()/estimate_loss().
batch_size = prm_json["batch_size"]
block_size = prm_json["block_size"]
eval_interval = prm_json["eval_interval"]
learning_rate = prm_json["learning_rate"]
eval_iters = prm_json["eval_iters"]
n_embd = prm_json["n_embd"]
n_head = prm_json["n_head"]
n_layer = prm_json["n_layer"]
dropout = prm_json["dropout"]
vocab_size = prm_json["vocab_size"]
max_vocab = prm_json["max_vocab"]


# Echo the loaded values so the run records which configuration was used.
print(f"""batch_size: {batch_size}, block_size: {block_size},
eval_interval: {eval_interval}, learning_rate: {learning_rate},
eval_iters: {eval_iters}, n_embd: {n_embd},
n_head: {n_head}, n_layer: {n_layer}, dropout: {dropout},
vocab_size: {vocab_size}, max_vocab: {max_vocab}""")

batch_size: 32, block_size: 4,
eval_interval: 10, learning_rate: 0.0001,
eval_iters: 10, n_embd: 512,
n_head: 8, n_layer: 6, dropout: 0.1,
vocab_size: 112, max_vocab: 409


# Util Functions

In [None]:
def binning(df, column, bin_edges, labels=None):
    """
    Bin a numeric column and store the result next to it.

    The binned values are written to a new column named '<column>_binned'.
    The dataframe is modified in place and the same object is returned.

    df: Pandas dataframe
    column: name of the numeric column to be binned
    bin_edges: list of bin edges, passed to pandas.cut as `bins`
    labels: optional list of labels to assign to the bins
    """
    binned_name = column + '_binned'
    binned_values = pd.cut(df[column], bins=bin_edges, labels=labels)
    df[binned_name] = binned_values
    return df

In [None]:
def compare_attributes(df1, df2, columns_to_plot=None, legend_labels=None):
    """
    Plot, for each selected column, the value frequencies of two dataframes
    side by side as a bar chart (one figure per column).

    df1, df2: Pandas dataframes that both contain the columns to plot
    columns_to_plot: iterable of column names; defaults to all columns of df1
    legend_labels: optional labels for the two bar series; when omitted the
        default series names 'df1' and 'df2' are shown
    """
    if columns_to_plot is None:
        columns_to_plot = df1.columns

    for col in columns_to_plot:
        # One figure per attribute
        _, ax = plt.subplots()

        # Frequencies of each unique value, aligned on the value itself;
        # a value missing from one dataframe shows up as NaN (no bar).
        freq_df = pd.concat(
            [df1[col].value_counts(), df2[col].value_counts()],
            axis=1,
            keys=['df1', 'df2'],
        )

        freq_df.plot(kind='bar', ax=ax, rot=0)
        ax.set_title(col)

        # Passing None as the label list makes matplotlib raise a TypeError,
        # so only forward the labels when the caller supplied them.
        if legend_labels is None:
            ax.legend()
        else:
            ax.legend(legend_labels)

        plt.show()


In [None]:
def count_unique_elements(my_list):
    """
    Return the distinct elements of `my_list` together with summary values.

    my_list: iterable of mutually comparable elements

    Returns a tuple (count, unique_list, max_vocab) where `unique_list` holds
    the distinct elements in ascending order, `count` is its length and
    `max_vocab` is its largest element. For empty input the result is
    (0, [], None) instead of an UnboundLocalError.
    """
    try:
        # Fast path: hash-based de-duplication.
        distinct = set(my_list)
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to equality-based
        # de-duplication.
        distinct = []
        for element in my_list:
            if element not in distinct:
                distinct.append(element)

    unique_list = sorted(distinct)
    # The list is sorted ascending, so the maximum is its last element.
    max_vocab = unique_list[-1] if unique_list else None

    return len(unique_list), unique_list, max_vocab

In [None]:
# Convert the model output into a pandas dataframe
def create_dataframe(data_list, num_columns, header=None):
    """
    Reshape a flat sequence into a dataframe with `num_columns` columns.

    Values are filled row by row. When the number of values is not a multiple
    of `num_columns`, the last row is padded with None.

    data_list: flat sequence of values (list, tuple, array, ...); not modified
    num_columns: number of columns per row, must be >= 1
    header: optional list of column names passed to pandas.DataFrame

    Raises ValueError when `num_columns` is smaller than 1.
    """
    if num_columns < 1:
        raise ValueError(f"num_columns must be >= 1, got {num_columns}")

    # Work on a list copy so any sequence type is accepted and the rows can
    # be padded without touching the caller's data.
    values = list(data_list)
    rows = [
        values[start:start + num_columns]
        for start in range(0, len(values), num_columns)
    ]

    # Only the final row can be short; pad it so every row has the same length.
    if rows:
        rows[-1].extend([None] * (num_columns - len(rows[-1])))

    return pd.DataFrame(rows, columns=header)


In [None]:
def flatten_list(nested_list):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    return [element for inner_list in nested_list for element in inner_list]

In [None]:
## Train and Test Splits
# data loading
def get_batch(split):
    """
    Generate a small batch of inputs x and targets y.

    split: 'train' selects the global `train_data`; any other value selects
        the global `val_data`

    Draws `batch_size` random start offsets and returns (x, y), each a stack
    of `batch_size` windows of length `block_size`, moved to `device`. The
    target window y is the input window x shifted one position to the right.
    Assumes the data is a 1-D tensor longer than `block_size` - TODO confirm.
    """
    data = train_data if split == 'train' else val_data

    # Highest start offset is len(data) - block_size - 1, so the shifted
    # target window [i + 1, i + block_size + 1) still fits inside the data.
    ix = torch.randint(len(data) - block_size, (batch_size,))

    if debug:
        print(f"ix: {ix}")

    x = torch.stack([data[i:i+block_size] for i in ix])

    if debug:
        print(f"x: {x}")

    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    if debug:
        print(f"y: {y}")

    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    """
    Estimate the mean loss of the global `model` on both data splits.

    For each of 'train' and 'val', `eval_iters` batches are drawn with
    get_batch() and their losses averaged. Returns a dict mapping the split
    name to the mean loss (a 0-dim tensor).

    The model is put in eval mode for the measurement and is switched back
    to train mode afterwards, even when evaluation raises.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):

                if debug:
                    print(f"k: {k}")

                X, Y = get_batch(split)

                if debug:
                    print(f"X: {X}")
                    print(f"Y: {Y}")

                logits, loss = model(X, Y)

                if debug:
                    print(f"logits: {logits}")
                    print(f"loss: {loss}")

                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Never leave the model stuck in eval mode if a batch fails.
        model.train()
    return out

In [None]:
def plot_attribute_distribution(df):
    """Show one bar chart per column with the frequency of each value."""
    for column_name in df.columns:
        # How often each distinct value occurs in this column
        counts = df[column_name].value_counts()

        # Separate figure per column, titled with the column name
        plt.figure()
        plt.bar(counts.index, counts.values)
        plt.title(column_name)
        plt.show()

In [None]:
def print_uniq_val(df):
    """Print the sorted unique values, and their count, for every column."""
    # Collect everything first so nothing is printed if a column cannot be
    # sorted.
    sorted_uniques = {
        name: sorted(df[name].unique().tolist())
        for name in df.columns
    }

    # Report column by column
    for name, values in sorted_uniques.items():
        print(f"Column name: {name}")
        print(f"Unique values: {values}")
        print(f"Total unique values: {len(values)}")
        print("\n")

In [None]:
def random_sample(df, n, random_state, replace=False):
    """
    Split a dataframe into a random sample and the rows left over.

    df: Pandas dataframe to sample from
    n: number of rows to draw
    random_state: seed forwarded to DataFrame.sample
    replace: whether rows are drawn with replacement (default: without)

    Returns a tuple (sample_df, remaining_df): the n sampled rows, and every
    row of df whose index label does not appear in the sample.
    """
    sample_df = df.sample(n=n, replace=replace, random_state=random_state)
    sampled_labels = sample_df.index
    return sample_df, df.drop(sampled_labels)


In [None]:
## Standardization
# transform numerical features (normalization)
def transform_continuous(dataset, continuous_cols):
    """
    Standardize the continuous columns of a dataset.

    dataset: Raw dataset not yet normalized
    continuous_cols: a list of continuous columns

    Returns a new dataframe containing only the scaled columns, named after
    the scaler's output feature names. The result is built from the scaled
    array, so it does not carry over the index of `dataset`.
    """
    standardizer = skp.StandardScaler()
    continuous_part = dataset.loc[:, continuous_cols]
    standardized = standardizer.fit_transform(continuous_part)
    return pd.DataFrame(standardized, columns=standardizer.get_feature_names_out())


# inverse transform normalized vectors
def inverse_transform_continuous(original_dataset, synthetic_dataset, col_list):
    """
    Map standardized synthetic columns back to the original scale.

    original_dataset: dataframe a fresh StandardScaler is fitted on
    synthetic_dataset: dataframe holding the standardized columns
    col_list: names of the columns to convert back

    Returns a new dataframe with the columns in `col_list` on the original
    scale.
    """
    scaler = skp.StandardScaler()
    # Re-fit on the original data so the inverse uses its statistics
    scaler.fit(original_dataset[col_list])
    feature_names = scaler.get_feature_names_out()
    restored = scaler.inverse_transform(synthetic_dataset[feature_names])
    return pd.DataFrame(restored, columns=col_list)

# Validate the generated data

Load the tokenized data to be used as a checklist for the generated tokens.

In [None]:
df = pd.read_csv(H_POP_PATH) # both train and validate sets

In [None]:
df

Unnamed: 0,sex,age,marst,classwk
0,202,68,301,401
1,201,17,301,400
2,201,35,301,401
3,201,71,302,401
4,202,59,302,403
...,...,...,...,...
604514,201,46,302,403
604515,201,62,302,401
604516,202,60,302,403
604517,201,23,301,400


In [None]:
df.columns

Index(['sex', 'age', 'marst', 'classwk'], dtype='object')

In [None]:
# Distinct tokens observed in each column of the checklist dataframe.
# Despite the "_set" suffix these are plain lists; they are used below to
# test whether a generated token is valid for a given column position.
sex_set = list(df['sex'].unique())
age_set = list(df['age'].unique())
marst_set = list(df['marst'].unique())
classwk_set = list(df['classwk'].unique())

Retrieve the tokens

In [None]:
# load the saved tokens using numpy (comma-delimited, parsed as 32-bit integers)
gen_result = np.loadtxt(TOKEN_PATH, delimiter = ',', dtype='i4')

# convert the loaded result to list
gen_result = list(gen_result)
# print(len(gen_result))
# Remember the token count before any trimming/validation; it is the
# denominator of the RAE computed further below.
token_count_before_validate = len(gen_result)
print(token_count_before_validate)

4001


In [None]:
# preview head and tail of the generated tokens
print(gen_result[:block_size])
print(gen_result[-block_size:])

[0, 301, 400, 202]
[402, 202, 9, 301]


In [None]:
# Remove irrelevant tokens
gen_result = gen_result[3:] # remove first few tokens
gen_result = gen_result[:-3] # remove last few tokens

In [None]:
# preview head and tail
print(gen_result[:block_size])
print(gen_result[-block_size:])

[202, 24, 301, 400]
[201, 62, 302, 402]


In [None]:
# print(f'len checking tokens: {len(gen_result)}')

In [None]:
%%time

# We have to validate the tokens based on the order of their corresponding
# columns: an example is assembled left to right, and a token is accepted
# only when it is a known value of the column at the current position.
# Tokens that do not fit the current position are discarded.

# Allowed tokens per position, in column order. Sets give O(1) membership
# tests instead of scanning a list for every token.
valid_tokens_by_position = [
    set(sex_set),      # first column
    set(age_set),      # second column
    set(marst_set),    # third column
    set(classwk_set),  # fourth column
]

all_ex = []    # completed examples, each a list of block_size tokens
ex = []        # example currently being assembled
lex = len(ex)  # position (column index) the next token has to match

# Iterate rather than gen_result.pop(0): popping from the front of a list is
# O(n) per call, and iterating leaves gen_result intact so this cell can be
# re-run without reloading the tokens.
for item in gen_result:
    if lex < len(valid_tokens_by_position) and item in valid_tokens_by_position[lex]:
        ex.append(item)
        lex = len(ex)

        if lex == block_size:
            # Example complete: store it and start a new one
            all_ex.append(ex)
            ex = []
            lex = len(ex)

CPU times: user 17.2 ms, sys: 0 ns, total: 17.2 ms
Wall time: 29.3 ms


In [None]:
all_ex[:2]

[[202, 24, 301, 400], [202, 36, 309, 401]]

In [None]:
token_count_after_validate = len(all_ex)*block_size

In [None]:
# print(f"Total tokens after position validation: {len(all_ex)*block_size}")
print(f"Total tokens after position validation: {token_count_after_validate}")
print(f"Total generated examples: {len(all_ex)}")

Total tokens after position validation: 3968
Total generated examples: 992


In [None]:
# Fraction of the originally loaded tokens that did not end up in a validated
# example. The "before" count was taken prior to trimming three tokens from
# each end, so those trimmed tokens are counted as lost as well.
rae = (token_count_before_validate - token_count_after_validate)/token_count_before_validate

In [None]:
print(f"RAE = {rae:.4f}")

RAE = 0.0082


In [None]:
# One row per validated example, reusing the checklist dataframe's column names.
syn_df = pd.DataFrame(all_ex, columns = df.columns)
print(syn_df)

     sex  age  marst  classwk
0    202   24    301      400
1    202   36    309      401
2    201   43    302      401
3    202   44    302      402
4    202    8    301      400
..   ...  ...    ...      ...
987  201   77    302      401
988  201    2    301      400
989  201   19    301      400
990  201   23    301      400
991  201   62    302      402

[992 rows x 4 columns]


In [None]:
syn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   sex      992 non-null    int32
 1   age      992 non-null    int32
 2   marst    992 non-null    int32
 3   classwk  992 non-null    int32
dtypes: int32(4)
memory usage: 15.6 KB


In [None]:
print_uniq_val(syn_df)

Column name: sex
Unique values: [201, 202]
Total unique values: 2


Column name: age
Unique values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 83, 88]
Total unique values: 84


Column name: marst
Unique values: [300, 301, 302, 303, 304, 309]
Total unique values: 6


Column name: classwk
Unique values: [400, 401, 402, 403, 409]
Total unique values: 5




In [None]:
# Persist the validated synthetic records with a header row and without the index.
syn_df.to_csv(SAVING_PATH, header=True, index=False)