Prepare the data for generating a synthetic population.

# Import libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
from torch.nn import functional as F

# Global variables

In [None]:
# Output path of the model configuration JSON written by the last cell of this notebook
MODEL_CONFIG = 'model/config.json'

# Input CSV that is loaded into `df`
DATA_PATH = 'data/raw_data.csv'

# ------------- Train, Test, Validation Set ----------- #
# Output CSVs, one per split
TEST_DF_PATH = 'data/test_df.csv'
TRAIN_DF_PATH = 'data/train_df.csv'
VALID_DF_PATH = 'data/valid_df.csv'

# Output CSV of the full tokenized dataset (exported before splitting)
H_POP_PATH = 'data/h_population.csv'

# Util Functions

In [None]:
def plot_attribute_distribution(df, color=None):
    """
    Show one bar chart per column of ``df`` with the frequency of each distinct value.

    Each chart is drawn on its own figure, titled with the column name and shown
    immediately. When ``color`` is given it is forwarded to ``plt.bar``; otherwise
    no colour argument is passed and matplotlib's default applies.
    """
    # Only forward a colour when the caller supplied one
    bar_style = {} if color is None else {"color": color}

    for attribute in df.columns:
        # frequency of every distinct value in this column
        frequencies = df[attribute].value_counts()

        plt.figure()
        plt.bar(frequencies.index, frequencies.values, **bar_style)
        plt.title(attribute)
        plt.show()

In [None]:
def print_uniq_val(df):
    """
    Print, for every column of ``df``, its distinct values in ascending order
    together with how many there are.

    All columns are collected first and printed afterwards, so nothing is printed
    if collecting any column fails.
    """
    # column name -> ascending list of that column's distinct values
    distinct_by_column = {
        name: sorted(df[name].unique().tolist()) for name in df.columns
    }

    for name, distinct in distinct_by_column.items():
        print(f"Column name: {name}")
        print(f"Unique values: {distinct}")
        print(f"Total unique values: {len(distinct)}")
        # blank separator between columns (newline plus print's own newline)
        print("\n")

In [None]:
def random_sample(df, n, random_state):
    """
    Split ``df`` into ``n`` randomly sampled rows and the rows that were not sampled.

    Parameters:
        df: input dataframe.
        n: number of rows to sample (without replacement).
        random_state: seed or random state forwarded to pandas' ``sample``.

    Returns a tuple of two dataframes:
    the first dataframe contains n randomly sampled rows from the input dataframe without replacement,
    the second dataframe contains the remaining rows that were not sampled, in their original order.

    The split is done by row position, so it is also correct when ``df`` has
    duplicate index labels (dropping by label would remove every row sharing a
    sampled label, not just the sampled ones).
    """
    n_rows = len(df)

    # Sample row positions instead of labels. Sampling a helper Series of the same
    # length with the same random_state picks the same positions that
    # df.sample(n=n, replace=False, random_state=random_state) would pick.
    positions = (
        pd.Series(np.arange(n_rows))
        .sample(n=n, replace=False, random_state=random_state)
        .to_numpy()
    )
    sample_df = df.iloc[positions]

    # Everything that was not picked, selected positionally
    not_sampled = np.ones(n_rows, dtype=bool)
    not_sampled[positions] = False
    remaining_df = df.iloc[not_sampled]

    return sample_df, remaining_df

In [None]:
def count_unique_elements(my_list):
    """
    Summarise the distinct values contained in ``my_list``.

    Returns a tuple ``(count, unique_list, max_vocab)``:
    ``unique_list`` is the ascending-sorted list of distinct elements,
    ``count`` is its length,
    ``max_vocab`` is its largest element, or ``None`` when ``my_list`` is empty.

    Elements must be hashable and mutually comparable.
    """
    # A set removes duplicates in one pass; sorting once replaces the per-element
    # membership scan / re-sort of a growing list.
    unique_list = sorted(set(my_list))

    # The list is sorted ascending, so the maximum is the last element.
    # Empty input has no maximum; return None instead of failing.
    max_vocab = unique_list[-1] if unique_list else None

    return len(unique_list), unique_list, max_vocab

# Import data

In [None]:
df = pd.read_csv(DATA_PATH)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604519 entries, 0 to 604518
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   age      604519 non-null  int64
 1   sex      604519 non-null  int64
 2   marst    604519 non-null  int64
 3   classwk  604519 non-null  int64
dtypes: int64(4)
memory usage: 18.4 MB


In [None]:
df.head()

Unnamed: 0,age,sex,marst,classwk
0,68,2,1,1
1,17,1,1,0
2,35,1,1,1
3,71,1,2,1
4,59,2,2,3


In [None]:
ordered_col = ['sex', 'age', 'marst', 'classwk']
df = df[ordered_col]

In [None]:
df.head()

Unnamed: 0,sex,age,marst,classwk
0,2,68,1,1
1,1,17,1,0
2,1,35,1,1
3,1,71,2,1
4,2,59,2,3


# Assign new values to the categorical variables (add a fixed offset per column)



In [None]:
# Fixed offset added to each categorical column (age is left unchanged).
# With the values present in this dataset the shifted columns do not overlap
# with each other or with the raw age values.
TOKEN_OFFSETS = {'sex': 200, 'marst': 300, 'classwk': 400}

# Guard against running this cell twice on the same `df`: a second run would add
# the offsets again and silently corrupt the token ids. A column whose maximum is
# already at or above its offset has been shifted (or would collide anyway).
already_shifted = [
    col for col, offset in TOKEN_OFFSETS.items() if df[col].max() >= offset
]
if already_shifted:
    raise ValueError(
        f"Columns {already_shifted} already contain values >= their token offset; "
        "reload the raw data before re-running this cell."
    )

# assign() builds a new frame (same column order) instead of writing into a
# frame that was derived from another one by column selection
df = df.assign(**{col: df[col] + offset for col, offset in TOKEN_OFFSETS.items()})

In [None]:
df

Unnamed: 0,sex,age,marst,classwk
0,202,68,301,401
1,201,17,301,400
2,201,35,301,401
3,201,71,302,401
4,202,59,302,403
...,...,...,...,...
604514,201,46,302,403
604515,201,62,302,401
604516,202,60,302,403
604517,201,23,301,400


In [None]:
# export the tokenized dataset
df.to_csv(H_POP_PATH, header=True, index=False)

# Split train, validation, test sets

In [None]:
len(df)

604519

In [None]:
# Split fractions. The test share is taken from the whole dataset; the train share
# is taken from what remains after the test rows are set aside, and the
# validation split receives all rows left after that.
TEST_FRACTION = 0.1
TRAIN_FRACTION_OF_REMAINDER = 0.8

test_size = int(np.floor(TEST_FRACTION * len(df)))
remaining_size = len(df) - test_size
train_size = int(np.floor(TRAIN_FRACTION_OF_REMAINDER * remaining_size))
# remainder, so the three sizes always sum to len(df)
valid_size = remaining_size - train_size

In [None]:
print(f"test_size: {test_size}")
print(f"train_size: {train_size}")
print(f"valid_size: {valid_size}")

test_size: 60451
train_size: 435254
valid_size: 108814


In [None]:
test_size + train_size + valid_size == len(df)

True

In [None]:
# randomly select "testing set" from the whole raw dataset
test_df, remaining_df = random_sample(df, n=test_size, random_state=2023)

In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60451 entries, 305426 to 381520
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   sex      60451 non-null  int64
 1   age      60451 non-null  int64
 2   marst    60451 non-null  int64
 3   classwk  60451 non-null  int64
dtypes: int64(4)
memory usage: 2.3 MB


In [None]:
remaining_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 544068 entries, 1 to 604518
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   sex      544068 non-null  int64
 1   age      544068 non-null  int64
 2   marst    544068 non-null  int64
 3   classwk  544068 non-null  int64
dtypes: int64(4)
memory usage: 20.8 MB


In [None]:
# Split training and validation sets
train_df, valid_df = random_sample(remaining_df, n=train_size, random_state=2023)

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 435254 entries, 182410 to 257976
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   sex      435254 non-null  int64
 1   age      435254 non-null  int64
 2   marst    435254 non-null  int64
 3   classwk  435254 non-null  int64
dtypes: int64(4)
memory usage: 16.6 MB


In [None]:
valid_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108814 entries, 3 to 604509
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   sex      108814 non-null  int64
 1   age      108814 non-null  int64
 2   marst    108814 non-null  int64
 3   classwk  108814 non-null  int64
dtypes: int64(4)
memory usage: 4.2 MB


In [None]:
print(f'train size: {len(train_df)}')
print(f'validation size: {len(valid_df)}')
print(f'test size: {len(test_df)}')

train size: 435254
validation size: 108814
test size: 60451


In [None]:
# Save the testing, training, validation sets as CSV
test_df.to_csv(TEST_DF_PATH, header=True, index=False)

train_df.to_csv(TRAIN_DF_PATH, header=True, index=False)

valid_df.to_csv(VALID_DF_PATH, header=True, index=False)

In [None]:
# Flatten each split row by row into one long list of tokens
# (the columns of one record stay next to each other)
train_list = train_df.to_numpy().ravel().tolist()
valid_list = valid_df.to_numpy().ravel().tolist()

# Preview the first 20 tokens of each list
print("Example of the train list:")
print(train_list[:20])
print("#--------------------#")
print("Example of the valid list:")
print(valid_list[:20])
print("#--------------------#")


# The same token sequences as 1-D integer tensors
train_data = torch.tensor(train_list, dtype=torch.long)
val_data = torch.tensor(valid_list, dtype=torch.long)

Example of the train list:
[201, 56, 302, 401, 202, 17, 301, 400, 202, 6, 301, 400, 202, 79, 304, 400, 201, 20, 301, 400]
#--------------------#
Example of the valid list:
[201, 71, 302, 401, 202, 36, 301, 403, 201, 44, 302, 400, 201, 20, 301, 400, 202, 34, 302, 403]
#--------------------#


In [None]:
print(train_data[-8:])
print(train_data[:8])
print(val_data[:8])
print(val_data[-8:])

tensor([202,   3, 301, 400, 201,  42, 302, 401])
tensor([201,  56, 302, 401, 202,  17, 301, 400])
tensor([201,  71, 302, 401, 202,  36, 301, 403])
tensor([201,  15, 301, 400, 202,   8, 301, 400])


# Vocab_size

In [None]:
# calculate the vocab_size for training model later on
# vocab_size  = number of distinct tokens in the training list
# unique_list = those tokens in ascending order
# max_vocab   = the largest token value
# NOTE(review): only train_list is scanned, so a token that occurs only in the
# validation or test split would not be counted - confirm this is intended.
# NOTE(review): token values are not contiguous (vocab_size is smaller than
# max_vocab + 1), so anything indexed by raw token value presumably needs
# max_vocab + 1 slots - TODO confirm against the model code.
vocab_size, unique_list, max_vocab = count_unique_elements(train_list)
print(f"vocab_size: {vocab_size}")
print(f"unique_list: {unique_list}")
print(f"maximum in vocab: {max_vocab}")

vocab_size: 112
unique_list: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 201, 202, 300, 301, 302, 303, 304, 309, 400, 401, 402, 403, 409]
maximum in vocab: 409


In [None]:
# Model configuration: sizes derived from the data plus fixed training hyperparameters
dictionary = {
    # derived from the tokenized training data
    "vocab_size": vocab_size,
    "max_vocab": max_vocab,
    # one token per dataframe column
    "block_size": df.shape[1],
    # fixed hyperparameters
    "batch_size": 32,
    "eval_interval": 10,
    "learning_rate": 0.0001,
    "eval_iters": 10,
    "n_embd": 512,
    "n_head": 8,
    "n_layer": 6,
    "dropout": 0.1,
}

# Serialize the configuration as pretty-printed JSON
json_object = json.dumps(dictionary, indent=4)

# Write it to the path given by MODEL_CONFIG
with open(MODEL_CONFIG, "w") as outfile:
    outfile.write(json_object)