> The PARSynthesizer is designed to work on multi-sequence data, which means that there are multiple sequences (usually belonging to different entities) present within the same dataset. This means that your metadata should include a sequence_key. Using this information, the PARSynthesizer creates brand new entities and brand new sequences for each one.

> If your dataset contains only a single sequence of data, then the PARSynthesizer is not suited for your dataset.

In [1]:
from pathlib import Path
import pandas as pd

from sdv.metadata import Metadata
from sdv.utils import get_random_sequence_subset
from sdv.sequential import PARSynthesizer

## California

In [2]:
# Directory holding the raw California training tables.
california_data_dir = Path('data/california/train')
# Backward-compatible alias: later cells still refer to the directory by this
# misspelled name ("califonia"), so it must keep working.
califonia_data_dir = california_data_dir

In [3]:
# Load the raw California tables, keyed by table name. Each table is read from
# <data dir>/<table>/<table>.parquet.
california_data = {
    table: pd.read_parquet(califonia_data_dir / table / f'{table}.parquet')
    for table in ('household', 'individual')
}

In [4]:
# Preview the first three rows of the household table.
california_data['household'].head(3)

Unnamed: 0,household_id,FARM,OWNERSHP,ACREHOUS,TAXINCL,PROPINSR,COSTELEC,VALUEH,ROOMS,PLUMBING,PUMA
0,2408216,0,2,1,0,0,8,99,4,2,0
1,2408218,0,0,0,0,0,0,99,0,0,0
2,2408219,0,2,0,0,0,6,99,4,2,0


In [5]:
# Preview the first three rows of the individual table.
california_data['individual'].head(3)

Unnamed: 0,individual_id,RELATE,SEX,AGE,MARST,RACE,CITIZEN,SPEAKENG,SCHOOL,EDUC,GRADEATT,SCHLTYPE,EMPSTAT,CLASSWKR,INCTOT,DISABWRK,household_id
0,2,0,0,45,0,0,0,2,1,6,0,1,1,1,6,1,2408216
1,3,1,1,42,0,0,0,2,1,8,0,1,1,2,3,1,2408216
2,4,5,0,7,5,0,0,2,2,1,3,2,0,0,0,0,2408216


First, we need to join the context table (household) with its corresponding sequences (individual).

In [6]:
# Attach the household (context) columns to every individual row, then shuffle
# the rows and renumber the index.
# The shuffle uses a fixed seed so that the frame saved to disk later in this
# notebook is identical from run to run.
# NOTE(review): the shuffle also scrambles the row order within each household;
# confirm that the order of individuals is not meant to carry information.
california_merged_data = (
    pd.merge(california_data['household'], california_data['individual'], on="household_id")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
california_merged_data.head(3)

Unnamed: 0,household_id,FARM,OWNERSHP,ACREHOUS,TAXINCL,PROPINSR,COSTELEC,VALUEH,ROOMS,PLUMBING,...,CITIZEN,SPEAKENG,SCHOOL,EDUC,GRADEATT,SCHLTYPE,EMPSTAT,CLASSWKR,INCTOT,DISABWRK
0,2408216,0,2,1,0,0,8,99,4,2,...,0,2,1,6,0,1,1,1,6,1
1,2408216,0,2,1,0,0,8,99,4,2,...,0,2,1,8,0,1,1,2,3,1
2,2408216,0,2,1,0,0,8,99,4,2,...,0,2,2,1,3,2,0,0,0,0


In [7]:
# Names of the columns contributed by the household table, without the
# household_id key.
household_columns = california_data['household'].columns
california_context_cols = household_columns.drop('household_id').tolist()

In [8]:
# Auto-detect the metadata of the merged frame and register it as a table
# named 'california'.
california_metadata = Metadata.detect_from_dataframe(
    data=california_merged_data,
    table_name='california',
)

In [9]:
# Declare household_id as an 'id' column first, then use it as the sequence key
# of the 'california' table.
california_metadata.update_column(
    table_name='california',
    column_name='household_id',
    sdtype='id',
)
california_metadata.set_sequence_key(
    table_name='california',
    column_name='household_id',
)

In [10]:
# Display the metadata to inspect the keys and the detected sdtypes.
california_metadata

{
    "tables": {
        "california": {
            "primary_key": "individual_id",
            "sequence_key": "household_id",
            "columns": {
                "household_id": {
                    "sdtype": "id"
                },
                "FARM": {
                    "sdtype": "categorical"
                },
                "OWNERSHP": {
                    "sdtype": "categorical"
                },
                "ACREHOUS": {
                    "sdtype": "categorical"
                },
                "TAXINCL": {
                    "sdtype": "categorical"
                },
                "PROPINSR": {
                    "sdtype": "numerical"
                },
                "COSTELEC": {
                    "sdtype": "numerical"
                },
                "VALUEH": {
                    "sdtype": "numerical"
                },
                "ROOMS": {
                    "sdtype": "categorical"
                },
                "PLUMBING": {

In [11]:
# Run the metadata validation before the metadata is written to disk.
california_metadata.validate()

In [12]:
# Write the merged California data and its metadata side by side.
# NOTE(review): the baseball cell writes under 'data/preprocessed_data/'
# (no 'sdv_' prefix) - confirm which output root is intended.
california_out_path = Path('data/sdv_preprocessed_data/california/')
california_out_path.mkdir(exist_ok=True, parents=True)
california_merged_data.to_parquet(california_out_path / 'data.parquet')

# In its default mode save_to_json refuses to write over an existing file,
# while to_parquet above silently overwrites. Remove a metadata file left by a
# previous run so a re-run cannot leave new data next to stale metadata.
california_metadata_file = california_out_path / 'metadata.json'
california_metadata_file.unlink(missing_ok=True)
california_metadata.save_to_json(filepath=california_metadata_file)

## Baseball

In [13]:
# Directory holding the raw baseball training tables.
baseball_data_dir = Path('data/baseball/train')

In [14]:
# Load the raw baseball tables, keyed by table name. Each table is read from
# <data dir>/<table>/<table>.parquet.
baseball_data = {
    table: pd.read_parquet(baseball_data_dir / table / f'{table}.parquet')
    for table in ('players', 'fielding')
}

In [15]:
# Preview the first three rows of the players table.
baseball_data['players'].head(3)

Unnamed: 0,playerID,birthCountry,birthDate,deathDate,nameFirst,nameLast,weight,height,bats,throws
0,aardsda01,USA,1981-12-27,,David,Aardsma,215.0,75.0,R,R
1,aaronha01,USA,1934-02-05,,Hank,Aaron,180.0,72.0,R,R
2,aaronto01,USA,1939-08-05,1984-08-16,Tommie,Aaron,190.0,75.0,R,R


In [16]:
# Preview the first three rows of the fielding table.
baseball_data['fielding'].head(3)

Unnamed: 0,playerID,yearID,teamID,lgID,POS,G,GS,InnOuts,PO,A,E,DP
0,abercda01,1871,TRO,,SS,1,1.0,24.0,1,3,2.0,0
1,addybo01,1871,RC1,,2B,22,22.0,606.0,67,72,42.0,5
2,addybo01,1871,RC1,,SS,3,3.0,96.0,8,14,7.0,0


First, we need to join the context table (players) with its corresponding sequences (fielding).

In [17]:
# Attach the player (context) columns to every fielding row, then shuffle the
# rows and renumber the index.
# The shuffle uses a fixed seed so that the frame saved to disk later in this
# notebook is identical from run to run.
# NOTE(review): the shuffle also scrambles the row order within each player's
# sequence, and the metadata sets no sequence_index (e.g. yearID) - confirm
# that discarding the original order is intended.
baseball_merged_data = (
    pd.merge(baseball_data['players'], baseball_data['fielding'], on="playerID")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
baseball_merged_data.head(3)

Unnamed: 0,playerID,birthCountry,birthDate,deathDate,nameFirst,nameLast,weight,height,bats,throws,...,teamID,lgID,POS,G,GS,InnOuts,PO,A,E,DP
0,aardsda01,USA,1981-12-27,,David,Aardsma,215.0,75.0,R,R,...,SFN,NL,P,11,0.0,32.0,0,0,0.0,0
1,aardsda01,USA,1981-12-27,,David,Aardsma,215.0,75.0,R,R,...,CHN,NL,P,45,0.0,159.0,1,5,0.0,1
2,aardsda01,USA,1981-12-27,,David,Aardsma,215.0,75.0,R,R,...,CHA,AL,P,25,0.0,97.0,2,4,1.0,0


In [18]:
# Names of the columns contributed by the players table, without the playerID
# key.
player_columns = baseball_data['players'].columns
baseball_context_cols = player_columns.drop('playerID').tolist()

In [19]:
# Auto-detect the metadata of the merged frame and register it as a table
# named 'baseball'.
baseball_metadata = Metadata.detect_from_dataframe(
    data=baseball_merged_data,
    table_name='baseball',
)

In [20]:
# Declare playerID as an 'id' column first, then use it as the sequence key of
# the 'baseball' table.
baseball_metadata.update_column(
    table_name='baseball',
    column_name='playerID',
    sdtype='id',
)
baseball_metadata.set_sequence_key(
    table_name='baseball',
    column_name='playerID',
)

In [21]:
# Display the metadata to inspect the keys and the detected sdtypes.
baseball_metadata

{
    "tables": {
        "baseball": {
            "sequence_key": "playerID",
            "columns": {
                "playerID": {
                    "sdtype": "id"
                },
                "birthCountry": {
                    "sdtype": "categorical"
                },
                "birthDate": {
                    "sdtype": "datetime",
                    "datetime_format": "%Y-%m-%d"
                },
                "deathDate": {
                    "sdtype": "datetime",
                    "datetime_format": "%Y-%m-%d"
                },
                "nameFirst": {
                    "sdtype": "categorical"
                },
                "nameLast": {
                    "sdtype": "categorical"
                },
                "weight": {
                    "sdtype": "numerical"
                },
                "height": {
                    "sdtype": "numerical"
                },
                "bats": {
                    "sdtype": "categori

In [22]:
# Run the metadata validation before the metadata is written to disk.
baseball_metadata.validate()

In [23]:
# Write the merged baseball data and its metadata side by side.
# NOTE(review): the California cell writes under 'data/sdv_preprocessed_data/'
# (with an 'sdv_' prefix) - confirm which output root is intended.
baseball_out_path = Path('data/preprocessed_data/baseball/')
baseball_out_path.mkdir(exist_ok=True, parents=True)
baseball_merged_data.to_parquet(baseball_out_path / 'data.parquet')

# In its default mode save_to_json refuses to write over an existing file,
# while to_parquet above silently overwrites. Remove a metadata file left by a
# previous run so a re-run cannot leave new data next to stale metadata.
baseball_metadata_file = baseball_out_path / 'metadata.json'
baseball_metadata_file.unlink(missing_ok=True)
baseball_metadata.save_to_json(filepath=baseball_metadata_file)

In [24]:
# Use this function to subsample data from your dataset. Given multi-sequence data, this function will randomly select sequences 
# and clip them to the desired length.
# baseball_merged_data = get_random_sequence_subset(
#     baseball_merged_data, 
#     baseball_metadata,
#     # num_sequences=100,
#     # max_sequence_length=None,
#     # long_sequence_subsampling_method='first_rows',
# )

In [25]:
# # Step 1: Create the synthesizer
# synthesizer = PARSynthesizer(baseball_metadata, context_columns=baseball_context_cols, cuda=True, verbose=True)
# # Step 2: Train the synthesizer
# synthesizer.fit(baseball_merged_data)
# # Step 3: Generate synthetic data
# synthetic_baseball_data = synthesizer.sample()