In [None]:
#Training libraries
import torch
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

#Evaluation metrics
from sdv.evaluation.single_table import evaluate_quality

### Train Model

This training uses the aggregated user statistics from the official Meta Kaggle dataset.

Link: https://www.kaggle.com/datasets/bwandowando/meta-kaggle-users-stats

In [None]:
# Columns to pull from the user-stats CSV; every other column in the file is skipped.
cols = [
    'UserId', 'UserName', 'RegisterDate', 'Country',
    'PerformanceTier', 'DatasetsPerformanceTier', 'NotebooksPerformanceTier',
    'Following', 'Followers', 'TopicsCreated',
    'BronzeCompetitionMedals', 'SilverCompetitionMedals', 'GoldCompetitionMedals',
    'LastContentShared', 'LastContentDate',
]

# Load only the first 100,000 data rows, restricted to the columns listed above.
real_data = pd.read_csv(
    'MetaKaggleUserStats.csv',
    usecols=cols,
    nrows=100000,
)

In [41]:
# Preview the first five rows of the loaded data.
real_data.head(n=5)

Unnamed: 0,UserId,UserName,RegisterDate,Country,PerformanceTier,DatasetsPerformanceTier,NotebooksPerformanceTier,Following,Followers,TopicsCreated,BronzeCompetitionMedals,SilverCompetitionMedals,GoldCompetitionMedals,LastContentShared,LastContentDate
0,1,kaggleteam,2011-03-24,<Unknown>,Kaggle Staff,Grand Master,Contributor,0,724,0,0,0,0,Dataset,2021-07-16 01:23:05
1,368,antgoldbloom,2010-01-20,United States,Expert,Expert,Expert,6,1236,71,0,0,0,Dataset,2024-11-01 13:35:36
2,381,iguyon,2010-01-29,United States,Expert,Contributor,Contributor,0,8,55,0,0,0,Topic,2014-09-28 07:19:59
3,383,davidstephan,2010-02-01,Australia,Novice,Novice,Novice,0,0,0,0,0,0,<None>,1970-01-01 00:00:00
4,384,gabewarren,2010-02-02,Australia,Novice,Novice,Novice,0,0,0,0,0,0,<None>,1970-01-01 00:00:00


In [None]:
# Build the table metadata by letting SDV infer each column's type from the DataFrame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_data)

# Override the detected type of UserName: treat it as an 'id' column and
# register it as the table's primary key.
# NOTE(review): UserId keeps whatever sdtype detection assigned to it — confirm
# that modelling it as a regular feature is intended.
metadata.update_column(column_name='UserName', sdtype='id')
metadata.set_primary_key('UserName')



In [None]:
# Configure the CTGAN synthesizer; no training happens until .fit() is called.
synthesizer = CTGANSynthesizer(
    metadata=metadata,
    epochs=200,             # number of training epochs
    enforce_rounding=True,
    verbose=True,
)



In [32]:
# Fit the configured synthesizer on the real data, bracketing the call with
# start/finish messages.
print("Starting CTGAN training on Mac M3 Pro...")
synthesizer.fit(data=real_data)
print("Training complete.")

Starting CTGAN training on Mac M3 Pro...


Gen. (-1.07) | Discrim. (0.32): 100%|██████████| 200/200 [32:35<00:00,  9.78s/it] 

Training complete.





### Generate Synthetic Data

In [None]:
# Rewind the sampler to the state it had right after fitting, so that re-running
# this cell regenerates the same synthetic rows instead of continuing the random
# stream (keeps the cell idempotent and the downstream evaluation repeatable).
synthesizer.reset_sampling()

# Generate as many synthetic rows as there are rows in the real data.
num_rows = len(real_data)
synthetic_data = synthesizer.sample(num_rows=num_rows)

In [40]:
# Preview the first five synthetic rows.
synthetic_data.head(n=5)

Unnamed: 0,UserId,UserName,RegisterDate,Country,PerformanceTier,DatasetsPerformanceTier,NotebooksPerformanceTier,Following,Followers,TopicsCreated,BronzeCompetitionMedals,SilverCompetitionMedals,GoldCompetitionMedals,LastContentShared,LastContentDate
0,68991,sdv-id-LaJKjY,2013-02-18,United States,Novice,Novice,Novice,0,0,0,0,0,0,<None>,1970-01-06 00:49:44
1,54447,sdv-id-sFXfFd,2012-11-06,United States,Expert,Contributor,Contributor,0,13,3,0,0,0,Submission,2018-11-06 03:44:07
2,50925,sdv-id-xyMvOE,2012-08-04,United States,Novice,Novice,Novice,0,0,0,0,0,0,<None>,1970-01-01 00:00:00
3,64946,sdv-id-UXJEsp,2012-09-06,Ireland,Novice,Novice,Novice,0,0,0,0,0,0,<None>,1970-01-01 00:00:00
4,78622,sdv-id-xLYwXS,2013-03-28,United States,Novice,Novice,Novice,0,0,0,0,0,0,<None>,1970-01-01 00:00:00


### Evaluate model

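`evaluate_quality` produces a quality report whose overall score (ranging from 0% to 100%) measures statistical fidelity; as the output below shows, it is the average of the Column Shapes and Column Pair Trends scores. It answers the question: "Mathematically, how close is this synthetic data to the original real data?"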

How to interpret the score:

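- ≥ 90%: Excellent. The data is statistically indistinguishable.
- 80–90%: Good. Suitable for Machine Learning and Analytics.
- 70–80%: Fair. The broad patterns are captured, but with noticeable deviations; more training or tuning may help.
- < 70%: Poor. The model likely didn't train long enough or the data is too complex.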

In [37]:
# Score the synthetic data against the real data using the shared metadata.
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

# Scale the score returned by get_score() by 100 and show it with two decimals.
overall_pct = quality_report.get_score() * 100
print(f"\nOverall Quality Score: {overall_pct:.2f}%")

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 49.80it/s]|
Column Shapes Score: 76.5%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 132.53it/s]|
Column Pair Trends Score: 73.7%

Overall Score (Average): 75.1%


Overall Quality Score: 75.10%
