References:
SDV is a public, source-available Python library for generating and evaluating synthetic data. You can download and use it under the Business Source License.

License: https://github.com/sdv-dev/SDV/blob/main/LICENSE

In [1]:
import pandas as pd

masterdf = pd.read_csv(
    '../data/clean_olist_data.csv'
)

masterdf.head()

Unnamed: 0,order_id,timestamp,user_id,customer_city,product_category,product_id,quantity,price,review_score
0,e481f51cbdc54678b7cc49136f2d6af7,1506941793,7c396fd4830fd04220f754e42b4e5bff,sao paulo,housewares,housewares SKU 0,1.0,29.99,4.0
1,53cdb2fc8bc7dce0b6741e2150273451,1532464897,af07308b275d755c9edb36a90c618231,barreiras,perfumery,perfumery SKU 0,1.0,118.7,4.0
2,47770eb9100c2d0c44946d9cf07ec65d,1533717529,3a653a41f6f9fc3d2a113cf8398680e8,vianopolis,auto,auto SKU 0,1.0,159.9,5.0
3,949d5b44dbf5de918fe9c16f97b45f8a,1511033286,7c142cf63193a1473d2e66489a9ae977,sao goncalo do amarante,pet_shop,pet_shop SKU 0,1.0,45.0,5.0
4,ad21c59c0840e6cb83a9ceb5573f8159,1518556719,72632f0f9dd73dfee390c9b22eb56dd6,santo andre,stationery,stationery SKU 0,1.0,19.9,5.0


In [2]:
masterdf.shape

(102425, 9)

In [3]:
masterdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102425 entries, 0 to 102424
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   order_id          102425 non-null  object 
 1   timestamp         102425 non-null  int64  
 2   user_id           102425 non-null  object 
 3   customer_city     102425 non-null  object 
 4   product_category  102425 non-null  object 
 5   product_id        102425 non-null  object 
 6   quantity          102425 non-null  float64
 7   price             102425 non-null  float64
 8   review_score      102425 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 7.0+ MB


In [4]:
masterdf[masterdf['order_id']=='ca3625898fbd48669d50701aba51cd5f']

Unnamed: 0,order_id,timestamp,user_id,customer_city,product_category,product_id,quantity,price,review_score
61603,ca3625898fbd48669d50701aba51cd5f,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,construction_tools_construction,construction_tools_construction SKU 52,1.0,33.9,3.0
61604,ca3625898fbd48669d50701aba51cd5f,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,construction_tools_construction,construction_tools_construction SKU 281,2.0,159.0,3.0
61605,ca3625898fbd48669d50701aba51cd5f,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,construction_tools_construction,construction_tools_construction SKU 146,1.0,309.0,3.0
61606,ca3625898fbd48669d50701aba51cd5f,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,construction_tools_construction,construction_tools_construction SKU 282,1.0,63.7,3.0
61607,ca3625898fbd48669d50701aba51cd5f,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,construction_tools_construction,construction_tools_construction SKU 239,2.0,56.0,3.0
61608,ca3625898fbd48669d50701aba51cd5f,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,construction_tools_construction,construction_tools_construction SKU 38,1.0,109.9,3.0
61609,ca3625898fbd48669d50701aba51cd5f,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,construction_tools_construction,construction_tools_construction SKU 28,1.0,95.9,3.0
61610,ca3625898fbd48669d50701aba51cd5f,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,construction_tools_construction,construction_tools_construction SKU 29,1.0,95.9,3.0


In [5]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(masterdf)
if metadata.validate() is not None:
    print("Problems with metadata auto-detection!")
python_dict = metadata.to_dict()
python_dict

{'columns': {'order_id': {'sdtype': 'unknown', 'pii': True},
  'timestamp': {'sdtype': 'numerical'},
  'user_id': {'sdtype': 'unknown', 'pii': True},
  'customer_city': {'sdtype': 'city', 'pii': True},
  'product_category': {'sdtype': 'categorical'},
  'product_id': {'sdtype': 'unknown', 'pii': True},
  'quantity': {'sdtype': 'numerical'},
  'price': {'sdtype': 'numerical'},
  'review_score': {'sdtype': 'numerical'}},
 'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1'}

In [6]:
all_cols = metadata.get_column_names()
all_cat_cols = list(set(all_cols) - set(['order_id']))

In [7]:
from sdv.single_table import GaussianCopulaSynthesizer

metadata.update_columns(
    column_names=all_cat_cols,
    sdtype='categorical',
)
synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
)
my_constraint = {
    'constraint_class': 'FixedCombinations',
    'constraint_parameters': {
        'column_names': ['timestamp', 'user_id', 'customer_city']
    }
}
synthesizer.add_constraints(constraints=[
    my_constraint
])
synthesizer.fit(masterdf)
synthetic_data = synthesizer.sample(num_rows=10000000)
synthetic_data.head()

Sampling rows: 100%|██████████| 10000000/10000000 [04:15<00:00, 39137.52it/s]


Unnamed: 0,order_id,timestamp,user_id,customer_city,product_category,product_id,quantity,price,review_score
0,sdv-pii-m7p96,1519589426,6e64dc929d62442c813375d68c4e0c2e,paulinia,luggage_accessories,cine_photo SKU 21,1.0,15.9,5.0
1,sdv-pii-g7lj0,1516314925,2010254934075418afe8437eb709a885,candeias,bed_bath_table,bed_bath_table SKU 2242,1.0,27.9,5.0
2,sdv-pii-g2nag,1512502115,aa811400e154ab32b7f17bbf8084d564,belo horizonte,computers_accessories,computers_accessories SKU 628,1.0,55.0,5.0
3,sdv-pii-m3eiw,1525725412,752973ef5ea4849ddb6f8091746af022,sao paulo,auto,computers_accessories SKU 725,1.0,17.45,5.0
4,sdv-pii-oxl31,1510947925,b698379306c3fad0ab64d82a63a770a2,leme,furniture_decor,sports_leisure SKU 2078,1.0,81.49,5.0


In [8]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata.load_from_dict(python_dict)
if metadata.validate() is not None:
    print("Problems with resetting metadata detected!")

In [9]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic_report = run_diagnostic(
    real_data=masterdf,
    synthetic_data=synthetic_data,
    metadata=metadata
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 9/9 [00:00<00:00, 12.31it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 460.66it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [10]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=masterdf,
    synthetic_data=synthetic_data,
    metadata=metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:05<00:00,  1.78it/s]|
Column Shapes Score: 99.95%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:25<00:00,  1.43it/s]|
Column Pair Trends Score: 97.54%

Overall Score (Average): 98.74%



In [11]:
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Score
0,timestamp,KSComplement,0.999795
1,product_category,TVComplement,0.998906
2,quantity,KSComplement,0.999702
3,price,KSComplement,0.999698
4,review_score,KSComplement,0.99926


In [20]:
import hashlib

synthetic_data['combined_values'] = str(synthetic_data['timestamp']) + '-' + synthetic_data['user_id'] + '-' + synthetic_data['customer_city']
synthetic_data['order_id'] = synthetic_data['combined_values'].apply(lambda x: hashlib.sha256(x.encode()).hexdigest()[:32])
synthetic_data = synthetic_data.drop(columns=['combined_values'])
synthetic_data.drop_duplicates(inplace=True)

In [21]:
synthetic_data[['timestamp', 'user_id', 'customer_city']].value_counts().reset_index().loc[0]

timestamp                              1534039880
user_id          c8ed31310fc440a3f8031b177f9842c3
customer_city                                ipua
count                                         812
Name: 0, dtype: object

In [22]:
synthetic_data[synthetic_data["user_id"] == "c8ed31310fc440a3f8031b177f9842c3"]

Unnamed: 0,order_id,timestamp,user_id,customer_city,product_category,product_id,quantity,price,review_score
24,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,musical_instruments,pet_shop SKU 72,1.0,8.99,3.0
10359,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,toys,costruction_tools_garden SKU 15,1.0,140.00,4.0
16760,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,health_beauty,health_beauty SKU 770,1.0,89.90,4.0
27150,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,fashion_bags_accessories,bed_bath_table SKU 2478,1.0,129.90,4.0
40015,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,computers_accessories,sports_leisure SKU 824,1.0,16.17,5.0
...,...,...,...,...,...,...,...,...,...
9955725,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,cool_stuff,small_appliances SKU 134,1.0,24.90,3.0
9961376,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,housewares,perfumery SKU 0,1.0,16.90,5.0
9962153,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,furniture_decor,computers_accessories SKU 221,1.0,89.90,5.0
9984023,3da13f7c136e75165df54af387530460,1534039880,c8ed31310fc440a3f8031b177f9842c3,ipua,furniture_decor,home_construction SKU 200,1.0,23.00,3.0


In [23]:
len(synthetic_data)

9999978

In [24]:
synthetic_data.to_csv('../data/all_transaction_data.csv', index=False)