In [2]:
# 1. Infer a TFDV schema from the raw dataset and log it to W&B as artifacts
import os
import wandb
import pandas as pd
import tensorflow_data_validation as tfdv
from sklearn.model_selection import train_test_split

from components.data_extraction import fetch_data
def _log_schema_table(run, frame, artifact_name, table_name):
    """Log ``frame`` to ``run`` as a W&B table wrapped in a 'Schema' artifact.

    Args:
        run: The W&B run that should own the artifact.
        frame: DataFrame to upload; its columns become the table columns.
        artifact_name: Name of the W&B artifact to create.
        table_name: Name under which the table is stored inside the artifact.

    Returns:
        The ``wandb.Artifact`` that was logged.
    """
    schema_artifact = wandb.Artifact(artifact_name, type='Schema')
    schema_table = wandb.Table(data=frame, columns=frame.columns)
    schema_artifact.add(schema_table, table_name)
    run.log_artifact(schema_artifact)
    return schema_artifact


# Initialize a new W&B run to track this schema-inference job
run = wandb.init(project="mlops-test", job_type="infer-schema")

# Fetch the latest raw dataset table from W&B and rebuild it as a DataFrame
artifact = run.use_artifact('raw-dataset:latest')
artifact_data = artifact.get("raw-dataset")
df = pd.DataFrame(columns=artifact_data.columns, data=artifact_data.data)

# Keep only rows with usable categorical values. '-' and '' are presumably
# placeholders for missing data (TODO confirm against the extraction step);
# '4WD hoặc AWD' ("4WD or AWD") is a combined label and is excluded as well.
valid_rows = (
    (df['external_color'] != '-')
    & (df['internal_color'] != '-')
    & (df['fuels'] != '-')
    & (df['gearbox'] != '-')
    & (df['wheel_drive'] != '')
    & (df['wheel_drive'] != '4WD hoặc AWD')
)
# drop() returns a new frame, so the filtered slice is never mutated in place.
df = df.loc[valid_rows].drop(columns=['_id', 'title', 'url', 'post_date'])

# Hold out 10% of the rows (stratified on 'branch') and infer the schema from
# the training split only.
train_df, test_df = train_test_split(df, train_size=0.9, shuffle=True, random_state=43, stratify=df['branch'])
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=train_df)
schema = tfdv.infer_schema(statistics=train_stats)
# get_schema_dataframe returns a pair: [0] is the per-feature table, [1] the
# table of categorical domains and their values.
schema_df_result = tfdv.utils.display_util.get_schema_dataframe(schema=schema)

# Log both schema tables
categorical_schema = schema_df_result[1].reset_index()
_log_schema_table(run, categorical_schema, 'categorical-schema', 'categorical-schema-table')

data_schema = schema_df_result[0].reset_index()
_log_schema_table(run, data_schema, 'data-schema', 'data-schema-table')

# Log the schema as a text file. Create the output directory first so the
# write does not depend on the folder already existing on disk.
schema_path = 'artifacts_raw/schema.txt'
os.makedirs(os.path.dirname(schema_path), exist_ok=True)
tfdv.write_schema_text(schema=schema, output_path=schema_path)
artifact = wandb.Artifact('text-schema', type='Schema')
artifact.add_file(schema_path)
run.log_artifact(artifact)

wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [4]:
run = wandb.init(project="mlops-test", job_type="download-schema")

# Pull down the text schema logged by the schema-inference run
artifact = run.use_artifact('text-schema:latest')

# The schema was attached to the artifact as a plain file (add_file), and
# Artifact.get() only resolves logged W&B objects such as tables, so the file
# has to be downloaded and read from disk instead.
artifact_dir = artifact.download()
with open(os.path.join(artifact_dir, 'schema.txt'), encoding='utf-8') as schema_file:
    artifact_text = schema_file.read()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [10]:
# schema.txt is a plain file inside the artifact; Artifact.get() looks up
# logged W&B objects rather than raw files, so read the downloaded copy.
with open(os.path.join(artifact.download(), 'schema.txt'), encoding='utf-8') as schema_file:
    artifact_text = schema_file.read()

In [6]:
# Download the artifact's files locally; the returned value is used below as
# the directory that contains schema.txt.
artifact_dir = artifact.download()

In [12]:
# Parse the downloaded text schema back into a TFDV schema object.
downloaded_schema_path = os.path.join(artifact_dir, 'schema.txt')
schema = tfdv.load_schema_text(downloaded_schema_path)

In [11]:
# Show the value fetched from the artifact (a None value renders no output).
artifact_text

In [13]:
# Render the loaded schema: a per-feature table (type, presence, valency,
# domain) followed by a table of each domain's allowed values.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'year',INT,required,,-
'price',INT,required,,-
'location',STRING,required,,'location'
'branch',STRING,required,,'branch'
'model',BYTES,required,,-
'origin',STRING,required,,'origin'
'km_driven',INT,required,,-
'external_color',STRING,required,,'external_color'
'internal_color',STRING,required,,'internal_color'
'num_seats',INT,required,,-


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'location',"'An Giang', 'Bà Rịa Vũng Tàu', 'Bình Dương', 'Bình Phước', 'Bình Thuận', 'Bình Định', 'Bạc Liêu', 'Bắc Giang', 'Bắc Kạn', 'Bắc Ninh', 'Bến Tre', 'Cao Bằng', 'Cà Mau', 'Cần Thơ', 'Gia Lai', 'Hà Giang', 'Hà Nam', 'Hà Nội', 'Hà Tĩnh', 'Hòa Bình', 'Hưng Yên', 'Hải Dương', 'Hải Phòng', 'Hậu Giang', 'Khánh Hòa', 'Kiên Giang', 'Kon Tum', 'Lai Châu', 'Long An', 'Lào Cai', 'Lâm Đồng', 'Lạng Sơn', 'Nam Định', 'Nghệ An', 'Ninh Bình', 'Ninh Thuận', 'Phú Thọ', 'Phú Yên', 'Quảng Bình', 'Quảng Nam', 'Quảng Ngãi', 'Quảng Ninh', 'Quảng Trị', 'Sóc Trăng', 'Sơn La', 'TP HCM', 'Thanh Hóa', 'Thái Bình', 'Thái Nguyên', 'Thừa Thiên Huế', 'Tiền Giang', 'Trà Vinh', 'Tuyên Quang', 'Tây Ninh', 'Vĩnh Long', 'Vĩnh Phúc', 'Yên Bái', 'Điện Biên', 'Đà Nẵng', 'Đăk Lăk', 'Đăk Nông', 'Đồng Nai', 'Đồng Tháp'"
'branch',"'Acura', 'Audi', 'BMW', 'Chevrolet', 'Daewoo', 'Ford', 'Honda', 'Hyundai', 'Isuzu', 'Kia', 'LandRover', 'Lexus', 'MG', 'Mazda', 'Mercedes Benz', 'Mini', 'Mitsubishi', 'Nissan', 'Peugeot', 'Porsche', 'Subaru', 'Suzuki', 'Toyota', 'VinFast', 'Volkswagen', 'Volvo'"
'origin',"'domestic', 'imported'"
'external_color',"'Bạc', 'Cam', 'Cát', 'Ghi', 'Hồng', 'Kem', 'Màu khác', 'Nhiều màu', 'Nâu', 'Trắng', 'Tím', 'Vàng', 'Xanh', 'Xám', 'Đen', 'Đỏ', 'Đồng'"
'internal_color',"'Bạc', 'Cam', 'Cát', 'Ghi', 'Hồng', 'Kem', 'Màu khác', 'Nhiều màu', 'Nâu', 'Trắng', 'Tím', 'Vàng', 'Xanh', 'Xám', 'Đen', 'Đỏ', 'Đồng'"
'fuels',"'diesel', 'electric', 'gasoline', 'hybrid'"
'gearbox',"'Số hỗn hợp', 'automatic', 'manual'"
'wheel_drive',"'4WD', 'AWD', 'FWD', 'RWD'"
'car_type',"'convertible', 'coupe', 'crossover', 'hatchback', 'pickup', 'sedan', 'suv', 'truck', 'van', 'wagon'"
