In [5]:
# 1. Log a dataset version as an artifact
import os
import wandb

from components.data_extraction import fetch_data
# Track the extraction job as a W&B run. Using the run as a context manager
# finishes it on exit, including when fetch_data() or the upload raises, so a
# failed cell does not leave a run open.
with wandb.init(project="mlops-test", job_type="data-extraction") as run:
    # Fetch the dataset via components.data_extraction.fetch_data; the result
    # is used as a pandas DataFrame below (its .columns attribute is read).
    df = fetch_data()

    # Wrap the frame in a W&B Table and store it in the 'raw-dataset' artifact
    # under the entry name 'raw-dataset'; readers fetch the table back with
    # artifact.get("raw-dataset").
    dataset_artifact = wandb.Artifact('raw-dataset', type='dataset')
    dataset_table = wandb.Table(data=df, columns=df.columns)
    dataset_artifact.add(dataset_table, 'raw-dataset')
    run.log_artifact(dataset_artifact)

VBox(children=(Label(value='8.463 MB of 8.463 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [25]:
# Open a run for the download job; the context manager finishes the run on
# exit so it is not left open after this cell.
with wandb.init(project="mlops-test", job_type="download-data") as run:
    # Declare the latest 'raw-dataset' version as an input of this run
    artifact = run.use_artifact('raw-dataset:latest')
    # Actually pull the artifact's files down: download() returns the path of
    # the local directory, which is what the name `artifact_dir` promises.
    artifact_dir = artifact.download()

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import wandb

import pkg_resources
import importlib
importlib.reload(pkg_resources)

import tensorflow as tf
import tensorflow_data_validation as tfdv
from sklearn.model_selection import train_test_split
from tensorflow_data_validation.utils.display_util import get_statistics_html

def _log_table_artifact(run, name, artifact_type, frame, entry_name):
    """Log a DataFrame as a W&B Table inside a newly created artifact.

    Args:
        run: Active W&B run that records the artifact.
        name: Name of the artifact to create.
        artifact_type: Value passed as the artifact's ``type``.
        frame: pandas DataFrame converted into a ``wandb.Table``.
        entry_name: Name under which the table is added to the artifact.
    """
    table_artifact = wandb.Artifact(name, type=artifact_type)
    table_artifact.add(wandb.Table(data=frame, columns=frame.columns), entry_name)
    run.log_artifact(table_artifact)


# Run the whole validation job inside the run's context manager so the run is
# finished on exit, even when one of the steps below raises.
with wandb.init(project="mlops-test", job_type="data-validation") as run:
    # Rebuild the dataset from the table stored in the latest 'raw-dataset' artifact
    artifact = run.use_artifact('raw-dataset:latest')
    artifact_data = artifact.get("raw-dataset")
    df = pd.DataFrame(columns=artifact_data.columns, data=artifact_data.data)

    # Drop the columns excluded from validation (non-mutating, so re-running
    # this step never operates on an already-modified frame)
    df = df.drop(columns=['_id', 'title', 'url', 'post_date'])

    # Stratified splits on 'branch': 10% test first, then 15% of the remainder
    # as validation (overall 76.5% train / 13.5% val / 10% test).
    train_df, test_df = train_test_split(df, train_size=0.9, shuffle=True, random_state=43,
                                         stratify=df['branch'])
    train_df, val_df = train_test_split(train_df, train_size=0.85, shuffle=True, random_state=43,
                                        stratify=train_df['branch'])

    # Generate TFDV statistics for each split
    train_stats = tfdv.generate_statistics_from_dataframe(dataframe=train_df)
    eva_stats = tfdv.generate_statistics_from_dataframe(dataframe=val_df)
    serving_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_df)

    # Log the validation-vs-train statistics comparison as an HTML artifact
    stats_html = get_statistics_html(lhs_statistics=eva_stats,
                                     rhs_statistics=train_stats,
                                     lhs_name='VAL_DATASET',
                                     rhs_name='TRAIN_DATASET')
    stats_artifact = wandb.Artifact('statistic', type='Statistic')
    stats_artifact.add(wandb.Html(data=stats_html), 'Statistic')
    run.log_artifact(stats_artifact)

    # Infer the schema from the training statistics only
    schema = tfdv.infer_schema(statistics=train_stats)
    schema_df_result = tfdv.utils.display_util.get_schema_dataframe(schema=schema)

    # Log both schema frames: index 1 as 'categorical-schema', index 0 as 'data-schema'
    categorical_schema = schema_df_result[1].reset_index()
    _log_table_artifact(run, 'categorical-schema', 'Schema',
                        categorical_schema, 'categorical-schema-table')

    data_schema = schema_df_result[0].reset_index()
    _log_table_artifact(run, 'data-schema', 'Schema',
                        data_schema, 'data-schema-table')

    # Validate the validation and test statistics against the inferred schema
    val_anomalies = tfdv.validate_statistics(
        statistics=eva_stats,
        schema=schema
    )
    val_anomalies = tfdv.utils.display_util.get_anomalies_dataframe(val_anomalies).reset_index()

    serving_anomalies = tfdv.validate_statistics(serving_stats, schema)
    serving_anomalies = tfdv.utils.display_util.get_anomalies_dataframe(serving_anomalies).reset_index()

    # Log the anomaly reports as run-level tables
    run.log({"Val anomalies": wandb.Table(data=val_anomalies, columns=val_anomalies.columns)})
    run.log({"Serving anomalies": wandb.Table(data=serving_anomalies,
                                              columns=serving_anomalies.columns)})

VBox(children=(Label(value='0.060 MB of 0.060 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.001 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.938248…

In [5]:
import wandb
import pandas as pd
from sklearn.model_selection import train_test_split


# Read the dataset inside a context-managed run so the run is finished as soon
# as the data is in memory (and also if the download fails).
with wandb.init(project="mlops-test", job_type="download-data") as run:
    # Rebuild the dataset from the table stored in the latest 'raw-dataset' artifact
    artifact = run.use_artifact('raw-dataset:latest')
    artifact_data = artifact.get("raw-dataset")
    df = pd.DataFrame(columns=artifact_data.columns, data=artifact_data.data)

# Drop the columns that are not kept in the exported splits (non-mutating)
df = df.drop(columns=['_id', 'title', 'url', 'post_date'])

# Stratified splits on 'branch': 10% test first, then 15% of the remainder as
# validation (overall 76.5% train / 13.5% val / 10% test).
train_df, test_df = train_test_split(df, train_size=0.9, shuffle=True, random_state=43,
                                     stratify=df['branch'])
train_df, val_df = train_test_split(train_df, train_size=0.85, shuffle=True, random_state=43,
                                    stratify=train_df['branch'])

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [6]:
# Preview the first five rows of the training split
train_df.head(n=5)

Unnamed: 0,year,price,location,branch,model,origin,km_driven,external_color,internal_color,num_seats,fuels,engine_capacity,gearbox,wheel_drive,car_type
2434,2019,599,Hải Phòng,Mazda,3,domestic,33000,Trắng,Đen,5,gasoline,1.5,automatic,FWD,sedan
24169,2017,640,TP HCM,Toyota,Innova,domestic,69600,Bạc,Nâu,8,gasoline,2.0,automatic,RWD,crossover
23476,2017,640,TP HCM,Toyota,Innova,domestic,64000,Đồng,Cam,8,gasoline,2.0,automatic,RWD,crossover
26435,2013,368,Bình Dương,Toyota,Innova,domestic,0,Bạc,Ghi,8,gasoline,2.0,manual,RWD,crossover
7907,2021,4099,Hà Nội,Mercedes Benz,S class,domestic,5000,Trắng,Kem,5,gasoline,3.0,automatic,RWD,sedan


In [7]:
# Persist the training split; the DataFrame index is written as the first CSV column
train_df.to_csv(path_or_buf='train_data.csv')

In [8]:
# Persist the validation split; the DataFrame index is written as the first CSV column
val_df.to_csv(path_or_buf='val_data.csv')

In [9]:
# Persist the test split; the DataFrame index is written as the first CSV column
test_df.to_csv(path_or_buf='test_data.csv')