In [50]:
# Notebook setup: silence warnings, load TensorFlow / TFDV, and print their versions.
import warnings
# Ignore every warning category for the rest of the session.
warnings.filterwarnings("ignore")

import pkg_resources
import importlib
# NOTE(review): pkg_resources is reloaded before TensorFlow/TFDV are imported; presumably a
# workaround for stale package metadata in the running kernel - confirm it is still needed.
importlib.reload(pkg_resources)

import tensorflow as tf
import tensorflow_data_validation as tfdv
# Record the library versions in the cell output.
print('TF version:', tf.__version__)
print('TFDV version:', tfdv.version.__version__)
# Helper used further down to obtain the statistics view as an HTML string.
from tensorflow_data_validation.utils.display_util import get_statistics_html

TF version: 2.6.2
TFDV version: 1.3.0


In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [52]:
# Load the raw dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='./bonbanh.csv')

In [53]:
# Remove the '_id', 'title', 'url' and 'post_date' columns.
# Reassigning (instead of inplace=True) together with errors='ignore' makes this cell safe to
# re-run: a second execution is a no-op rather than a KeyError on the already-dropped columns.
# Trade-off: a column that is missing from the CSV is silently skipped as well.
df = df.drop(columns=['_id', 'title', 'url', 'post_date'], errors='ignore')

In [54]:
train_df, test_df = train_test_split(df, train_size=0.9, shuffle=True, random_state=43, stratify=df['branch'])

In [55]:
train_df, val_df = train_test_split(train_df, train_size=0.85, shuffle=True, random_state=43, stratify=train_df['branch'])

In [56]:
# Report the number of rows in each split, in train / val / test order.
split_sizes = {
    "Train size: {}": len(train_df),
    "Val size: {}": len(val_df),
    "Test size: {}": len(test_df),
}
for template, row_count in split_sizes.items():
    print(template.format(row_count))

Train size: 22400
Val size: 3954
Test size: 2929


In [57]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,year,price,location,branch,model,origin,km_driven,external_color,internal_color,num_seats,fuels,engine_capacity,gearbox,wheel_drive,car_type
11151,2021,435,Hà Nội,Kia,Soluto,domestic,26000,Xanh,Nhiều màu,5,gasoline,1.4,automatic,FWD,sedan
13545,2013,185,Đồng Nai,Kia,K2700,domestic,1000000,Xanh,Ghi,3,diesel,,manual,RWD,truck
15257,2008,80,Tiền Giang,Hyundai,County,domestic,0,Xanh,Nhiều màu,29,diesel,,manual,RWD,van
10370,2016,398,Hải Dương,Kia,Rio,imported,58000,Trắng,Kem,5,gasoline,1.4,automatic,FWD,sedan
9457,2018,1830,Hà Nội,Mercedes Benz,GLC,domestic,50000,Trắng,Nâu,5,gasoline,2.0,automatic,AWD,suv


In [58]:
# Compute TFDV statistics over the training split.
train_stats = tfdv.generate_statistics_from_dataframe(train_df)

In [59]:
# Render the training statistics in the notebook.
tfdv.visualize_statistics(lhs_statistics=train_stats)

In [26]:
# Obtain the training-statistics view as an HTML string (written to disk in the next cell).
file = get_statistics_html(lhs_statistics=train_stats)

In [27]:
# Save the HTML string held in `file` to 'statics.html'.
# The encoding is pinned to UTF-8 so the bytes written do not depend on the platform's
# default locale encoding.
with open('statics.html', 'w', encoding='utf-8') as f:
    f.write(file)

In [60]:
# Infer a schema from the training statistics and show it.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'year',INT,required,,-
'price',INT,required,,-
'location',STRING,required,,'location'
'branch',STRING,required,,'branch'
'model',BYTES,required,,-
'origin',STRING,required,,'origin'
'km_driven',INT,required,,-
'external_color',STRING,required,,'external_color'
'internal_color',STRING,required,,'internal_color'
'num_seats',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'location',"'An Giang', 'Bà Rịa Vũng Tàu', 'Bình Dương', 'Bình Phước', 'Bình Thuận', 'Bình Định', 'Bạc Liêu', 'Bắc Giang', 'Bắc Kạn', 'Bắc Ninh', 'Bến Tre', 'Cao Bằng', 'Cà Mau', 'Cần Thơ', 'Gia Lai', 'Hà Giang', 'Hà Nam', 'Hà Nội', 'Hà Tĩnh', 'Hòa Bình', 'Hưng Yên', 'Hải Dương', 'Hải Phòng', 'Hậu Giang', 'Khánh Hòa', 'Kiên Giang', 'Kon Tum', 'Lai Châu', 'Long An', 'Lào Cai', 'Lâm Đồng', 'Lạng Sơn', 'Nam Định', 'Nghệ An', 'Ninh Bình', 'Ninh Thuận', 'Phú Thọ', 'Phú Yên', 'Quảng Bình', 'Quảng Nam', 'Quảng Ngãi', 'Quảng Ninh', 'Quảng Trị', 'Sóc Trăng', 'Sơn La', 'TP HCM', 'Thanh Hóa', 'Thái Bình', 'Thái Nguyên', 'Thừa Thiên Huế', 'Tiền Giang', 'Trà Vinh', 'Tuyên Quang', 'Tây Ninh', 'Vĩnh Long', 'Vĩnh Phúc', 'Yên Bái', 'Điện Biên', 'Đà Nẵng', 'Đăk Lăk', 'Đăk Nông', 'Đồng Nai', 'Đồng Tháp'"
'branch',"'Acura', 'Audi', 'BMW', 'Chevrolet', 'Daewoo', 'Ford', 'Honda', 'Hyundai', 'Isuzu', 'Kia', 'LandRover', 'Lexus', 'MG', 'Mazda', 'Mercedes Benz', 'Mini', 'Mitsubishi', 'Nissan', 'Peugeot', 'Porsche', 'Subaru', 'Suzuki', 'Toyota', 'VinFast', 'Volkswagen', 'Volvo'"
'origin',"'domestic', 'imported'"
'external_color',"'-', 'Bạc', 'Cam', 'Cát', 'Ghi', 'Hồng', 'Kem', 'Màu khác', 'Nhiều màu', 'Nâu', 'Trắng', 'Tím', 'Vàng', 'Xanh', 'Xám', 'Đen', 'Đỏ', 'Đồng'"
'internal_color',"'-', 'Bạc', 'Cam', 'Cát', 'Ghi', 'Hồng', 'Kem', 'Màu khác', 'Nhiều màu', 'Nâu', 'Trắng', 'Tím', 'Vàng', 'Xanh', 'Xám', 'Đen', 'Đỏ', 'Đồng'"
'fuels',"'-', 'diesel', 'electric', 'gasoline', 'hybrid'"
'gearbox',"'-', 'Số hỗn hợp', 'automatic', 'manual'"
'wheel_drive',"'4WD', 'AWD', 'FWD', 'RWD'"
'car_type',"'convertible', 'coupe', 'crossover', 'hatchback', 'pickup', 'sedan', 'suv', 'truck', 'van', 'wagon'"


In [61]:
# Compute statistics for the validation split and visualise them against the training statistics.
eva_stats = tfdv.generate_statistics_from_dataframe(val_df)

tfdv.visualize_statistics(
    lhs_statistics=eva_stats,
    lhs_name='VAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)

In [62]:
# Check the validation-split statistics against the schema inferred from the training split.
anomalies = tfdv.validate_statistics(eva_stats, schema)

In [63]:
# Show whatever anomalies were reported for the validation split.
tfdv.display_anomalies(anomalies)

In [64]:
# Use the held-out test split as the serving dataset: compute its statistics,
# validate them against the schema, and display the result.
serving_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_df)

serving_anomalies = tfdv.validate_statistics(statistics=serving_stats, schema=schema)

tfdv.display_anomalies(anomalies=serving_anomalies)

In [65]:
# Set the infinity-norm skew threshold for 'car_type' to 0.03 on the schema, then validate the
# training statistics with the serving statistics supplied for comparison.
car_type_feature = tfdv.get_feature(schema, 'car_type')
car_type_feature.skew_comparator.infinity_norm.threshold = 0.03

skew_anomalies = tfdv.validate_statistics(train_stats, schema, serving_statistics=serving_stats)

In [66]:
# Show the anomalies reported by the training-vs-serving skew check.
tfdv.display_anomalies(anomalies=skew_anomalies)