## Sanity Check

##### The section show class involve in processing

In [1]:
from pnet.core import core
import pandas as pd
import duckdb

# customize class that store functionality that assist generate basis data handling
_util = core.Util()
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [2]:
pd_result = duckdb.read_parquet('/code/data/ingestion/final/*.parquet').to_df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

##### Display detailed information about the DataFrame, including column names, non-null counts, and data types.

##### This method provides a concise summary of the DataFrame's structure, helping to quickly understand 
##### the dataset's composition, number of entries, and the types of data in each column.

In [3]:
pd_result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 26 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Unnamed: 0              1296675 non-null  object 
 1   trans_date_trans_time   1296675 non-null  object 
 2   cc_num                  1296675 non-null  object 
 3   merchant                1296675 non-null  object 
 4   category                1296675 non-null  object 
 5   amt                     1296675 non-null  float64
 6   first                   1296675 non-null  object 
 7   last                    1296675 non-null  object 
 8   gender                  1296675 non-null  object 
 9   street                  1296675 non-null  object 
 10  city                    1296675 non-null  object 
 11  state                   1296675 non-null  object 
 12  zip                     1296675 non-null  object 
 13  lat                     1296675 non-null  object 
 14  lo

##### Retrieve rows in a DataFrame where at least one column contains a NaN (missing) value.
##### Returns a subset of the DataFrame containing only rows with missing data.

In [4]:
pd_result.loc[pd_result.isna().any(axis="columns")]

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,job,dob,trans_num,merch_lat,merch_long,is_fraud,merch_zipcode,merch_last_update_time,merch_eff_time,cc_bic


##### Checking null values

In [5]:
pd_result.isnull().sum()

Unnamed: 0                0
trans_date_trans_time     0
cc_num                    0
merchant                  0
category                  0
amt                       0
first                     0
last                      0
gender                    0
street                    0
city                      0
state                     0
zip                       0
lat                       0
long                      0
city_pop                  0
job                       0
dob                       0
trans_num                 0
merch_lat                 0
merch_long                0
is_fraud                  0
merch_zipcode             0
merch_last_update_time    0
merch_eff_time            0
cc_bic                    0
dtype: int64

##### Generates descriptive statistical summary of the DataFrame.

##### Returns:
#####  - DataFrame: A statistical summary including count, mean, standard deviation, minimum, 25th percentile, median, 75th percentile, and maximum for each numeric column.

In [6]:
pd_result.describe(include='all')

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,job,dob,trans_num,merch_lat,merch_long,is_fraud,merch_zipcode,merch_last_update_time,merch_eff_time,cc_bic
count,1296675.0,1296675,1296675,1296675,1296675,1296675.0,1296675,1296675.0,1296675,1296675,...,1296675,1296675,1296675,1296675.0,1296675.0,1296675.0,1296675.0,1296675,1296675,1296675.0
unique,1296675.0,1274791,983,693,14,,352,482.0,2,983,...,494,968,1296675,1247805.0,1275745.0,2.0,28337.0,1274823,1274823,7.0
top,1162223.0,2020-06-02 12:47:07+08:00,451282#########1773,Kilback LLC,gas_transport,,Christopher,,F,864 Reynolds Plains,...,Film/video editor,1977-03-23,000088fe170f044d2ed28c570282c7a4,40.456305,-87.116414,0.0,,2012-04-22 16:02:01+08:00,2012-04-22 16:02:01+08:00,
freq,1.0,4,3123,4403,131659,,26669,216717.0,709863,3123,...,9779,5636,1,4.0,4.0,1289169.0,195973.0,4,4,432980.0
mean,,,,,,70.35104,,,,,...,,,,,,,,,,
std,,,,,,160.31604,,,,,...,,,,,,,,,,
min,,,,,,1.0,,,,,...,,,,,,,,,,
25%,,,,,,9.65,,,,,...,,,,,,,,,,
50%,,,,,,47.52,,,,,...,,,,,,,,,,
75%,,,,,,83.14,,,,,...,,,,,,,,,,
