# Clean Joined Data As Parquet

## Load Libraries

In [1]:
import os
import pandas as pd
import dask.dataframe as dd
import numpy as np
import seaborn as sns
import sidetable as stb
import pprint
import yaml

## Read config

In [2]:
with open('../params.yaml') as conf_file:
    config = yaml.safe_load(conf_file)

In [3]:
pprint.pprint(config["data_load"]["evi_dataset_csv"])

'data/raw/EVI_DATA_15TH_DEC_2022.csv'


## Load csv in DuckDB

In [4]:
import duckdb  
conn = duckdb.connect(':memory:')

In [5]:
#read all files with a name ending in ".csv" in the folder "dir"
#SELECT * FROM 'dir/*.csv';

In [6]:
evi_dataset = '../' + config["data_load"]["evi_dataset_csv"]
conn.sql('CREATE TABLE evi_data AS SELECT * FROM read_csv_auto(\'' + evi_dataset + '\')')

In [7]:
conn.execute('PRAGMA table_info(evi_data)').df()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,ID,BIGINT,False,,False
1,1,REG_NO,VARCHAR,False,,False
2,2,VEHICLE_CLASS,BIGINT,False,,False
3,3,VEHICLE_COLOR,VARCHAR,False,,False
4,4,VEHICLE_MODEL,VARCHAR,False,,False
5,5,IP_ADDRESS,VARCHAR,False,,False
6,6,ANTENNA,BIGINT,False,,False
7,7,DISCOVER_TIME,VARCHAR,False,,False


In [8]:
evi_reader = '../' + config["data_load"]["reader_dataset_csv"]
conn.sql('CREATE TABLE reader AS SELECT * FROM read_csv_auto(\'' + evi_reader + '\')')

In [9]:
conn.execute('PRAGMA table_info(reader)').df()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,NAME,VARCHAR,False,,False
1,1,IP,VARCHAR,False,,False
2,2,LONGITUDE,DOUBLE,False,,False
3,3,LATITUDE,DOUBLE,False,,False
4,4,ID,BIGINT,False,,False


In [10]:
evi_reader_pair = '../' + config["data_load"]["reader_pair_dataset_csv"]
conn.sql('CREATE VIEW reader_pair AS SELECT * FROM read_csv_auto(\'' + evi_reader_pair + '\')')

In [11]:
conn.execute('PRAGMA table_info(reader_pair)').df()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,ID,BIGINT,False,,False
1,1,READER1,BIGINT,False,,False
2,2,READER2,BIGINT,False,,False
3,3,PAIR_NAME,VARCHAR,False,,False


In [12]:
display(conn.execute('SHOW TABLES').df())

Unnamed: 0,name
0,evi_data
1,reader
2,reader_pair


In [13]:
conn.execute('''  
SELECT 
    *
FROM
   evi_data
LIMIT 5
    
''').df()

Unnamed: 0,ID,REG_NO,VEHICLE_CLASS,VEHICLE_COLOR,VEHICLE_MODEL,IP_ADDRESS,ANTENNA,DISCOVER_TIME
0,748284733,8841CE709363F49C4DE4DAAC282B22FE,12,WHITE,TOYOTA M. CROP.,192.168.250.10,0,21-DEC-22 11.59.59.963000000 PM
1,748282990,A633B351E3D225CC555D4B25359E9835,28,MULTI COLOR,EICHER MOTORS LTD.,192.168.250.26,1,21-DEC-22 11.59.59.606000000 PM
2,748284600,9632CB632F41138B9F05629249BA1DBC,28,MULTI COLOR,ISUZU MOTOR LIMITED,192.168.250.2,0,21-DEC-22 11.59.59.598000000 PM
3,748288503,2B30E2AA753F687A4DBB95CDF8E03915,3,RED,HONDA INDIA PVT.LTD,192.168.250.50,3,21-DEC-22 11.59.58.579000000 PM
4,748282986,580F870FA7F9C3288D7C69C70FCEA34C,3,BLACK,BAJAJ AUTO LTD.INDIA,192.168.250.26,3,21-DEC-22 11.59.57.683000000 PM


## String to Timestamp conversion

In [14]:
conn.execute('''  
SELECT 
    STRPTIME(SUBSTR(DISCOVER_TIME, 1, 9) || SUBSTR(DISCOVER_TIME, 10, 16) || 
    SUBSTR(DISCOVER_TIME, 29, 3), '%d-%b-%y %H.%M.%S.%f %p') as DISCOVER_TIME,
    
    STRPTIME(SUBSTR(DISCOVER_TIME, 1, 9) || SUBSTR(DISCOVER_TIME, 10, 6) || 
    SUBSTR(DISCOVER_TIME, 29, 3), '%d-%b-%y %H.%M %p') as DISCOVER_TIMESTAMP 
FROM
    evi_data
LIMIT 5
''').df()

Unnamed: 0,DISCOVER_TIME,DISCOVER_TIMESTAMP
0,2022-12-21 23:59:59.963,2022-12-21 23:59:00
1,2022-12-21 23:59:59.606,2022-12-21 23:59:00
2,2022-12-21 23:59:59.598,2022-12-21 23:59:00
3,2022-12-21 23:59:58.579,2022-12-21 23:59:00
4,2022-12-21 23:59:57.683,2022-12-21 23:59:00


## Inner join among evi_data, reader, and reader_pair table

In [15]:
conn.sql('''  
CREATE TABLE data_table AS
SELECT 
    evi_data.reg_no,
    evi_data.antenna,
    STRPTIME(SUBSTR(DISCOVER_TIME, 1, 9) || SUBSTR(DISCOVER_TIME, 10, 16) || 
    SUBSTR(DISCOVER_TIME, 29, 3), '%d-%b-%y %H.%M.%S.%f %p') as DISCOVER_TIME,  
    STRPTIME(SUBSTR(DISCOVER_TIME, 1, 9) || SUBSTR(DISCOVER_TIME, 10, 6) || 
    SUBSTR(DISCOVER_TIME, 29, 3), '%d-%b-%y %H.%M %p') as DISCOVER_TIMESTAMP,
    evi_data.IP_ADDRESS,
    reader.id,
    reader.longitude,
    reader.latitude,
    reader.name,
    pair.reader1,
    pair.reader2,
    pair.pair_name
FROM
    reader, evi_data,
    (SELECT
    reader_pair.reader1 reader1,
    reader_pair.reader2 reader2,
    reader_pair.pair_name
    FROM
    reader_pair) pair
WHERE 
     TRIM(evi_data.ip_address) = TRIM(reader.ip)
AND
    (pair.reader1 = reader.id 
    OR pair.reader2 = reader.id)
ORDER BY evi_data.discover_time DESC    
''')

In [16]:
conn.execute('''  
SELECT 
    *
FROM
   data_table
LIMIT 5
    
''').df()

Unnamed: 0,REG_NO,ANTENNA,DISCOVER_TIME,DISCOVER_TIMESTAMP,IP_ADDRESS,ID,LONGITUDE,LATITUDE,NAME,reader1,reader2,PAIR_NAME
0,F400B414A5F8EDECF7A05BDC2E524D13,2,2022-12-21 00:59:59.023,2022-12-21 00:59:00,192.168.250.27,483,90.421319,23.809342,To Bishaw Road New,482,483,Kuril Bishawroad
1,A0C276312AF5E0F857DDA2001D2D879B,3,2022-12-21 00:59:58.693,2022-12-21 00:59:00,192.168.250.26,482,90.421319,23.809342,To Natun Bazar New,482,483,Kuril Bishawroad
2,78260DE46F557E64458F7E25847ECE1C,0,2022-12-21 00:59:58.308,2022-12-21 00:59:00,192.168.250.27,483,90.421319,23.809342,To Bishaw Road New,482,483,Kuril Bishawroad
3,D3E14E25EDF518F76A69E3562D345E69,0,2022-12-21 00:59:57.579,2022-12-21 00:59:00,192.168.250.2,462,90.410508,23.727862,To Zero Point Circle New,462,494,Zero Point
4,31A7A65385B168E392B4D083D97A7046,2,2022-12-21 00:59:56.642,2022-12-21 00:59:00,192.168.250.19,341,90.400117,23.788739,To Mohakhali Circle New,481,341,Mohakhali


In [17]:
conn.execute('PRAGMA table_info(data_table)').df()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,REG_NO,VARCHAR,False,,False
1,1,ANTENNA,BIGINT,False,,False
2,2,DISCOVER_TIME,TIMESTAMP,False,,False
3,3,DISCOVER_TIMESTAMP,TIMESTAMP,False,,False
4,4,IP_ADDRESS,VARCHAR,False,,False
5,5,ID,BIGINT,False,,False
6,6,LONGITUDE,DOUBLE,False,,False
7,7,LATITUDE,DOUBLE,False,,False
8,8,NAME,VARCHAR,False,,False
9,9,reader1,BIGINT,False,,False


## Categorical Value Check

In [18]:
conn.execute('''  
SELECT 
    NAME,
    COUNT(NAME) As Values
FROM
   data_table
GROUP BY NAME
    
''').df()

Unnamed: 0,NAME,Values
0,To Natun Bazar New,131657
1,To Kakoli New,184183
2,To Notun Baazar New,53781
3,To Mohakhali Circle New,133647
4,To Bishaw Road New,111821
5,To Aminbazar New,60351
6,To Shahbag Circle New,35571
7,To Kakrail Mosque New,14660
8,To Kollyanpur New,30688
9,To Paltan New,22595


In [19]:
conn.sql('UPDATE data_table SET NAME = \'To Notun Baazar New\' WHERE NAME=\'To Natun Bazar New\'')

In [20]:
conn.execute('''  
SELECT 
    NAME,
    COUNT(NAME) As Values
FROM
   data_table
GROUP BY NAME
    
''').df()

Unnamed: 0,NAME,Values
0,To Bishaw Road New,111821
1,To Notun Baazar New,185438
2,To Zero Point Circle New,11415
3,To Mohakhali Circle New,133647
4,To Kollyanpur New,30688
5,To Motsho Bhaban New,35002
6,To Aminbazar New,60351
7,To Kakoli New,184183
8,To Shahbag Circle New,35571
9,To New Market New,3614


In [21]:
conn.execute('''  
SELECT 
    PAIR_NAME,
    COUNT(PAIR_NAME) As Values
FROM
   data_table
GROUP BY PAIR_NAME
    
''').df()

Unnamed: 0,PAIR_NAME,Values
0,Kuril Bishawroad,243478
1,Gabtoli Mazar Road,91039
2,Mohakhali,258344
3,Shahbag Circle,70573
4,Zero Point,34010
5,Kakrail,14849
6,Gulshan 2,113267
7,Buriganga Bridge,2964
8,Science Lab,27117


## New Features (Year, Month, Day, etc.)

In [22]:
conn.sql('''
CREATE TABLE clean_join_data AS
SELECT 
    ID,
    REG_NO,
    ANTENNA,
    DISCOVER_TIME,
    DISCOVER_TIMESTAMP,
    YEAR(DISCOVER_TIMESTAMP) as DISCOVER_YEAR, 
    MONTH(DISCOVER_TIMESTAMP) as DISCOVER_MONTH,
    DAY(DISCOVER_TIMESTAMP) as DISCOVER_DAY, 
    HOUR(DISCOVER_TIMESTAMP) as DISCOVER_HOUR,
    MINUTE(DISCOVER_TIMESTAMP) as DISCOVER_MINUTE,
    IP_ADDRESS,
    LATITUDE,
    LONGITUDE,
    PAIR_NAME,
    NAME
FROM
   data_table
''')

In [23]:
conn.execute('''  
SELECT 
    *
FROM
   clean_join_data
LIMIT 5
    
''').df()

Unnamed: 0,ID,REG_NO,ANTENNA,DISCOVER_TIME,DISCOVER_TIMESTAMP,DISCOVER_YEAR,DISCOVER_MONTH,DISCOVER_DAY,DISCOVER_HOUR,DISCOVER_MINUTE,IP_ADDRESS,LATITUDE,LONGITUDE,PAIR_NAME,NAME
0,483,F400B414A5F8EDECF7A05BDC2E524D13,2,2022-12-21 00:59:59.023,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.27,23.809342,90.421319,Kuril Bishawroad,To Bishaw Road New
1,482,A0C276312AF5E0F857DDA2001D2D879B,3,2022-12-21 00:59:58.693,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.26,23.809342,90.421319,Kuril Bishawroad,To Notun Baazar New
2,483,78260DE46F557E64458F7E25847ECE1C,0,2022-12-21 00:59:58.308,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.27,23.809342,90.421319,Kuril Bishawroad,To Bishaw Road New
3,462,D3E14E25EDF518F76A69E3562D345E69,0,2022-12-21 00:59:57.579,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.2,23.727862,90.410508,Zero Point,To Zero Point Circle New
4,341,31A7A65385B168E392B4D083D97A7046,2,2022-12-21 00:59:56.642,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.19,23.788739,90.400117,Mohakhali,To Mohakhali Circle New


## Save Clean Joined Data as parquet format

In [24]:
parquet_file = '../' + config["featurize"]["clean_joined_data"]
conn.sql('COPY clean_join_data TO \'' + parquet_file + '\'(FORMAT PARQUET)')

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))