# Clean Data as CSV

## Load Libraries

In [1]:
import os
import pprint

import dask.dataframe as dd
import duckdb
import numpy as np
import pandas as pd
import seaborn as sns
import sidetable as stb
import yaml

## Read config

In [2]:
# Load the project parameters; later cells read the input/output paths from `config`.
# The encoding is pinned so parsing does not depend on the platform's locale default.
with open('../params.yaml', encoding='utf-8') as conf_file:
    config = yaml.safe_load(conf_file)

In [3]:
# Show the configured location of the joined parquet data (the input read below).
pprint.pprint(config["featurize"]["clean_joined_data"])

'data/processed/clean_join_data_parquet'


In [4]:
# Show the configured location of the grouped CSV that the last cell writes.
pprint.pprint(config["featurize"]["clean_data"])

'data/processed/clean_grouped_data.csv'


## Load parquet in DuckDB

In [5]:
# Open an in-memory DuckDB database for this session.
# (`duckdb` is imported with the other libraries in the first code cell.)
conn = duckdb.connect(':memory:')

In [6]:
# Expose the joined parquet data as the DuckDB view `evi_data`.
parquet_file = '../' + config["featurize"]["clean_joined_data"]
# The path is inlined as a SQL string literal, so single quotes are doubled to
# keep a quote in the configured path from breaking the statement.
parquet_literal = parquet_file.replace("'", "''")
# OR REPLACE keeps this cell re-runnable on a live connection; a plain
# CREATE VIEW fails once `evi_data` already exists.
conn.sql(f"CREATE OR REPLACE VIEW evi_data AS SELECT * FROM read_parquet('{parquet_literal}')")

In [7]:
# Preview five rows of the `evi_data` view as a pandas DataFrame.
preview_query = '''  
SELECT 
    *
FROM
   evi_data
LIMIT 5
    
'''
conn.execute(preview_query).df()

Unnamed: 0,ID,REG_NO,ANTENNA,DISCOVER_TIME,DISCOVER_TIMESTAMP,DISCOVER_YEAR,DISCOVER_MONTH,DISCOVER_DAY,DISCOVER_HOUR,DISCOVER_MINUTE,IP_ADDRESS,LATITUDE,LONGITUDE,PAIR_NAME,NAME
0,483,F400B414A5F8EDECF7A05BDC2E524D13,2,2022-12-21 00:59:59.023,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.27,23.809342,90.421319,Kuril Bishawroad,To Bishaw Road New
1,482,A0C276312AF5E0F857DDA2001D2D879B,3,2022-12-21 00:59:58.693,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.26,23.809342,90.421319,Kuril Bishawroad,To Notun Baazar New
2,483,78260DE46F557E64458F7E25847ECE1C,0,2022-12-21 00:59:58.308,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.27,23.809342,90.421319,Kuril Bishawroad,To Bishaw Road New
3,462,D3E14E25EDF518F76A69E3562D345E69,0,2022-12-21 00:59:57.579,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.2,23.727862,90.410508,Zero Point,To Zero Point Circle New
4,341,31A7A65385B168E392B4D083D97A7046,2,2022-12-21 00:59:56.642,2022-12-21 00:59:00,2022,12,21,0,59,192.168.250.19,23.788739,90.400117,Mohakhali,To Mohakhali Circle New


## Group Data

In [33]:
# Collapse `evi_data` to one row per DISCOVER_YEAR/MONTH/DAY/HOUR, IP_ADDRESS,
# LATITUDE, LONGITUDE, PAIR_NAME and NAME combination, with a row count per group.
# NOTE(review): COUNT(REG_NO) counts every row with a non-null REG_NO, so the same
# registration read several times within a group is counted each time — confirm
# that this is what VEHICLES is meant to represent.
grouping_query = '''  
SELECT 
    DISCOVER_YEAR,
    DISCOVER_MONTH,
    DISCOVER_DAY,
    DISCOVER_HOUR,
    IP_ADDRESS,
    LATITUDE,
    LONGITUDE,
    PAIR_NAME,
    NAME,
    COUNT(REG_NO) AS VEHICLES
FROM
   evi_data
GROUP BY DISCOVER_YEAR, DISCOVER_MONTH, DISCOVER_DAY, DISCOVER_HOUR, 
         IP_ADDRESS, LATITUDE, LONGITUDE, PAIR_NAME, NAME
'''
clean_data_df = conn.execute(grouping_query).df()

## Data QA

### Identify and Remove Columns That Have Only a Single Value

In [34]:
# Frame dimensions (rows, columns) of the grouped data.
print(clean_data_df.shape)

(2464, 10)


In [35]:
# Number of distinct values in each column (computed down the rows).
counts = clean_data_df.nunique(axis=0)

In [36]:
# Display the per-column distinct-value counts.
counts

DISCOVER_YEAR       1
DISCOVER_MONTH      1
DISCOVER_DAY        7
DISCOVER_HOUR      24
IP_ADDRESS         17
LATITUDE            9
LONGITUDE           9
PAIR_NAME           9
NAME               15
VEHICLES          879
dtype: int64

In [37]:
# Two-column table: one row per source column together with its distinct-value count.
df = counts.rename_axis('column_name').reset_index(name='value')

In [38]:
# Names of the columns that hold exactly one distinct value.
# Taken straight from the `counts` Series with a boolean mask rather than
# looping over the rows of a helper frame.
single_value_columns = counts[counts == 1].index.tolist()

In [39]:
# Display the columns identified as single-valued.
single_value_columns

['DISCOVER_YEAR', 'DISCOVER_MONTH']

In [40]:
# Remove the single-valued columns. `errors="ignore"` keeps the cell safe to
# re-run: on a second execution the columns are already gone and a plain drop
# would raise KeyError.
clean_data_df = clean_data_df.drop(columns=single_value_columns, errors="ignore")

In [41]:
# Frame dimensions (rows, columns) after the single-valued columns were dropped.
print(clean_data_df.shape)

(2464, 8)


### Flag Numeric Columns Whose Minimum Is 0 (Possible Missing-Value Placeholders)

In [42]:
# Summarise the frame with describe() and list the columns whose minimum is exactly 0.
statistics = clean_data_df.describe()
has_zero_min = statistics.loc['min'] == 0
min_value_zero_columns = statistics.columns[has_zero_min].tolist()
min_value_zero_columns

['DISCOVER_HOUR']

### Missing Values 

In [43]:
# Total number of missing (NaN/None) cells across the whole frame.
clean_data_df.isna().sum().sum()

0

In [44]:
# Per-column missing-value summary via the sidetable `.stb` accessor.
clean_data_df.stb.missing()

Unnamed: 0,missing,total,percent
DISCOVER_DAY,0,2464,0.0
DISCOVER_HOUR,0,2464,0.0
IP_ADDRESS,0,2464,0.0
LATITUDE,0,2464,0.0
LONGITUDE,0,2464,0.0
PAIR_NAME,0,2464,0.0
NAME,0,2464,0.0
VEHICLES,0,2464,0.0


### Duplicate Row Check

In [45]:
# Count rows that exactly repeat an earlier row (all columns compared).
clean_data_df.duplicated().sum()

0

## Save Clean Data

In [50]:
# Dimensions (rows, columns) of the frame about to be saved.
clean_data_df.shape

(2464, 8)

In [46]:
# Show five randomly chosen rows as a spot check.
# NOTE(review): no random_state is passed, so the rows shown differ from run to run.
clean_data_df.sample(5)

Unnamed: 0,DISCOVER_DAY,DISCOVER_HOUR,IP_ADDRESS,LATITUDE,LONGITUDE,PAIR_NAME,NAME,VEHICLES
2126,16,1,192.168.250.58,23.782804,90.347026,Gabtoli Mazar Road,To Kollyanpur New,95
249,21,16,192.168.250.58,23.782804,90.347026,Gabtoli Mazar Road,To Kollyanpur New,200
282,21,14,192.168.250.26,23.809342,90.421319,Kuril Bishawroad,To Notun Baazar New,1102
899,18,8,192.168.250.2,23.727862,90.410508,Zero Point,To Zero Point Circle New,91
2354,15,15,192.168.250.2,23.727862,90.410508,Zero Point,To Zero Point Circle New,82


In [48]:
# Display the configured output path used by the save cell below.
config["featurize"]["clean_data"]

'data/processed/clean_grouped_data.csv'

In [49]:
# Write the cleaned, grouped frame to the configured CSV path; the header row is
# kept and the DataFrame index is not written.
clean_csv_path = '../' + config["featurize"]["clean_data"]
clean_data_df.to_csv(clean_csv_path, index=False)