# Clean Data as CSV

## Load Libraries

In [1]:
import os
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import numpy as np
import seaborn as sns
import sidetable as stb
import pprint
import yaml

## Read config

In [2]:
# Load the project parameters from the YAML file one directory up.
with open('../params.yaml') as params_stream:
    config = yaml.safe_load(params_stream)

In [3]:
# Show the value configured under featurize.clean_joined_data.
joined_data_path = config["featurize"]["clean_joined_data"]
pprint.pprint(joined_data_path)

'data/processed/clean_join_data_parquet'


In [4]:
# Show the value configured under featurize.clean_data.
grouped_data_path = config["featurize"]["clean_data"]
pprint.pprint(grouped_data_path)

'data/processed/clean_grouped_data.csv'


## Load Parquet in DuckDB

In [5]:
# NOTE(review): this import would normally live in the imports cell at the top.
import duckdb

# Open an in-memory DuckDB database; nothing is persisted to disk.
conn = duckdb.connect(database=':memory:')

In [6]:
# Location of the joined parquet data, one directory above the working directory.
parquet_file = '../' + config["featurize"]["clean_joined_data"]

# Double any single quote so the path stays a valid SQL string literal.
parquet_literal = parquet_file.replace("'", "''")

# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE VIEW raises
# once the view already exists on the connection.
conn.sql("CREATE OR REPLACE VIEW evi_data AS SELECT * FROM read_parquet('" + parquet_literal + "')")

In [7]:
# Count every row exposed by the evi_data view.
row_count_query = '''  
SELECT 
    COUNT(*)
FROM
   evi_data
    
'''
conn.execute(row_count_query).df()

Unnamed: 0,count_star()
0,35155317


In [None]:
# Peek at five rows of evi_data with all of their columns.
preview_query = '''  
SELECT 
    *
FROM
   evi_data
LIMIT 5   
'''
conn.execute(preview_query).df()

In [9]:
# Report the largest and smallest DISCOVER_TIME values in evi_data.
time_range_query = '''  
SELECT 
    MAX(DISCOVER_TIME),
    MIN(DISCOVER_TIME)
FROM
   evi_data
 
'''
conn.execute(time_range_query).df()

Unnamed: 0,"max(""DISCOVER_TIME"")","min(""DISCOVER_TIME"")"
0,2022-09-30 23:59:59.167,2022-01-02 19:59:47.565


## Group Data

In [10]:
# Collapse evi_data to one row per combination of the grouping columns.
# VEHICLES is COUNT(REG_NO): the number of non-NULL REG_NO values in the group
# (not a distinct count).
# The ORDER BY fixes the row order; without it the order of a GROUP BY result
# is not guaranteed, so the saved CSV could differ between otherwise identical
# runs.
clean_data_df = conn.execute('''
SELECT
    DISCOVER_YEAR,
    DISCOVER_MONTH,
    DISCOVER_DAY,
    DISCOVER_HOUR,
    IP_ADDRESS,
    LATITUDE,
    LONGITUDE,
    PAIR_NAME,
    NAME,
    COUNT(REG_NO) AS VEHICLES
FROM
   evi_data
GROUP BY DISCOVER_YEAR, DISCOVER_MONTH, DISCOVER_DAY, DISCOVER_HOUR,
         IP_ADDRESS, LATITUDE, LONGITUDE, PAIR_NAME, NAME
ORDER BY DISCOVER_YEAR, DISCOVER_MONTH, DISCOVER_DAY, DISCOVER_HOUR,
         IP_ADDRESS, LATITUDE, LONGITUDE, PAIR_NAME, NAME
''').df()

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

## Data QA

### Identify and remove columns that have only a single value

In [11]:
# (rows, columns) of the grouped frame before any column is dropped.
print(clean_data_df.shape)

(85409, 10)


In [12]:
# Number of distinct values in each column (computed down the rows).
counts = clean_data_df.nunique(axis=0)

In [13]:
# Display the distinct-value count of every column.
counts

DISCOVER_YEAR        1
DISCOVER_MONTH       8
DISCOVER_DAY        31
DISCOVER_HOUR       24
IP_ADDRESS          22
LATITUDE            12
LONGITUDE           12
PAIR_NAME           12
NAME                20
VEHICLES          2430
dtype: int64

In [14]:
# Two-column frame: each column name next to its distinct-value count.
df = counts.rename_axis('column_name').reset_index(name='value')

In [15]:
# Names of the columns holding exactly one distinct value. A boolean mask
# selects them directly instead of looping over rows with iterrows().
single_value_columns = df.loc[df['value'] == 1, 'column_name'].tolist()

In [16]:
# Display the columns that are about to be dropped.
single_value_columns

['DISCOVER_YEAR']

In [17]:
# Remove the constant columns; they carry no information for later steps.
clean_data_df = clean_data_df.drop(columns=single_value_columns)

In [18]:
# (rows, columns) after dropping the single-value columns.
print(clean_data_df.shape)

(85409, 9)


### Identify Numeric Columns with a Minimum of 0 (Possible Missing-Value Placeholders)

In [19]:
# Summary statistics as produced by DataFrame.describe().
statistics = clean_data_df.describe()

# Keep the names of the columns whose 'min' statistic equals 0.
minimums = statistics.loc['min']
min_value_zero_columns = minimums[minimums == 0].index.tolist()
min_value_zero_columns

['DISCOVER_HOUR']

### Missing Values 

In [20]:
# Missing cells per column, then summed into one total for the whole frame.
missing_per_column = clean_data_df.isna().sum()
missing_per_column.sum()

0

In [21]:
# Per-column missing / total / percent table from the sidetable `.stb` accessor.
clean_data_df.stb.missing()

Unnamed: 0,missing,total,percent
DISCOVER_MONTH,0,85409,0.0
DISCOVER_DAY,0,85409,0.0
DISCOVER_HOUR,0,85409,0.0
IP_ADDRESS,0,85409,0.0
LATITUDE,0,85409,0.0
LONGITUDE,0,85409,0.0
PAIR_NAME,0,85409,0.0
NAME,0,85409,0.0
VEHICLES,0,85409,0.0


### Duplicate Row Check

In [22]:
# Flag rows that repeat an earlier row across all columns, then count them.
duplicate_mask = clean_data_df.duplicated()
duplicate_mask.sum()

0

## Save Clean Data

In [23]:
# Final (rows, columns) of the frame that will be written to disk.
clean_data_df.shape

(85409, 9)

In [None]:
# Preview five random rows; the fixed random_state keeps the preview identical
# across re-runs so the notebook output is reproducible.
clean_data_df.sample(5, random_state=0)

In [25]:
# Display the output path configured under featurize.clean_data.
config["featurize"]["clean_data"]

'data/processed/clean_grouped_data.csv'

In [26]:
# Write the grouped data, without the index column, to the configured CSV path
# resolved one directory above the working directory.
output_csv_path = '../' + config["featurize"]["clean_data"]
clean_data_df.to_csv(output_csv_path, index=False)