# Clean Data as CSV

## Load Libraries

In [1]:
import os
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import numpy as np
import seaborn as sns
import sidetable as stb
import pprint
import yaml

## Read config

In [2]:
# Load the pipeline parameters from params.yaml one directory above this notebook.
# The encoding is stated explicitly so the file is decoded the same way
# regardless of the platform's default locale encoding.
with open('../params.yaml', encoding='utf-8') as conf_file:
    config = yaml.safe_load(conf_file)

In [3]:
# Print the configured path of the joined parquet input.
pprint.pprint(config["featurize"]["clean_joined_data"])

'data/processed/clean_join_data_parquet'


In [4]:
# Print the configured path of the grouped CSV output.
pprint.pprint(config["featurize"]["clean_data"])

'data/processed/clean_grouped_data.csv'


## Load Parquet in DuckDB

In [5]:
# Open an in-memory DuckDB database; nothing is persisted to disk by this connection.
import duckdb  
conn = duckdb.connect(':memory:')

In [6]:
# Expose the joined parquet data to DuckDB as the view `evi_data`.
parquet_file = '../' + config["featurize"]["clean_joined_data"]
# The path is embedded as a SQL string literal, so double any single quotes;
# otherwise a path containing an apostrophe would break the statement.
parquet_literal = parquet_file.replace("'", "''")
# OR REPLACE keeps this cell re-runnable: a plain CREATE VIEW fails once the
# view already exists on the connection.
conn.sql(f"CREATE OR REPLACE VIEW evi_data AS SELECT * FROM read_parquet('{parquet_literal}')")

In [7]:
# Count the total number of rows exposed through the evi_data view.
conn.execute('''  
SELECT 
    COUNT(*)
FROM
   evi_data
    
''').df()

Unnamed: 0,count_star()
0,4388248


In [None]:
# Preview five rows of evi_data to inspect the available columns.
conn.execute('''  
SELECT 
    *
FROM
   evi_data
LIMIT 5   
''').df()

In [None]:
# Report the latest and earliest DISCOVER_TIME values present in evi_data.
conn.execute('''  
SELECT 
    MAX(DISCOVER_TIME),
    MIN(DISCOVER_TIME)
FROM
   evi_data
 
''').df()

## Group Data

In [None]:
# Aggregate evi_data to one row per (year, month, day, hour, IP address,
# latitude, longitude, pair name, name) combination and materialise the result
# as a pandas DataFrame. VEHICLES is COUNT(REG_NO), i.e. the number of rows in
# the group whose REG_NO is not NULL (not a distinct count).
clean_data_df = conn.execute('''  
SELECT 
    DISCOVER_YEAR,
    DISCOVER_MONTH,
    DISCOVER_DAY,
    DISCOVER_HOUR,
    IP_ADDRESS,
    LATITUDE,
    LONGITUDE,
    PAIR_NAME,
    NAME,
    COUNT(REG_NO) AS VEHICLES
FROM
   evi_data
GROUP BY DISCOVER_YEAR, DISCOVER_MONTH, DISCOVER_DAY, DISCOVER_HOUR, 
         IP_ADDRESS, LATITUDE, LONGITUDE, PAIR_NAME, NAME
''').df()

## Data QA

### Identify and remove columns that have only a single value

In [None]:
# Shape (rows, columns) before single-value columns are dropped.
print (clean_data_df.shape)

In [None]:
# Number of distinct values in each column. nunique() ignores NaN by default,
# so a column holding one value plus missing entries still counts as 1.
counts = clean_data_df.nunique(axis='index')

In [None]:
# Display the per-column distinct-value counts.
counts

In [None]:
# Turn the distinct-value counts into a two-column table:
# 'column_name' (the original column label) and 'value' (its distinct count).
df = counts.rename_axis('column_name').reset_index(name='value')

In [None]:
# Names of the columns that hold exactly one distinct value. A boolean mask
# selects them in one vectorised step instead of looping with iterrows().
single_value_columns = df.loc[df['value'] == 1, 'column_name'].tolist()

In [None]:
# Display the columns that are about to be dropped.
single_value_columns

In [None]:
# Remove the constant columns identified above; they carry no information.
clean_data_df = clean_data_df.drop(columns=single_value_columns)

In [None]:
# Shape (rows, columns) after single-value columns were dropped.
print (clean_data_df.shape)

### Handling Missing Values (min = 0) in Numeric Columns

In [None]:
# Summary statistics of the columns covered by describe().
statistics = clean_data_df.describe()
# Collect the columns whose minimum is exactly 0; a zero minimum can indicate
# that 0 was used as a placeholder for a missing value.
min_value_zero_columns = [
    column_name
    for column_name, minimum in statistics.loc['min'].items()
    if minimum == 0
]
min_value_zero_columns

### Missing Values 

In [None]:
# Total number of missing cells across the whole frame (per-column sums, then summed).
clean_data_df.isna().sum().sum()

In [None]:
# Missing-value summary via the sidetable `stb` accessor
# (presumably one row per column — confirm against the sidetable docs).
clean_data_df.stb.missing()

### Duplicate Row Check

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
clean_data_df.duplicated().sum()

## Save Clean Data

In [None]:
# Final (rows, columns) of the frame that will be written out.
clean_data_df.shape

In [None]:
# Spot-check five randomly chosen rows; no seed is set, so the sample differs per run.
clean_data_df.sample(5)

In [None]:
# Display the configured output path for the cleaned CSV.
config["featurize"]["clean_data"]

In [None]:
# Write the cleaned, grouped data to the configured CSV location, resolved
# one directory above this notebook. The header row is kept; the pandas index
# is not written.
output_csv_path = '../' + config["featurize"]["clean_data"]
clean_data_df.to_csv(output_csv_path, index=False)