# Cleaning CDDB

In [1]:
import logging
from typing import Dict

import pandas as pd
import pandera as pa

import clean_cddb
from clean_cddb.cleaning_transforms import (
    clean_df_standardize_various_artists, clean_df_try_to_fix_encoding_errors)
from clean_cddb.utils import (display_failure_cases_summary,
                              get_check_func_descriptions)

# Notebook display settings: show up to 1000 rows, and never truncate the
# column list or the text inside a cell, so failure cases can be read in full.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

# Timestamped, process-tagged log lines at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(process)d - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the tab-separated source file, reading every column as a string so that
# no value is coerced before the validation checks run.
filepath = "../data/input/cddb.tsv"
source_df = pd.read_csv(
    filepath,
    sep="\t",
    dtype="str",
)



## Apply validation checks (pandera schema) and review failure cases

In [2]:
# Validate `source_df` against the pandera schema and collect failure cases.
#
# `failure_cases_df` is defined on BOTH outcomes so that later cells can run
# after a fresh "Restart & Run All" even when the data is already clean
# (previously a successful validation left it undefined -> NameError).
try:
    # lazy=True makes pandera gather every failure into one SchemaErrors
    # exception instead of raising on the first failed check.
    validated_df = clean_cddb.schema(source_df, lazy=True)
except pa.errors.SchemaErrors as err:
    logging.info("Validation failure. Failure cases detected.")
    logging.debug(err)
    # Post-process the raw failure cases with the project helper
    # (presumably attaches each check function's description -- see clean_cddb.utils).
    failure_cases_df = err.failure_cases.pipe(
        get_check_func_descriptions, clean_cddb.schema
    )
else:
    logging.info("Validation success. No failure cases detected.")
    # Empty placeholder with the column layout of pandera's `failure_cases`.
    # NOTE(review): the helper may add further columns on the failure path;
    # confirm that display_failure_cases_summary tolerates this empty frame.
    failure_cases_df = pd.DataFrame(
        columns=[
            'schema_context',
            'column',
            'check',
            'check_number',
            'failure_case',
            'index',
        ]
    )

2023-07-28 21:23:03,654 - 13979 - INFO - Validation failure. Failure cases detected.


`failure_cases_df`
* `failure_cases_df` lists, for each failure, the column name, the check that failed, the offending value (the failure case), and the row index of that value in the original dataframe.
* The index supports bulk operations, such as joining against or querying the original dataframe for failure cases, or rejecting every row whose index appears in the set of failure-case indices.

In [3]:
# Preview the first few failure cases, restricted to the descriptive columns.
failure_cases_df[
    ['schema_context', 'column', 'check', 'failure_case', 'index']
].head()

Unnamed: 0,schema_context,column,check,failure_case,index
0,Column,artist,not_nullable,,9030
3924,Column,title,Check for invalid symbols.,3rd. Gig,6450
3933,Column,title,Check for invalid symbols.,Transmission Of All The Good-Byes,6482
3932,Column,title,Check for invalid symbols.,"dollÃ¡r, hatalom, pornÃ³",6480
3931,Column,title,Check for invalid symbols.,Mit Liebe...!,6478


### Summary of failure cases

Here we see aggregated counts of the number of failure cases for each validation check.

In [4]:
# Count failure cases per (column, check) pair, ordered by column then check.
failure_cases_summary = (
    failure_cases_df
    .groupby(["column", "check"], as_index=False)
    .size()
    .rename(columns={'size': 'counts'})
    .sort_values(by=['column', 'check'])
)

failure_cases_summary

Unnamed: 0,column,check,counts
0,artist,Check for invalid artist values.,697
1,artist,Check for invalid symbols.,1477
2,artist,not_nullable,1
3,category,Check for invalid categories.,89
4,genre,Check for invalid genres.,1
5,id,Check that the length of 'id' is 6 characters.,477
6,id,dtype('int64'),1
7,title,Check for invalid symbols.,2456
8,title,not_nullable,8
9,tracks,Check for tracks possibly using numeric prefix,599


We also have a helper utility function that displays each check function's source code alongside its name and failure count.

In [5]:
# Report summary counts: renders a table of failure counts per (column, check)
# together with the source code of each check function.
display_failure_cases_summary(failure_cases_df)

+----------+------------------------------------------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------+
| column   | check                                          |   counts | check_source_code                                                                                                                         |
| artist   | Check for invalid artist values.               |      697 | def check_artist_is_valid(x):                                                                                                             |
|          |                                                |          |     """Check for invalid artist values."""                                                                                                |
|          |                                                |          |                                                                            

# Cleaning step

We can use the same checks from the pandera validation schema to trigger cleaning actions such as:
* do nothing / ignore the value
* transform the value; e.g., replace value with a substitute (e.g., 'N/A')
* or reject the entire record

Here we apply several cleaning functions to `source_df` via `.pipe(Callable)`.
* Each function takes a dataframe and returns a dataframe, so we can chain the cleaning operations together as shown below.
* Later, we will 
  1. compare `source_df` and `clean_df` as a before/after check
  2. re-apply our validation checks (pandera schema) to the new `clean_df` to verify that our transformations improved our data quality

In [6]:
# Build the cleaned frame from `source_df` without mutating it, so the two can
# be compared afterwards. Each transform takes a dataframe and returns a
# dataframe, which lets the steps chain through .pipe().
# (The transform functions are imported in the notebook's top import cell.)
clean_df = (
    source_df
    .pipe(clean_df_standardize_various_artists)
    # The extra positional argument names the column to repair.
    .pipe(clean_df_try_to_fix_encoding_errors, 'artist')
)

### Sample transformations

Here we can see that we transform "Various Artists" and `<various>` to "Various". We also fix mis-encoded characters, converting text such as "JÃ¶rg Hilbert & Felix Janosa" to "Jörg Hilbert & Felix Janosa".

In [7]:
# Hand-picked row labels that illustrate the transformations described above.
example_idxs = [7629, 1822, 117, 4129]

# compare() keeps only the differing rows/columns ("self" = before, "other" = after).
before_after_diff = source_df.compare(clean_df)
before_after_diff.loc[example_idxs]

Unnamed: 0_level_0,artist,artist
Unnamed: 0_level_1,self,other
7629,Various Artists,Various
1822,JÃ¶rg Hilbert & Felix Janosa,Jörg Hilbert & Felix Janosa
117,Various Artists,Various
4129,<various>,Various


In [8]:
# Reproducible random sample (fixed seed) of the rows changed by cleaning.
(
    source_df
    .compare(clean_df)
    .sample(n=20, random_state=0)
)

Unnamed: 0_level_0,artist,artist
Unnamed: 0_level_1,self,other
4934,Various Artists,Various
2285,Various Artists,Various
6764,Various (Dance),Various
8176,Various Artists,Various
7431,Various Artists,Various
6098,Various Artists,Various
406,Various Artists,Various
1756,Various Artists,Various
9019,Various Artists,Various
3807,Various Artists,Various
