In [None]:

# author: Michael Munz
#
# in -> 1.0-munz-data-exploration_locations
# identifying reporting errors
# setting individual col entries to NaN (nbv)
# removing non-breaking space characters
# splitting col :voie into two cols :voie_number, :voie_name
# out <- 1.1-munz-data-exploration_locations

In [13]:
# import
import pandas as pd
import numpy as np
from joblib import dump, load


In [24]:
# ----
# load
# ----
df = load( '../../data/processed/1_exploration/1.0-munz-data-exploration_locations.joblib' )

In [25]:
# identifying reporting errors
# must be handled separate

# col :nbv
# '#ERREUR' -> 1
# 2022 = 1
error = df[ df.nbv == '#ERREUR' ]
display( len(error) )

# col :nbv
# '#VALEURMULTI' -> 104
# 2023 = 54
# 2024 = 50
valeurmulti = df[ df.nbv == '#VALEURMULTI' ]
display( len(valeurmulti) )


1

104

In [26]:
# col :nbv
# strategy
# setting invalid entries to NaN (missing value)
# opt 1 - cleaning only specific error strings
df['nbv'] = df['nbv'].replace( ['#ERREUR', '#VALEURMULTI'], pd.NA )
df['nbv'] = pd.to_numeric( df.nbv, errors='coerce' )


In [17]:
# preview
display( df.nbv.unique() )
display( df.nbv.nunique() )


array([10.,  2.,  8.,  5.,  3.,  4.,  6.,  1.,  7.,  0., -1.,  9., 11.,
       12., nan])

14

In [18]:
# col :pr1
# non-breaking space characters (\xa0) -> 630
all_nonbreaking_values = set()

for col in df.select_dtypes( include='object' ).columns:
    # 1 filter values containing \xa0
    matched_values = df[ col ][ df[col].str.contains( '\xa0', na=False ) ].unique()
    
    # 2 add
    all_nonbreaking_values.update( matched_values )

# display
list_nonbreaking_values = list( all_nonbreaking_values )

display( len(list_nonbreaking_values) )
display( list_nonbreaking_values[:5] )



630

['8\xa0965', '2\xa0593', '2\xa0444', '2\xa0626', '1\xa0821']

In [19]:
# import
import unicodedata

# 1 replace \xa0 with normal space
df['pr1'] = df.pr1.apply( lambda x: x.replace( '\xa0', ' ' ) if isinstance(x, str) else x )

# 2 normalize unicode charcters
df['pr1'] = df.pr1.apply( lambda x: unicodedata.normalize('NFKC', x) if isinstance(x, str) else x )

# 3 strip leading+trailing space
df['pr1'] = df.pr1.apply( lambda x: x.strip() if isinstance(x, str) else x )



In [20]:
# 1 replace \xa0 with normal space
df['voie'] = df.voie.apply( lambda x: x.replace( '\xa0', ' ' ) if isinstance(x, str) else x )

# 2 normalize unicode charcters
df['voie'] = df.voie.apply( lambda x: unicodedata.normalize('NFKC', x) if isinstance(x, str) else x )

# 3 strip leading+trailing space
df['voie'] = df.voie.apply( lambda x: x.strip() if isinstance(x, str) else x )
 

In [21]:
# 1 replace \xa0 with normal space
df['pr'] = df.pr.apply( lambda x: x.replace( '\xa0', ' ' ) if isinstance(x, str) else x )

# 2 normalize unicode charcters
df['pr'] = df.pr.apply( lambda x: unicodedata.normalize('NFKC', x) if isinstance(x, str) else x )

# 3 strip leading+trailing space
df['pr'] = df.pr.apply( lambda x: x.strip() if isinstance(x, str) else x )
 

In [22]:
# verify
# non-breaking space
nbs_pr1 = df.pr1[ df.pr1.str.contains( '\xa0', na=False ) ].nunique()
nbs_voie = df.voie[ df.voie.str.contains( '\xa0', na=False ) ].nunique()
nbs_pr = df.pr[ df.pr.str.contains( '\xa0', na=False ) ].nunique()


display( nbs_pr1 )
display( nbs_voie )
display( nbs_pr )


0

0

0

In [23]:
# ----
# save
# ----
dump(
    df, 
    '../../data/processed/1_exploration/1.1-munz-data-exploration_locations.joblib'
)

['../../data/processed/1_exploration/1.1-munz-data-exploration_locations.joblib']