# データ内の欠損値を検索する

In [1]:
import glob
import io

import pandas as pd

In [2]:
# Test fixture: a single GP/OMM-style record in which every field is JSON null.
# `dtype` is the column -> dtype mapping requested from read_json and
# `convert_dates` lists the columns to be parsed as dates.
dtype = {'CCSDS_OMM_VERS': object, 'COMMENT': object, 'CREATION_DATE': 'datetime64[ns]', 'ORIGINATOR': object,
         'OBJECT_NAME': object, 'OBJECT_ID': object, 'CENTER_NAME': object, 'REF_FRAME': object,
         'TIME_SYSTEM': object, 'MEAN_ELEMENT_THEORY': object, 'EPOCH': 'datetime64[ns]', 'MEAN_MOTION': 'float64',
         'ECCENTRICITY': 'float64', 'INCLINATION': 'float64', 'RA_OF_ASC_NODE': 'float64',
         'ARG_OF_PERICENTER': 'float64', 'MEAN_ANOMALY': 'float64', 'EPHEMERIS_TYPE': 'int8',
         'CLASSIFICATION_TYPE': object, 'NORAD_CAT_ID': 'uint32', 'ELEMENT_SET_NO': 'uint16',
         'REV_AT_EPOCH': 'uint32', 'BSTAR': 'float64', 'MEAN_MOTION_DOT': 'float64', 'MEAN_MOTION_DDOT': 'float64',
         'SEMIMAJOR_AXIS': 'float64', 'PERIOD': 'float64', 'APOAPSIS': 'float64', 'PERIAPSIS': 'float64', 'OBJECT_TYPE': object,
         'RCS_SIZE': object, 'COUNTRY_CODE': object, 'LAUNCH_DATE': 'datetime64[ns]', 'SITE': object, 'DECAY_DATE': 'datetime64[ns]',
         'FILE': 'uint64', 'GP_ID': 'uint32', 'TLE_LINE0': object, 'TLE_LINE1': object, 'TLE_LINE2': object}
convert_dates = ['EPOCH', 'CREATION_DATE', 'LAUNCH_DATE', 'DECAY_DATE']
json_null = '[{"CCSDS_OMM_VERS":null,"COMMENT":null,"CREATION_DATE":null,"ORIGINATOR":null,"OBJECT_NAME":null,"OBJECT_ID":null,"CENTER_NAME":null,"REF_FRAME":null,"TIME_SYSTEM":null,"MEAN_ELEMENT_THEORY":null,"EPOCH":null,"MEAN_MOTION":null,"ECCENTRICITY":null,"INCLINATION":null,"RA_OF_ASC_NODE":null,"ARG_OF_PERICENTER":null,"MEAN_ANOMALY":null,"EPHEMERIS_TYPE":null,"CLASSIFICATION_TYPE":null,"NORAD_CAT_ID":null,"ELEMENT_SET_NO":null,"REV_AT_EPOCH":null,"BSTAR":null,"MEAN_MOTION_DOT":null,"MEAN_MOTION_DDOT":null,"SEMIMAJOR_AXIS":null,"PERIOD":null,"APOAPSIS":null,"PERIAPSIS":null,"OBJECT_TYPE":null,"RCS_SIZE":null,"COUNTRY_CODE":null,"LAUNCH_DATE":null,"SITE":null,"DECAY_DATE":null,"FILE":null,"GP_ID":null,"TLE_LINE0":null,"TLE_LINE1":null,"TLE_LINE2":null}]'
# Passing a literal JSON string to read_json is deprecated (pandas >= 2.1), so the
# text is wrapped in a file-like object; file-like input is accepted by older
# pandas versions as well.
df_null = pd.read_json(
    io.StringIO(json_null),
    convert_dates = convert_dates,
    dtype = dtype,
    precise_float = True,
    orient = 'records',
)

In [3]:
# Column -> dtype mapping requested when reading the satcat JSON files.
dtype_satcat = {
    'INTLDES': object,
    'NORAD_CAT_ID': 'uint32',
    'OBJECT_TYPE': object,
    'SATNAME': object,
    'COUNTRY': object,
    'LAUNCH': 'datetime64[ns]',
    'SITE': object,
    'DECAY': 'datetime64[ns]',
    'PERIOD': 'float64',
    'INCLINATION': 'float64',
    'APOGEE': 'uint64',
    'PERIGEE': 'uint64',
    'COMMENT': object,
    'COMMENTCODE': 'uint8',
    'RCSVALUE': 'int32',
    'RCS_SIZE': object,
    'FILE': 'uint16',
    'LAUNCH_YEAR': 'uint16',
    'LAUNCH_NUM': 'uint16',
    'LAUNCH_PIECE': object,
    'CURRENT': object,
    'OBJECT_NAME': object,
    'OBJECT_ID': object,
    'OBJECT_NUMBER': 'uint32',
}
# Date columns are exactly the ones declared as datetime64[ns] above
# (evaluates to ['LAUNCH', 'DECAY'], in mapping order).
convert_dates_satcat = [
    name for name, kind in dtype_satcat.items() if kind == 'datetime64[ns]'
]

In [4]:
# DataFrameの中に欠損値を含む列が存在するかどうかを調べる
def testnull(df):
    for column in df.columns:
        typename = df[column].dtypes
        if df[column].isnull().any():
            print('{} is null ({})'.format(column, typename))
        if (df[column] == 'None').any():
            print('{} is string "None" ({})'.format(column, typename))
        if (df[column] == '').any():
            print('{} is zero-length string ({})'.format(column, typename))

In [5]:
# Test data: run the check on df_null, the frame built from the all-null JSON record
testnull(df_null)

CCSDS_OMM_VERS is null (object)
COMMENT is null (object)
CREATION_DATE is null (datetime64[ns])
ORIGINATOR is null (object)
OBJECT_NAME is null (object)
OBJECT_ID is null (object)
CENTER_NAME is null (object)
REF_FRAME is null (object)
TIME_SYSTEM is null (object)
MEAN_ELEMENT_THEORY is null (object)
EPOCH is null (datetime64[ns])
MEAN_MOTION is null (float64)
ECCENTRICITY is null (float64)
INCLINATION is null (float64)
RA_OF_ASC_NODE is null (float64)
ARG_OF_PERICENTER is null (float64)
MEAN_ANOMALY is null (float64)
EPHEMERIS_TYPE is null (object)
CLASSIFICATION_TYPE is null (object)
NORAD_CAT_ID is null (object)
ELEMENT_SET_NO is null (object)
REV_AT_EPOCH is null (object)
BSTAR is null (float64)
MEAN_MOTION_DOT is null (float64)
MEAN_MOTION_DDOT is null (float64)
SEMIMAJOR_AXIS is null (float64)
PERIOD is null (float64)
APOAPSIS is null (float64)
PERIAPSIS is null (float64)
OBJECT_TYPE is null (object)
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (dat

In [6]:
# The APOGEE, PERIGEE and COMMENTCODE columns contain None, so they end up
# as object dtype.
satcatfiles = sorted(glob.glob('download/satcat*.json.xz'))
for file in satcatfiles:
    # Banner line, then the file name, then the per-column report.
    print("===========================================================================", file, sep = "\n")
    df = pd.read_json(
        file,
        orient = 'records',
        dtype = dtype_satcat,
        convert_dates = convert_dates_satcat,
        precise_float = True,
    )
    testnull(df)

/work/nishida/tle/download/satcat-20201122150716.json.xz
DECAY is null (datetime64[ns])
PERIOD is null (float64)
INCLINATION is null (float64)
APOGEE is null (object)
PERIGEE is null (object)
COMMENT is null (object)
COMMENTCODE is null (object)
RCS_SIZE is null (object)


In [7]:
# Run the missing-value report on every parquet file under download/,
# in sorted file-name order.
elsetfiles = sorted(glob.glob('download/*.parquet'))
for file in elsetfiles:
    # Banner line, then the file name, then the per-column report.
    print("===========================================================================", file, sep = "\n")
    df = pd.read_parquet(file)
    testnull(df)

download/1959.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1960.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1961.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1962.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1963.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1964.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object

download/1992.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1993.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1994.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1995.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1996.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object)
DECAY_DATE is null (datetime64[ns])
download/1997.parquet
RCS_SIZE is null (object)
COUNTRY_CODE is null (object)
LAUNCH_DATE is null (datetime64[ns])
SITE is null (object