# Data Exploration

This notebook provides an exploration of our two data sources.

In [8]:
# Core packages


# External packages
import numpy as np
import pandas as pd

## Data Cleaning
Overall, the dataset is fairly clean, as the provider already performs various data cleaning operations to provide consistent trade data. However, 23,884 records in the `sitc_product_code` column have the non-numeric value `ZZ`. We replace these with `np.nan` so that the column can be made numeric, then save the data in `parquet` format, which enables faster loading and reduces storage size from 1.8 GB to 333 MB.

In [10]:
# Load the tab-separated trade file, pinning every column's dtype up front.
df = pd.read_csv(
    '/home/ols/Downloads/country_partner_sitcproduct2digit_year.tab',
    sep='\t',
    dtype={
        # Identifiers, year and trade values are read as 64-bit integers.
        **dict.fromkeys(
            ['location_id', 'partner_id', 'product_id', 'year',
             'export_value', 'import_value'],
            np.int64,
        ),
        # The two index columns are read as 64-bit floats.
        **dict.fromkeys(['sitc_eci', 'sitc_coi'], np.float64),
        # Code columns stay as Python objects (strings) at load time.
        **dict.fromkeys(
            ['location_code', 'partner_code', 'sitc_product_code'],
            object,
        ),
    },
)

In [26]:
# List the distinct product codes present in the column.
pd.unique(df['sitc_product_code'])

array([33., 78., 74., 93.,  3., 66.,  4., 27., 52., 65., 69., 71., 72.,
       77., 82., 84., 87., 89.,  9., 53., 62., 64., 67., 75.,  5., 85.,
       11., 51., 76., 81.,  7., 55., 56., 59., 79., 54., 88., 26., 68.,
       83., 58.,  1.,  2., 29., 42., 63., 34.,  8., nan,  0.,  6., 12.,
       22., 23., 24., 28., 41., 43., 57., 61., 73., 94., 97., 21., 25.,
       95., 35., 91., 32., 96.])

In [25]:
# Show the entries whose product code is the non-numeric value 'ZZ'.
df['sitc_product_code'][df['sitc_product_code'] == 'ZZ']

  result = method(y)


Series([], Name: sitc_product_code, dtype: float64)

In [14]:
# Overwrite every 'ZZ' product code with NaN, in place.
df.loc[df['sitc_product_code'].eq('ZZ'), 'sitc_product_code'] = np.nan

In [29]:
# Count how many product codes are missing.
pd.isna(df['sitc_product_code']).sum()

23884

In [22]:
# Convert the product codes to a numeric dtype. With the default error
# handling, pd.to_numeric raises if any non-numeric string is still present.
df['sitc_product_code'] = pd.to_numeric(df['sitc_product_code'])

In [23]:
# Print a summary of the frame: index range, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29524294 entries, 0 to 29524293
Data columns (total 11 columns):
location_id          int64
partner_id           int64
product_id           int64
year                 int64
export_value         int64
import_value         int64
sitc_eci             float64
sitc_coi             float64
location_code        object
partner_code         object
sitc_product_code    float64
dtypes: float64(3), int64(6), object(2)
memory usage: 2.4+ GB


In [24]:
# Write the frame to a parquet file.
# NOTE(review): the destination is an absolute, user-specific path — consider
# a configurable data directory so the notebook runs on other machines.
df.to_parquet('/home/ols/computers/dend/comtrade.parquet')