This notebook retraces the data treatment steps that were performed to refine the original trade data from the UN COMTRADE database. This notebook only serves as documentation and odes not have to be run by users.

In [1]:
import pandas as pd
import sqlite3
from tqdm import tqdm
import numpy as np

In [2]:
conn = sqlite3.connect('C://Users/11max/PycharmProjects/Regioinvent/new_trade_data.db')
conn2 = sqlite3.connect('C://Users/11max/PycharmProjects/Regioinvent/trade_data_treated.db')

In [3]:
data = pd.read_sql('SELECT * FROM [Import data]', conn2)

In [4]:
# drop data where both qty and qtyAlt are N/A in units
data = data.drop(data.loc[data.qtyUnitCode == -1].loc[data.altQtyUnitCode == -1].index)
data.loc[:,'usedqty'] = [data.qty[i] if data.qtyUnitCode[i] != -1 else data.altQty[i] for i in data.index]
data.loc[:,'usedqtyUnitCode'] = [data.qtyUnitCode[i] if data.qtyUnitCode[i] != -1 else data.altQtyUnitCode[i] for i in data.index]

In [5]:
cmds = set(data.cmdCode)
unit_issues = []

for cmd in tqdm(cmds, leave=True):
    df = data.loc[data.cmdCode == cmd]
    if len(set(df.usedqtyUnitCode)) != 1:
        unit_issues.append(cmd)

100%|████████████████████████████████████████████████████████████████████████████████| 876/876 [07:10<00:00,  2.03it/s]


In [6]:
unit_issues

['250100',
 '293090',
 '441194',
 '281420',
 '720211',
 '840140',
 '290250',
 '284011',
 '860110',
 '842139',
 '853223',
 '271129',
 '1801',
 '852842',
 '390110',
 '760529',
 '7005',
 '7802',
 '850490',
 '280120',
 '310221',
 '290220',
 '852852',
 '292145',
 '283630',
 '2504',
 '760120',
 '380899',
 '271114',
 '2519',
 '854150',
 '290420',
 '291619',
 '340213',
 '440729',
 '260700',
 '310551',
 '870130',
 '680299',
 '854110',
 '321519',
 '284161',
 '710813',
 '291512',
 '250700',
 '701690',
 '850690',
 '271012',
 '291829',
 '940690',
 '280490',
 '846781',
 '060220',
 '290376',
 '120999',
 '480810',
 '291422',
 '8415',
 '310260',
 '270400',
 '251710',
 '730459',
 '252910',
 '291211',
 '841891',
 '854290',
 '853310',
 '850650',
 '292519',
 '392590',
 '844399',
 '290410',
 '271112',
 '8001',
 '282590',
 '190190',
 '360200',
 '690510',
 '290531',
 '390210',
 '500720',
 '282510',
 '310311',
 '841620',
 '271290',
 '761010',
 '690290',
 '100590',
 '290339',
 '080119',
 '842121',
 '340220',
 '

In [7]:
unit_codes = dict(zip(data.qtyUnitCode, data.qtyUnitAbbr))
unit_codes.update(dict(zip(data.altQtyUnitCode, data.altQtyUnitAbbr)))

### Manage units

In [9]:
# change unit 21 "1000 KG" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 21, 'usedqty'] *= 1000
data.loc[data.usedqtyUnitCode == 21, 'usedqtyUnitCode'] = 8
# change unit 9 "1000u" to unit 5 "u"
data.loc[data.usedqtyUnitCode == 9, 'usedqty'] *= 1000
data.loc[data.usedqtyUnitCode == 9, 'usedqtyUnitCode'] = 5
# change unit 17 "1000 m3" to unit 12 "m3"
data.loc[data.usedqtyUnitCode == 17, 'usedqty'] *= 1000
data.loc[data.usedqtyUnitCode == 17, 'usedqtyUnitCode'] = 12
# change unit 7 "l" to unit 12 "m3"
data.loc[data.usedqtyUnitCode == 7, 'usedqty'] /= 1000
data.loc[data.usedqtyUnitCode == 7, 'usedqtyUnitCode'] = 12
# change unit 40 "GT" (Goss tonnage) to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 40, 'usedqtyUnitCode'] = 8
# delete data with unit 4 "m" -> only bad entries
data = data.drop(data.loc[data.usedqtyUnitCode == 4].index)
# delete data with unit 10 "U (jeu/pack)" -> don't know what to do with that
data = data.drop(data.loc[data.usedqtyUnitCode == 10].index)
# change unit 15 "g" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 15, 'usedqty'] /= 1000
data.loc[data.usedqtyUnitCode == 15, 'usedqtyUnitCode'] = 8
# change unit 19 "BBL" (barrel) to unit 12 "m3"
data.loc[data.usedqtyUnitCode == 19, 'usedqty'] *= 0.159
data.loc[data.usedqtyUnitCode == 19, 'usedqtyUnitCode'] = 12
# change unit 6 "2u" to unit 5 "u"
data.loc[data.usedqtyUnitCode == 6, 'usedqty'] *= 2
data.loc[data.usedqtyUnitCode == 6, 'usedqtyUnitCode'] = 5
# change unit 11 "12u" to unit 5 "u"
data.loc[data.usedqtyUnitCode == 11, 'usedqty'] *= 12
data.loc[data.usedqtyUnitCode == 11, 'usedqtyUnitCode'] = 5
# change unit 24 "head" to unit 5 "u"
data.loc[data.usedqtyUnitCode == 24, 'usedqtyUnitCode'] = 5
# change unit 27 "kg P2O5" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 27, 'usedqtyUnitCode'] = 8
# change unit 28 "kg H2O2" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 28, 'usedqtyUnitCode'] = 8
# change unit 29 "kg met.am." to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 29, 'usedqtyUnitCode'] = 8
# change unit 30 "kg N" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 30, 'usedqtyUnitCode'] = 8
# change unit 31 "kg KOH" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 31, 'usedqtyUnitCode'] = 8
# change unit 32 "kg K2O" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 32, 'usedqtyUnitCode'] = 8
# change unit 33 "kg NaOH" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 33, 'usedqtyUnitCode'] = 8
# change unit 34 "kg 90% sdt" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 34, 'usedqtyUnitCode'] = 8
# change unit 35 "kg U" to unit 8 "kg"
data.loc[data.usedqtyUnitCode == 35, 'usedqtyUnitCode'] = 8

In [10]:
# delete specific data point of glass measured in kg instead of m2
data = data.drop(data.loc[data.cmdCode == '7005'].loc[data.usedqtyUnitCode == 8].index)
# delete specific data point of natural gas liquefied measured in TJ
data = data.drop(data.loc[data.cmdCode == '271111'].loc[data.usedqtyUnitCode == 18].index)

In [11]:
# convert TJ of natural gas gaseous to kg (1000000 to MJ and 47kg/m3 for NG LHV)
df = data.loc[data.cmdCode == '271121'].copy()
df.loc[:,'usedqty'] *= 1000000/47
df.loc[:,'usedqtyUnitCode'] = 8
data = data.drop(df.index)
data = pd.concat([data, df.dropna(subset=['usedqty'])])

In [12]:
# change unit 41 "ce/el" into unit 8 "kg" using netWgt or calculated average weight

ceel_users = set(data.loc[data.usedqtyUnitCode == 41, 'cmdCode'])

for cmd in ceel_users:
    df = data.loc[data.cmdCode == cmd].copy()
    # if there are no NaNs in netWgt, just use netWgt
    no_nans = df.loc[df.usedqtyUnitCode == 41].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
    if not no_nans.empty:
        df.loc[no_nans, 'usedqty'] = df.loc[no_nans, 'netWgt']
        df.loc[no_nans, 'usedqtyUnitCode'] = 8
    # otherwise use average weight
    nans = pd.concat([df.loc[df.usedqtyUnitCode == 41].loc[df.netWgt.isna()], 
                      df.loc[df.usedqtyUnitCode == 41].loc[df.netWgt == 0]]).index
    if not nans.empty:
        average_weight = (df.loc[df.usedqtyUnitCode == 41, 'usedqty'] / 
                          df.loc[df.usedqtyUnitCode == 41, 'netWgt'] * 
                          df.loc[df.usedqtyUnitCode == 41, 'usedqty'] / 
                          df.loc[df.usedqtyUnitCode == 41, 'usedqty'].replace(np.inf, np.nan).sum()).replace(np.inf, np.nan).dropna().sum()
        df.loc[nans, 'usedqty'] *= average_weight
        df.loc[nans, 'usedqtyUnitCode'] = 8

    data = data.drop(df.index)
    # drop if netWgt was not defined neither
    data = pd.concat([data, df.dropna(subset=['usedqty'])])

In [13]:
GRT_users = set(data.loc[data.usedqtyUnitCode == 39, 'cmdCode'])

for cmd in GRT_users:
    df = data.loc[data.cmdCode == cmd].copy()
    df.loc[df.usedqtyUnitCode == 39, 'usedqty'] = df.loc[df.usedqtyUnitCode == 39, 'netWgt']
    df.loc[df.usedqtyUnitCode == 39, 'usedqtyUnitCode'] = 8
    data = data.drop(df.index)
    # drop if netWgt was not defined neither
    data = pd.concat([data, df.dropna(subset=['usedqty'])])

ct_l_users = set(data.loc[data.usedqtyUnitCode == 36, 'cmdCode'])

for cmd in ct_l_users:
    df = data.loc[data.cmdCode == cmd].copy()
    df.loc[df.usedqtyUnitCode == 36, 'usedqty'] = df.loc[df.usedqtyUnitCode == 36, 'netWgt']
    df.loc[df.usedqtyUnitCode == 36, 'usedqtyUnitCode'] = 8
    data = data.drop(df.index)
    # drop if netWgt was not defined neither
    data = pd.concat([data, df.dropna(subset=['usedqty'])])

gi_FS_users = set(data.loc[data.usedqtyUnitCode == 38, 'cmdCode'])

for cmd in gi_FS_users:
    df = data.loc[data.cmdCode == cmd].copy()
    non_zero = df.loc[df.usedqtyUnitCode == 38].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
    average_weight = (df.loc[non_zero, 'usedqty'] / 
                      df.loc[non_zero, 'netWgt'] * 
                      df.loc[non_zero, 'usedqty'] / 
                      df.loc[non_zero, 'usedqty'].sum()).sum()
    if non_zero.empty:
        non_zero = df.loc[df.altQtyUnitCode == 38].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
        average_weight = (df.loc[non_zero, 'altQty'] / 
                          df.loc[non_zero, 'netWgt'] * 
                          df.loc[non_zero, 'altQty'] / 
                          df.loc[non_zero, 'altQty'].sum()).sum()
    
    no_nans = df.loc[df.usedqtyUnitCode == 38].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
    if not no_nans.empty:
        df.loc[no_nans, 'usedqty'] = df.loc[no_nans, 'netWgt']
        df.loc[no_nans, 'usedqtyUnitCode'] = 8
    nans = pd.concat([df.loc[df.usedqtyUnitCode == 38].loc[df.netWgt.isna()], 
                      df.loc[df.usedqtyUnitCode == 38].loc[df.netWgt == 0]]).index
    df.loc[nans, 'usedqty'] /= average_weight
    df.loc[nans, 'usedqtyUnitCode'] = 8
    data = data.drop(df.index)
    # drop if netWgt was not defined neither
    data = pd.concat([data, df.dropna(subset=['usedqty'])])

In [14]:
square_meter_users = set(data.loc[data.usedqtyUnitCode == 2, 'cmdCode'])
del_square_meters = []
for cmd in square_meter_users:
    df = data.loc[data.cmdCode == cmd].copy()
    # if less than 10 data points -> probably typo
    if len(df[df.usedqtyUnitCode == 2]) < 10:
        del_square_meters.append(cmd)
data = data.drop([i for i in data.index if data.loc[i,'cmdCode'] in del_square_meters and data.loc[i,'usedqtyUnitCode'] == 2])

square_meter_weights = {
    '500720': 0.1, # kg/m2 of silk
    '531010': 0.5, # kg/m2 of jute
    '531100': 0.2, # kg/m2 of paper yarn
    '680919': 1.5, # kg/m2 of plaster
    }

for cmd in square_meter_weights.keys():
    df = data.loc[data.cmdCode == cmd].copy()
    df.loc[df.usedqtyUnitCode == 2, 'usedqty'] *= square_meter_weights[cmd]
    df.loc[df.usedqtyUnitCode == 2, 'usedqtyUnitCode'] = 8
    data = data.drop(df.index)
    # drop if netWgt was not defined neither
    data = pd.concat([data, df.dropna(subset=['usedqty'])])

In [15]:
no_weight = []

for cmd in unit_issues:
    df = data.loc[data.cmdCode == cmd].copy()
    if set(df.loc[:, 'usedqtyUnitCode']) == set([5, 8]):
        non_zero = df.loc[df.usedqtyUnitCode == 5].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
        average_weight = (df.loc[non_zero, 'usedqty'] / 
                          df.loc[non_zero, 'netWgt'] * 
                          df.loc[non_zero, 'usedqty'] / 
                          df.loc[non_zero, 'usedqty'].sum()).sum()
        if non_zero.empty:
            non_zero = df.loc[df.altQtyUnitCode == 5].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
            average_weight = (df.loc[non_zero, 'altQty'] / 
                              df.loc[non_zero, 'netWgt'] * 
                              df.loc[non_zero, 'altQty'] / 
                              df.loc[non_zero, 'altQty'].sum()).sum()
        if non_zero.empty:
                no_weight.append(cmd)
                continue

        using_kg = len(df[df.usedqtyUnitCode == 8])
        using_u = len(df[df.usedqtyUnitCode == 5])
        if using_kg / (using_kg + using_u) < 0.1:
            df.loc[df.usedqtyUnitCode == 8, 'usedqty'] *= average_weight
            df.loc[df.usedqtyUnitCode == 8, 'usedqtyUnitCode'] = 5

        else:
            no_nans = df.loc[df.usedqtyUnitCode == 5].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
            if not no_nans.empty:
                df.loc[no_nans, 'usedqty'] = df.loc[no_nans, 'netWgt']
                df.loc[no_nans, 'usedqtyUnitCode'] = 8
            nans = pd.concat([df.loc[df.usedqtyUnitCode == 5].loc[df.netWgt.isna()], 
                              df.loc[df.usedqtyUnitCode == 5].loc[df.netWgt == 0]]).index
            df.loc[nans, 'usedqty'] /= average_weight
            df.loc[nans, 'usedqtyUnitCode'] = 8

        data = data.drop(df.index)
        # drop if netWgt was not defined neither
        data = pd.concat([data, df.dropna(subset=['usedqty'])])

In [16]:
# drop those which do not have weight information at all
data = data.drop([i for i in data.index if data.loc[i,'cmdCode'] in no_weight and data.loc[i,'usedqtyUnitCode'] == 5])

In [17]:
cubic_meter_guys = set(data.loc[data.usedqtyUnitCode == 12, 'cmdCode'])

for cmd in cubic_meter_guys:
    df = data.loc[data.cmdCode == cmd].copy()
    using_kg = len(df[df.usedqtyUnitCode == 8])/len(df)
    using_m3 = len(df[df.usedqtyUnitCode == 12])/len(df)
    using_m2 = len(df[df.usedqtyUnitCode == 2])/len(df)
    using_u = len(df[df.usedqtyUnitCode == 5])/len(df)
    if using_m2 < 0.01:
        # consider it's a typo and should be m3
        df.loc[df.usedqtyUnitCode == 2, 'usedqtyUnitCode'] = 12
    if using_u < 0.01:
        non_zero = df.loc[df.usedqtyUnitCode == 5].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
        average_weight = (df.loc[non_zero, 'usedqty'] / 
                          df.loc[non_zero, 'netWgt'] * 
                          df.loc[non_zero, 'usedqty'] / 
                          df.loc[non_zero, 'usedqty'].sum()).sum()
        if non_zero.empty:
            non_zero = df.loc[df.altQtyUnitCode == 5].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
            average_weight = (df.loc[non_zero, 'altQty'] / 
                              df.loc[non_zero, 'netWgt'] * 
                              df.loc[non_zero, 'altQty'] / 
                              df.loc[non_zero, 'altQty'].sum()).sum()
        no_nans = df.loc[df.usedqtyUnitCode == 5].loc[df.netWgt != 0].loc[~df.netWgt.isna()].index
        if not no_nans.empty:
            df.loc[no_nans, 'usedqty'] = df.loc[no_nans, 'netWgt']
            df.loc[no_nans, 'usedqtyUnitCode'] = 8
        nans = pd.concat([df.loc[df.usedqtyUnitCode == 5].loc[df.netWgt.isna()], 
                          df.loc[df.usedqtyUnitCode == 5].loc[df.netWgt == 0]]).index
        df.loc[nans, 'usedqty'] /= average_weight
        df.loc[nans, 'usedqtyUnitCode'] = 8
    if using_m3 < 0.01:
        df.loc[df.usedqtyUnitCode == 12, 'usedqty'] = 0
        if using_kg > 0.5:
            df.loc[df.usedqtyUnitCode == 12, 'usedqtyUnitCode'] = 8
        if using_u > 0.5:
            df.loc[df.usedqtyUnitCode == 12, 'usedqtyUnitCode'] = 5

    data = data.drop(df.index)
    # drop if netWgt was not defined neither
    data = pd.concat([data, df.dropna(subset=['usedqty'])])

cubic_meter_guys = set(data.loc[data.usedqtyUnitCode == 12, 'cmdCode'])

densities = {
    '280410':0.08375, # kg/m3 density of hydrogen
    '4412':700, # kg/m3 density of wood
    '280440':1.429, # kg/m3 density of oxygen
    '280429':0.166, # kg/m3 density of helium
    '4403':700, # kg/m3 density of wood
    '440325':700, # kg/m3 density of wood
    '440341':700, # kg/m3 density of wood
    '4407':700, # kg/m3 density of wood
    '440729':700, # kg/m3 density of wood
    '2804':0.08375, # kg/m3 density of hydrogen
    '440719':700, # kg/m3 density of wood
    '220299':1000, # kg/m3 density of beverage
    '280430':1.2506, # kg/m3 density of nitrogen
    '271129':1.879, # kg/m3 density of propane (proxy for petroleum gases)
    '280421':1.784, # kg/m3 density of argon
    '270500':0.58, # kg/m3 density of coal gas
    '271112':0.498, # kg/m3 density of propane
}

for cmd in cubic_meter_guys:
    df = data[data.cmdCode == cmd]
    if set(df.usedqtyUnitCode) == set([8, 12]):
        using_kg = len(df[df.usedqtyUnitCode == 8])/len(df)
        using_m3 = len(df[df.usedqtyUnitCode == 12])/len(df)
        if using_kg < 0.5:
            df.loc[df.usedqtyUnitCode == 8, 'usedqty'] /= densities[cmd]
            df.loc[df.usedqtyUnitCode == 8, 'usedqtyUnitCode'] = 12
        else:
            df.loc[df.usedqtyUnitCode == 12, 'usedqty'] *= densities[cmd]
            df.loc[df.usedqtyUnitCode == 12, 'usedqtyUnitCode'] = 8

        data = data.drop(df.index)
        # drop if netWgt was not defined neither
        data = pd.concat([data, df.dropna(subset=['usedqty'])])

## Straight up mistakes from import data

In [18]:
# they probably mutiplied by 1000, sort of matches with ither years
data.loc[data.loc[data.cmdCode == '482390'].loc[data.refYear == 2021].loc[data.reporterISO == 'MEX'].loc[
data.partnerISO.isin(['CHN','W00'])].index, 'usedqty'] /= 1000

In [19]:
# ratio comes from exchange of MOZ and ZAF in 2022, that's the only qty defined, altqty are obviously wrong
ratio = 2.475609e+11/2.764464e+07
data.loc[data.loc[data.cmdCode == '070310'].loc[data.reporterISO == 'MOZ'].index, 'usedqty'] /= ratio

In [20]:
# straight up inverted "u" with "kg", just delete whole year
data = data.drop(data.loc[data.cmdCode == '854231'].loc[data.reporterISO == 'THA'].loc[data.refYear == 2022].index)

In [21]:
# reported quantity is obviously wrong, use altQty
data.loc[data.loc[data.cmdCode == '290331'].loc[data.reporterISO == 'GHA'].loc[data.refYear == 2023].loc[data.partnerISO == 'IND'].index, 'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '290331'].loc[data.reporterISO == 'GHA'].loc[data.refYear == 2023].loc[data.partnerISO == 'IND'].index, 'altQty'])

In [22]:
# straight up delusional values for almost all years of MOZ paper trade
data = data.drop(data.loc[data.cmdCode == '854231'].loc[data.reporterISO == 'MOZ'].loc[data.refYear != 2023].index)

In [5]:
data = data.drop(data.loc[data.cmdCode == '481910'].loc[data.reporterISO == 'MOZ'].loc[data.refYear != 2023].index)

In [23]:
# data from the US was at ~e8 in 2019 then jump to ~e10 next years, 100 times above every other country
data = data.drop(data.loc[data.cmdCode == '280440'].loc[data.reporterISO == 'USA'].loc[data.refYear != 2019].index)

In [24]:
# obviously wrong
data = data.drop(data.loc[data.cmdCode.isin(['854129','854110'])].loc[data.reporterISO == 'THA'].loc[data.refYear == 2022].index)
data = data.drop(data.loc[data.cmdCode == '850450'].loc[data.reporterISO == 'THA'].loc[data.refYear != 2019].index)

In [25]:
# average weight did not work for this product, assume a car weights about 1 tonne
data.loc[data.loc[data.cmdCode == '870331'].loc[data.qty == 0].loc[data.altQtyUnitCode == 8].index,'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '870331'].loc[data.qty == 0].loc[data.altQtyUnitCode == 8].index,'altQty'] / 1000)

In [26]:
# obviously wrong
data = data.drop(data.loc[data.cmdCode == '4407'].loc[data.reporterISO == 'YEM'].loc[data.refYear != 2019].index)

In [27]:
# the data from ZAF would imply a density of wood of less than 1kg/m3 -> all wrong
data = data.drop(data.loc[data.cmdCode == '440729'].loc[data.reporterISO == 'ZAF'].loc[data.refYear.isin([2020, 2022, 2023])].index)
data = data.drop(data.loc[data.cmdCode == '4407'].loc[data.reporterISO == 'ZAF'].index)

In [28]:
# US data in m3 on Nitrogen is completely wrong -> use netWgt/1.2 (density of nitrogen)
data.loc[data.loc[data.cmdCode == '280430'].loc[data.reporterISO == 'USA'].index, 'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '280430'].loc[data.reporterISO == 'USA'].index, 'netWgt']) / 1.2

In [30]:
# all data after 2021 look suspicious
data = data.drop(data.loc[data.cmdCode == '810720'].loc[data.refYear.isin([2022, 2023])].index)

In [31]:
# US data in m3 on Hydrogen is mostly wrong -> use netWgt/0.08 (density of hydrogen)
data.loc[data.loc[data.cmdCode == '2804'].loc[data.reporterISO == 'USA'].index, 'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '2804'].loc[data.reporterISO == 'USA'].index, 'netWgt']) / 0.08

In [32]:
# US data in m3 on Argon is mostly wrong -> use netWgt/1.78 (density of argon)
data.loc[data.loc[data.cmdCode == '280421'].loc[data.reporterISO == 'USA'].index, 'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '280421'].loc[data.reporterISO == 'USA'].index, 'netWgt']) / 1.78

In [33]:
# obviously wrong
data = data.drop(data.loc[data.cmdCode == '841181'].loc[data.refYear == 2023].loc[data.reporterISO == 'PHL'].index)
data = data.drop(data.loc[data.cmdCode == '841181'].loc[data.refYear == 2022].loc[data.reporterISO == 'CZE'].index)

In [34]:
# obviously wrong
data = data.drop(data.loc[data.cmdCode == '271121'].loc[data.reporterISO == 'MEX'].loc[data.refYear.isin([2021,2022,2023])].index)

In [4]:
data = data.drop(data.loc[data.cmdCode == '271129'].loc[data.refYear.isin([2020,2021])].loc[data.reporterISO == 'USA'].index)

## Straight up mistakes from export data

In [50]:
# US trade data of gases is all wrong
data.loc[data.loc[data.cmdCode == '280440'].loc[data.reporterISO == 'USA'].index,'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '280440'].loc[data.reporterISO == 'USA'].index,'netWgt'] / 1.429)

data.loc[data.loc[data.cmdCode == '280430'].loc[data.reporterISO == 'USA'].index,'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '280430'].loc[data.reporterISO == 'USA'].index,'netWgt'] / 1.2506)

data.loc[data.loc[data.cmdCode == '280421'].loc[data.reporterISO == 'USA'].index,'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '280421'].loc[data.reporterISO == 'USA'].index,'netWgt'] / 1.784)

data.loc[data.loc[data.cmdCode == '280429'].loc[data.reporterISO == 'USA'].index,'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '280429'].loc[data.reporterISO == 'USA'].index,'netWgt'] / 0.166)

data = data.drop(data.loc[data.cmdCode == '280410'].loc[data.reporterISO == 'USA'].loc[data.refYear != 2023].index)

In [52]:
data.loc[data.loc[data.cmdCode == '440729'].loc[data.reporterISO == 'ZAF'].loc[
data.partnerISO.isin(['W00','TZA'])].loc[data.refYear==2023].index,'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '440729'].loc[data.reporterISO == 'ZAF'].loc[
    data.partnerISO.isin(['W00','TZA'])].loc[data.refYear==2023].index,'netWgt'] / 700)

In [54]:
data = data.drop(data.loc[data.cmdCode == '8606'].loc[data.reporterISO == 'SAU'].loc[data.refYear == 2020].index)
data = data.drop(data.loc[data.cmdCode == '831190'].loc[data.reporterISO == 'MEX'].loc[data.refYear == 2021].index)
data = data.drop(data.loc[data.cmdCode.isin(['854110','854231','854121','854129'])].loc[data.reporterISO == 'THA'].loc[data.refYear == 2022].index)
data = data.drop(data.loc[data.cmdCode == '711011'].loc[data.reporterISO == 'ZWE'].loc[data.refYear == 2021].index)
data = data.drop(data.loc[data.cmdCode == '381010'].loc[data.reporterISO == 'MEX'].loc[data.refYear == 2022].index)

In [56]:
# recalculated the value of W00
data.loc[data.loc[data.cmdCode == '831190'].loc[data.reporterISO == 'MEX'].loc[data.partnerISO == 'W00'].loc[data.refYear==2022].index,'usedqty'] = 651154.687

In [59]:
data.loc[data.loc[data.cmdCode == '260400'].loc[data.reporterISO == 'ZWE'].loc[data.refYear.isin([2020, 2021])].index,'usedqty'] = (
    data.loc[data.loc[data.cmdCode == '260400'].loc[data.reporterISO == 'ZWE'].loc[data.refYear.isin([2020, 2021])].index,'netWgt'])

In [61]:
# this thing (helicopter trade) is a disgusting mess
data = data.drop(data.loc[data.cmdCode == '880212'].loc[data.reporterISO.isin(['ARE','BOL','AUS'])].loc[data.refYear == 2019].index)
data = data.drop(data.loc[data.cmdCode == '880212'].loc[data.reporterISO.isin(['LBN','LKA','BOL','GAB','AUS','CHN'])].loc[data.refYear == 2020].index)
data = data.drop(data.loc[data.cmdCode == '880212'].loc[data.reporterISO.isin(['BOL'])].loc[data.refYear == 2021].index)
data = data.drop(data.loc[data.cmdCode == '880212'].loc[data.reporterISO.isin(['THA'])].loc[data.refYear == 2022].index)
data = data.drop(data.loc[data.cmdCode == '880212'].loc[data.reporterISO.isin(['IND'])].loc[data.refYear == 2023].index)

# another mess (locomotives)
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO.isin(['RUS','AUT','DNK','ZAF'])].loc[data.refYear == 2019].index)
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO.isin(['KWT','ZAF'])].loc[data.refYear == 2020].index)
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO.isin(['GBR','ZAF'])].loc[data.refYear == 2021].index)
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO.isin(['KOR','JPN','CHL','GBR','ZAF','SVK'])].loc[data.refYear == 2022].index)
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO.isin(['GBR','ZAF','AUS','ITA','SVK'])].loc[data.refYear == 2023].index)

In [62]:
data = data.drop(data.loc[data.cmdCode == '843920'].loc[data.reporterISO.isin(['ITA','AUS'])].loc[data.refYear == 2019].index)
data = data.drop(data.loc[data.cmdCode == '843920'].loc[data.reporterISO.isin(['SAU','AUS'])].loc[data.refYear == 2020].index)
data = data.drop(data.loc[data.cmdCode == '8506'].loc[data.reporterISO.isin(['JOR','PER'])].loc[data.refYear.isin([2019, 2020])].index)
data = data.drop(data.loc[data.cmdCode == '270500'].loc[data.reporterISO.isin(['USA'])].loc[data.refYear.isin([2022, 2023])].index)

In [63]:
data.loc[data.loc[data.cmdCode.isin(['230230','080132','071333'])].loc[data.reporterISO.isin(['MOZ'])].loc[
data.refYear.isin([2019,2020,2021,2022])].index,'usedqty'] /= 1000

In [49]:
data.drop(data.loc[data.cmdCode == '840734'].loc[data.reporterISO == 'MYS'].loc[data.partnerISO == 'JPN'].loc[data.refYear == 2023].index)
data.drop(data.loc[data.cmdCode == '840734'].loc[data.reporterISO == 'BOL'].loc[data.refYear.isin([2019, 2020, 2021])].index)
data.loc[data.loc[data.cmdCode == '840734'].loc[data.reporterISO == 'MEX'].loc[data.refYear != 2019].index, 'usedqty'] /= 1000

In [52]:
# these data on locomotives are absolute trash, but let's remove the really egrigious ones
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO.isin(['BEL','AGO'])].loc[data.refYear.isin([2022])].index)
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO.isin(['NGA'])].loc[data.refYear.isin([2023])].index)
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO == 'CHL'].loc[data.refYear.isin([2022, 2023])].index)
data = data.drop(data.loc[data.cmdCode == '860110'].loc[data.reporterISO == 'GBR'].loc[data.refYear.isin([2020, 2021, 2022, 2023])].index)

#### Bunch of W00 entries missing, and that's the useful source of data for exports. So we recalculate the W00 by adding up all other data entry

In [64]:
data_dict = data.groupby(['cmdCode', 'refYear', 'reporterISO']).apply(lambda x: dict(zip(x.partnerISO, x.usedqty))).to_dict()

In [65]:
for combi in data_dict.keys():
    if 'W00' not in data_dict[combi].keys():
        data_dict[combi]['W00'] = sum(data_dict[combi].values())

In [66]:
df = pd.DataFrame([(key[0], key[1], key[2], k, v) for key, value in data_dict.items() for k, v in value.items()], 
                  columns=['cmdCode', 'refYear', 'reporterISO', 'partnerISO', 'usedqty'])

In [54]:
cursor = conn2.cursor()
cursor.execute('DROP TABLE [Import data]')
# gotta remove potential little NaNs and inf lurking around
data.set_index('cmdCode').loc[:,['refYear','reporterISO','partnerISO','usedqty']].replace(np.inf, np.nan).dropna(subset='usedqty').to_sql('Import data', conn2)

7429252