In [2]:
import json
import re
import requests
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from xml.etree import ElementTree

In [3]:
# Full request parameters. Two "all" parameters are allowed per request.
"""
http://wits.worldbank.org/API/V1/SDMX/V21/datasource/TRN/
reporter/{all|reporter code list}
/partner/{all|partner code list}
/product/{ALL|product code list}
/year/{All|Year List
/datatype/{reported|aveestimated}}"""
    
    
    
countries_url = "http://wits.worldbank.org/API/V1/wits/datasource/trn/country/ALL"
nomenclature_url = "http://wits.worldbank.org/API/V1/wits/datasource/trn/nomenclature/" #H0 through H4
products_url = "http://wits.worldbank.org/API/V1/wits/datasource/trn/product/all" # nomenclaturecode is HS

# Country, product, nomenclature, partner list, 
# number preferential agreements, is specific duty expression estimated, last update
data_available_url = "http://wits.worldbank.org/API/V1/wits/datasource/trn/dataavailability/" 

In [4]:
# US 840. Canada 124. EU 918. 710 South Africa
# Make sure partner is taken from Partner list in data availability
test_url = "http://wits.worldbank.org/API/V1/SDMX/V21/datasource/TRN/reporter/918/partner/710/product/ALL/year/2008/datatype/reported"

In [5]:
res = requests.get(test_url)

In [6]:
root = ElementTree.fromstring(res.content)
series = root.findall(".//Series")
obs = root.findall(".//Obs")
dataset = root.findall(".//Dataset")

In [7]:
first = series[0].findall(".//Obs")[0].attrib
attribute = series[0].attrib
combined = {**first,**attribute}
df = pd.DataFrame(columns=list(combined.keys()))

data = [{**s.findall(".//Obs")[0].attrib,**s.attrib}  for s in series]

df = df.append(data)

In [8]:
df.head()

Unnamed: 0,PRODUCTCODE,SUM_OF_RATES,OBS_VALUE_MEASURE,NBR_PREF_LINES,NBR_MFN_LINES,MAX_RATE,OBS_VALUE,NBR_NA_LINES,PARTNER,NOMENCODE,REPORTER,MIN_RATE,DATATYPE,TARIFFTYPE,TIME_PERIOD,FREQ,TOTALNOOFLINES
0,10110,1.20000004768372,SimpleAverage,2,1,1.20000004768372,0.400000015894572,0,710,H3,918,0,Reported,PREF,2008,A,3
1,10190,1.20000004768372,SimpleAverage,3,1,1.20000004768372,0.300000011920929,0,710,H3,918,0,Reported,PREF,2008,A,4
2,10391,0.0,SimpleAverage,1,1,0.0,0.0,1,710,H3,918,0,Reported,PREF,2008,A,2
3,10392,0.0,SimpleAverage,2,1,0.0,0.0,2,710,H3,918,0,Reported,PREF,2008,A,3
4,10410,0.0,SimpleAverage,2,1,0.0,0.0,2,710,H3,918,0,Reported,PREF,2008,A,3


In [9]:
df.describe()

Unnamed: 0,PRODUCTCODE,SUM_OF_RATES,OBS_VALUE_MEASURE,NBR_PREF_LINES,NBR_MFN_LINES,MAX_RATE,OBS_VALUE,NBR_NA_LINES,PARTNER,NOMENCODE,REPORTER,MIN_RATE,DATATYPE,TARIFFTYPE,TIME_PERIOD,FREQ,TOTALNOOFLINES
count,3562,3562,3562,3562,3562,3562,3562,3562,3562,3562,3562,3562,3562,3562,3562,3562,3562
unique,3562,104,1,39,25,65,101,27,1,1,1,44,1,1,1,1,44
top,281420,0,SimpleAverage,1,0,0,0,0,710,H3,918,0,Reported,PREF,2008,A,1
freq,1,3306,3562,1798,2983,3306,3306,3363,3562,3562,3562,3349,3562,3562,3562,3562,1561


In [10]:
df[df.MAX_RATE==df.MIN_RATE].count()

PRODUCTCODE          3489
SUM_OF_RATES         3489
OBS_VALUE_MEASURE    3489
NBR_PREF_LINES       3489
NBR_MFN_LINES        3489
MAX_RATE             3489
OBS_VALUE            3489
NBR_NA_LINES         3489
PARTNER              3489
NOMENCODE            3489
REPORTER             3489
MIN_RATE             3489
DATATYPE             3489
TARIFFTYPE           3489
TIME_PERIOD          3489
FREQ                 3489
TOTALNOOFLINES       3489
dtype: int64

In [11]:
df[df.MAX_RATE=="0"].count()

PRODUCTCODE          3306
SUM_OF_RATES         3306
OBS_VALUE_MEASURE    3306
NBR_PREF_LINES       3306
NBR_MFN_LINES        3306
MAX_RATE             3306
OBS_VALUE            3306
NBR_NA_LINES         3306
PARTNER              3306
NOMENCODE            3306
REPORTER             3306
MIN_RATE             3306
DATATYPE             3306
TARIFFTYPE           3306
TIME_PERIOD          3306
FREQ                 3306
TOTALNOOFLINES       3306
dtype: int64

In [12]:
len(df[df.MAX_RATE==""]) # 96 missing or empty max rates

96

In [13]:
df[['NOMENCODE','PRODUCTCODE']]

Unnamed: 0,NOMENCODE,PRODUCTCODE
0,H3,010110
1,H3,010190
2,H3,010391
3,H3,010392
4,H3,010410
5,H3,010420
6,H3,010511
7,H3,010512
8,H3,010519
9,H3,010594


In [14]:
# WITS UNCTAD TRAINS provides graphical query interface with API-like url structure
gui_url = "http://wits.worldbank.org/tariff/trains/en/country/USA/year/2014/pagenumber/1/pageSize/134684"
res = requests.get(gui_url)

In [15]:
soup = BeautifulSoup(res.text,"html.parser")

In [16]:
scripts = soup.find_all("script")
len(scripts)

19

In [17]:

p = re.search("(localdata: \[)(.*)",scripts[7].text)
data = p.group(2)

In [18]:
fields = ['ProductCode','ProductDescription','Partner','PartnerName','AdValorem','MeasureName','NonAdValorem','AffectedPartners']
rows = [s+"," for s in data.split("},")]


output = []
for row in rows:
    r = {}
    for f in fields:
        s = re.search("(?<="+f+" :)(.*?)(?=,)",row)
        value = s.group(0).replace("\"","")
        #print(f+":"+value)
        r.update({f:value})
    output.append(r)
print(r)

{'PartnerName': ' Non-MFN Countries for USA: 2006  ', 'Partner': ' N76  ', 'ProductDescription': ' Antiques of an age exceeding one hundred years  ', 'AffectedPartners': '   }\r', 'NonAdValorem': '   ', 'ProductCode': ' 97060000  ', 'AdValorem': ' 0.00  ', 'MeasureName': ' Non-MFN duty rate for countries excluded from Most-Favoured- Nation (MFN) treatement  '}


In [19]:
f = "ProductCode"
s = re.search("(?<="+f+" :)(.*?)(?=,)",rows[0])
value = s.group(0).replace("\"","")
value

' 01012100  '

In [20]:
p = re.search('var totalRecords= ([0-9]{1,6})',scripts[7].text)
p.group(0)

'var totalRecords= 134684'

In [21]:
dat = pd.DataFrame(output)


In [23]:
for a in dat.NonAdValorem.unique():
    pass
    #print(a)

In [45]:
nav_unique = dat.NonAdValorem.unique()
units = """cent_tariff: {<CD><NN>}
           cent_tariff: {<CD><JJ><NN>}
           cent_tariff: {<CD><NNS><DT>}
           dollar_tariff:{<$>?<CD>}
           content:{<IN><DT><NN>}
           content:{<IN><NN><NN>}
           content:{<IN><JJ><NNS>}
           """ # 6.6 cents/kg
percent = "price:{<$>?<CD>}" # $ 1.64/kg
content = "<IN<NN><NN>" #on molybdenum content
extra_feature = "<IN><DT><NN>" # on the case

# value currency_unit weight_unit 
cp = nltk.RegexpParser(units)
for n in nav_unique:
    pos = nltk.pos_tag(nltk.word_tokenize(n))
    print(cp.parse(pos))
    
    

(S )
(S (cent_tariff 6.6/CD cents/kg/NN))
(S (cent_tariff 1/CD cents/kg/NN))
(S (cent_tariff 0.4/CD cents/kg/NN))
(S (cent_tariff 5.5/CD cents/kg/NN))
(S (cent_tariff 4.4/CD cents/kg/NN))
(S $/$ (dollar_tariff 3/head/CD))
(S (cent_tariff 68/CD cents/head/NN))
(S (cent_tariff 0.9/CD cents/NNS each/DT))
(S (cent_tariff 4/CD cents/NNS each/DT))
(S (cent_tariff 2/CD cents/kg/NN))
(S (cent_tariff 17.6/CD cents/kg/NN))
(S (cent_tariff 13.2/CD cents/kg/NN))
(S (cent_tariff 1.7/CD cents/kg/NN))
(S (cent_tariff 1.4/CD cents/kg/NN))
(S (cent_tariff 7.2/CD cents/kg/NN))
(S (cent_tariff 0.7/CD cents/kg/NN))
(S (cent_tariff 15.4/CD cents/kg/NN))
(S (cent_tariff 2.8/CD cents/kg/NN))
(S (cent_tariff 11/CD cents/kg/NN))
(S (cent_tariff 8.8/CD cents/kg/NN))
(S (cent_tariff 22/CD cents/kg/NN))
(S Not/RB available/JJ)
(S (cent_tariff 12.3/CD cents/kg/NN))
(S (cent_tariff 15/CD cents/kg/NN))
(S (cent_tariff 6/CD cents/kg/NN))
(S (cent_tariff 6.1/CD cents/kg/NN))
(S (cent_tariff 0.8/CD cents/kg/NN))
(S (ce

In [36]:
dat[dat.NonAdValorem!='   ']

Unnamed: 0,AdValorem,AffectedPartners,MeasureName,NonAdValorem,Partner,PartnerName,ProductCode,ProductDescription
49,1.96,,Non-MFN duty rate for countries excluded from...,6.6 cents/kg,N76,Non-MFN Countries for USA: 2006,01022920,Cows imported specially for dairy purposes
50,0.30,,Most Favoured Nation duty rate treatement,1 cents/kg,000,World,01022940,Live cattle other than purebred or those impo...
51,0.12,,Australia - United States Free Trade Agreemen...,0.4 cents/kg,036,Australia,01022940,Live cattle other than purebred or those impo...
68,1.63,,Non-MFN duty rate for countries excluded from...,5.5 cents/kg,N76,Non-MFN Countries for USA: 2006,01022940,Live cattle other than purebred or those impo...
72,0.30,,Most Favoured Nation duty rate treatement,1 cents/kg,000,World,01023900,Live buffalo
73,0.12,,Australia - United States Free Trade Agreemen...,0.4 cents/kg,036,Australia,01023900,Live buffalo
90,1.64,,Non-MFN duty rate for countries excluded from...,5.5 cents/kg,N76,Non-MFN Countries for USA: 2006,01023900,Live buffalo
92,0.28,,Most Favoured Nation duty rate treatement,1 cents/kg,000,World,01029000,Live bovine animals
93,0.11,,Australia - United States Free Trade Agreemen...,0.4 cents/kg,036,Australia,01029000,Live bovine animals
110,1.53,,Non-MFN duty rate for countries excluded from...,5.5 cents/kg,N76,Non-MFN Countries for USA: 2006,01029000,Live bovine animals
