In [1]:
import numpy as np
import pandas as pd

import binet as bnt



#### Import data

In [2]:
# Option 1: download the 'hs02' trade data straight from the Atlas of Economic Complexity.
wt, pr, co = bnt.trade_data('hs02')
# Drop the 'xxa' and 'xxb' country codes (presumably non-country aggregates -- TODO confirm).
wt = wt[~wt.ccode.isin(['xxa', 'xxb'])]

# Option 2: load local copies instead, if you already have them (needs `import pandas as pd`).
# wt = pd.read_csv('wt.csv')
# pr = pd.read_csv('pr.csv')
# co = pd.read_csv('co.csv')

Retrieving trade data for hs02
Downloading country names from http://atlas.media.mit.edu/static/db/raw/country_names.tsv.bz2
Downloading product names from http://atlas.media.mit.edu/static/db/raw/products_hs_02.tsv.bz2
Downloading trade   data  from http://atlas.media.mit.edu/static/db/raw/year_origin_hs02_4.tsv.bz2


In [4]:
# Compute pRCA for every country-product-year row, naming the relevant columns explicitly.
x = bnt.calculatepRCA(wt, y='year', c='ccode', p='pcode', x='x')

In [5]:
# Preview ten random rows; the seed is fixed so the preview is the same on every run.
x.sample(10, random_state=0)

Unnamed: 0,year,ccode,pcode,x,RCA,log(x),T,log(RCA),RCA_y+1,pRCA
196670,2004,swe,210,4680435.07,0.132141,6.670286,7.549247,-0.878961,0,0.005
1399764,2013,arg,6808,36134.45,0.021682,4.557921,6.22183,-1.663909,0,0.0
77404,2003,blr,3914,52900.0,0.04443,4.723456,6.075783,-1.352328,0,0.025
909250,2010,uga,6302,163157.43,0.076392,5.212607,6.329558,-1.116951,0,0.015
59691,2003,mar,7419,829305.83,0.3868,5.918715,6.331228,-0.412513,0,0.06
673363,2008,afg,9017,2580.0,0.03554,3.41162,4.860906,-1.449286,0,0.075
445557,2006,swe,5204,135490.99,0.048486,5.13191,6.446293,-1.314382,0,0.01
389941,2006,ant,8421,488619.0,0.049299,5.68897,6.996136,-1.307166,0,0.005
850313,2009,khm,4001,97506140.71,14.072782,7.989032,6.840652,1.14838,1,0.985
1137568,2011,mkd,5101,3070792.74,2.272824,6.487251,6.130685,0.356566,1,0.86


### Source of the calculatepRCA() and calculateRCA_by_year() functions
##### In case you want to see how pRCA is computed. It looks scary, but it's actually simple.

In [7]:
#The function calculatepRCA computes pRCA by fitting a k-nearest-neighbors model.
#It gives the probability of having RCA > 1 next year, given log(x) and T.

from sklearn import neighbors

def calculatepRCA(data, y ='',c='',p='',x='', n_neighbors=200, n_splits=10):
    r'''
    Returns the pRCA from data. pRCA is the estimated probability that RCA_{y+1} > 1 given the
    log volume of exports, log(x_{cpy}), and the log 'baseline term'
    T = log(\sum_c x_{cpy} \sum_p x_{cpy} / \sum_c \sum_p x_{cpy}).
    It is computed with a k-nearest-neighbors regressor in the (log(x), T) plane, fitted on the
    0/1 indicator "RCA > 1 in the following year".

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data with one row per year, country, product and export volume.
    y,c,p,x : str (optional)
        Labels of the columns in data used for year, country, product and volume.
        An empty string falls back to 'year', 'ccode', 'pcode' and 'x' respectively.
    n_neighbors : int (200)
        Number of neighbors averaged by the regressor.
    n_splits : int (10)
        Number of chunks the prediction is split into, to limit peak memory.

    Returns
    -------
    pRCA : pandas.DataFrame
        One row per country-product-year that also has a record in the following year, with the
        columns y, c, p, x, 'RCA', 'log(x)', 'T', 'log(RCA)', 'RCA_y+1' and 'pRCA'.

    Raises
    ------
    ValueError
        If no row has a matching record in the following year (nothing to fit).

    Notes
    -----
    Rows without a record for the same country and product in year y+1 (including every row of
    the last year) are dropped by the inner merge; they are not treated as RCA_{y+1} = 0.
    NOTE(review): a zero export volume gives log(x) = -inf, which the regressor presumably
    rejects -- confirm whether such rows need to be filtered out beforehand.
    '''
    # An empty label keeps the column names this function has always assumed.
    y = 'year' if y == '' else y
    c = 'ccode' if c == '' else c
    p = 'pcode' if p == '' else p
    x = 'x' if x == '' else x

    df = calculateRCA_by_year(data, y=y, c=c, p=p, x=x, log_terms=True)

    # Label each row with "RCA > 1 next year": shift every record one year back so that it
    # lines up with the previous year's row of the same country and product.
    next_year = df[[y, c, p, 'log(RCA)']].copy()
    next_year[y] = next_year[y] - 1
    next_year['RCA_y+1'] = (next_year['log(RCA)'] > 0).astype(int)
    df = df.merge(next_year[[y, c, p, 'RCA_y+1']], on=[y, c, p])

    if df.empty:
        raise ValueError('No country-product pair has data in two consecutive years; '
                         'pRCA cannot be fitted.')

    # Prepare dataset for knn and fit
    M = df[['log(x)', 'T', 'RCA_y+1']].to_numpy(dtype=float)
    features, target = M[:, :2], M[:, 2]
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights='uniform')
    knn.fit(features, target)

    # To avoid memory error, predict chunk by chunk. Empty chunks (more splits than rows)
    # are skipped, and the pieces are joined once instead of re-copying on every append.
    predictions = [knn.predict(chunk)
                   for chunk in np.array_split(features, n_splits)
                   if len(chunk)]
    df['pRCA'] = np.concatenate(predictions)

    return df

#The function calculateRCA_by_year takes care of the year column. See examples below and compare with bnt.calculateRCA

def calculateRCA_by_year(data,y ='',c='',p='',x='',shares=False, log_terms = False):
    r'''
    This function handles input data from more than one year.
    Returns the RCA of every country-product-year in data, where all totals are taken within
    the same year:

        RCA_{cpy} = (x_{cpy} / \sum_c x_{cpy}) / (\sum_p x_{cpy} / \sum_c \sum_p x_{cpy})

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data with one row per year, country, product and volume (trade, number of people etc.).
    y,c,p,x : str (optional)
        Labels of the columns in data used for year, country, product and volume.
        An empty string falls back to the first, second, third and fourth column of data.
    shares : boolean (False)
        If True it will also return the shares used to calculate the RCA.
        Takes precedence over log_terms when both are True.
    log_terms: boolean(False)
        If True it will also return the log exports log(x), the log 'baseline term'
        T = log(\sum_c x_{cpy} \sum_p x_{cpy} / \sum_c \sum_p x_{cpy}) and log(RCA),
        which is by definition the difference of these two. All logs are base 10.

    Returns
    -------
    RCA : pandas.DataFrame
        Table with the RCAs, with the columns y,c,p,x,RCA
        If shares is True it also includes:
            s_c : Share of X_cp over X_c
            s_p : Share of X_cp over X_p
        If log_terms is True (and shares is False), it also includes:
            log(x) : log of exports
            T : log of the baseline term, which is market size of product * market size of country / total world trade
            log(RCA) : log of RCA computed as log(x) - T

    Notes
    -----
    A zero volume gives log(x) = -inf (and log(RCA) = -inf) when log_terms is True.
    '''
    columns = data.columns.values
    y = columns[0] if y == '' else y
    c = columns[1] if c == '' else c
    p = columns[2] if p == '' else p
    x = columns[3] if x == '' else x
    data_ = data[[y,c,p,x]]

    def _attach_total(frame, group_keys, merge_keys, name):
        # Add column `name` holding the sum of x within each group. Only x is summed, so
        # non-numeric columns (e.g. string country codes) are never aggregated.
        totals = frame.groupby(group_keys, as_index=False)[x].sum().rename(columns={x: name})
        return pd.merge(frame, totals, how='inner', on=merge_keys)

    country_col = x+'_'+c+'_'+y   # exports of the country in the year
    product_col = x+'_'+p+'_'+y   # world exports of the product in the year
    world_col = x+'_'+y           # world exports in the year

    data_ = _attach_total(data_, [c,y], [y,c], country_col)
    data_ = _attach_total(data_, [p,y], [y,p], product_col)
    data_ = _attach_total(data_, y, y, world_col)

    # Convert once; every formula below works on floats.
    exports = data_[x].astype(float)
    country_total = data_[country_col].astype(float)
    product_total = data_[product_col].astype(float)
    world_total = data_[world_col].astype(float)

    data_['RCA'] = (exports/product_total)/(country_total/world_total)

    if shares:
        data_['s_'+c] = exports/country_total
        data_['s_'+p] = exports/product_total
        return data_[[y,c,p,x,'RCA','s_'+c,'s_'+p]]
    if log_terms:
        data_['log(x)'] = np.log10(exports)
        # Baseline term: exports expected from the market sizes alone, so RCA = x / 10**T.
        data_['T'] = np.log10(product_total*country_total/world_total)
        data_['log(RCA)'] = data_['log(x)'] - data_['T']
        return data_[[y,c,p,x,'RCA','log(x)','T','log(RCA)']]
    return data_[[y,c,p,x,'RCA']]

Unnamed: 0,year,ccode,pcode,x,RCA,log(x),T,log(RCA),RCA_y+1,pRCA
56246,2005,usa,8521,332841966.58,0.19,8.52,9.24,-0.72,0,0.0
32868,2004,ind,8302,154987701.5,1.08,8.19,8.16,0.04,1,0.64
120094,2009,irl,302,121482247.88,1.09,8.08,8.05,0.04,1,0.73
73910,2006,blx,8429,2377562048.65,2.13,9.38,9.05,0.33,1,0.98
156213,2010,dnk,8523,21411384.4,0.14,7.33,8.2,-0.87,0,0.0
918,2003,fin,712,55534.52,0.01,4.74,6.97,-2.23,0,0.0
27694,2004,hun,5806,5058421.1,0.3,6.7,7.23,-0.53,0,0.01
197100,2012,rus,8543,121770450.79,0.22,8.09,8.75,-0.66,0,0.0
149834,2010,swe,6405,15263332.2,0.47,7.18,7.51,-0.33,0,0.02
68511,2006,aut,6117,14643380.58,0.6,7.17,7.39,-0.22,0,0.06
