# <span style="color:#d3d1df">Discovery & Preparation for European Service Trade Data</span>

## <span style="color:#f1c232">Environment</span>

In [47]:
#Packages
import pandas as pd
import eurostat  


---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## <span style="color:#f1c232">Data Discovery</span>

Let us initiate our analysis by getting the parameters of the dataset and the values contained in that dataset.

In [84]:
# Print each parameter of the dataset followed by its list of (code, label) pairs.
dataset_code = 'EXT_STEC01'
for par in eurostat.get_pars(dataset_code):
    print(par, eurostat.get_dic(dataset_code, par, full=False))

freq [('A', 'Annual')]
unit [('THS_EUR', 'Thousand euro')]
sizeclas [('TOTAL', 'Total'), ('0-49', 'From 0 to 49 employees'), ('LT10', 'Fewer than 10 employees'), ('10-49', 'From 10 to 49 employees'), ('50-249', 'From 50 to 249 employees'), ('GE250', '250 employees or more'), ('UNK', 'Unknown')]
stk_flow [('IMP', 'Imports'), ('EXP', 'Exports')]
nace_r2 [('TOTAL', 'Total - all NACE activities'), ('A_B', 'Agriculture, forestry and fishing; mining and quarrying'), ('C', 'Manufacturing'), ('D_E', 'Electricity, gas, steam and air conditioning supply; water supply; sewerage, waste management and remediation activities'), ('F', 'Construction'), ('G', 'Wholesale and retail trade; repair of motor vehicles and motorcycles'), ('H', 'Transportation and storage'), ('I_L_O-U', 'Accommodation and food service activities; real estate activities; public administration, defence; compulsory social security; education; human health and social work activities; other services'), ('J', 'Information and commun

**Observations:** <br>

* **freq** column is redundant (every observation is *Annual*) and will be dropped.
* **sizeclas** column will not be used, only *TOTAL* observations will be selected and then the column can be removed.
* **partner** column will not be used, only *WORLD* observations will be selected and then the column can be removed.
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


## <span style="color:#f1c232">Data Preparation</span>

Get the dataset via Eurostat API.

In [76]:
# Download the EXT_STEC01 table through the Eurostat API.
# flags=True yields a <year>_value / <year>_flag column pair for every year.
df = eurostat.get_data_df(
    'EXT_STEC01',
    flags=True,
)
df

Unnamed: 0,freq,unit,sizeclas,stk_flow,nace_r2,partner,geo\TIME_PERIOD,2013_value,2013_flag,2014_value,...,2015_value,2015_flag,2016_value,2016_flag,2017_value,2017_flag,2018_value,2018_flag,2019_value,2019_flag
0,A,THS_EUR,0-49,EXP,A_B,EU27_2020,EE,,:,,...,,:,,:,,:,,:,16104.0,
1,A,THS_EUR,0-49,EXP,A_B,EU27_2020,IE,,:,,...,,:,,:,,:,,: c,,: c
2,A,THS_EUR,0-49,EXP,A_B,EU27_2020,LT,,:,,...,,:,,:,,: c,,: c,,: c
3,A,THS_EUR,0-49,EXP,A_B,EU27_2020,NO,,:,,...,,:,,:,,:,,:,,: c
4,A,THS_EUR,0-49,EXP,A_B,EXT_EU27_2020,EE,,:,,...,,:,,:,,:,,:,144.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4181,A,THS_EUR,UNK,IMP,UNK,WORLD,LU,6298265.0,,5962528.0,...,5522017.0,,4845412.0,,5370744.0,,7333921.0,,,:
4182,A,THS_EUR,UNK,IMP,UNK,WORLD,NL,45630056.0,,58631521.0,...,92582687.0,,57554140.0,,63163970.0,,69427476.0,,,:
4183,A,THS_EUR,UNK,IMP,UNK,WORLD,NO,20288301.0,,,...,15084812.0,,14379122.0,,,:,,:,8572654.0,
4184,A,THS_EUR,UNK,IMP,UNK,WORLD,PL,,:,,...,,:,11753681.0,,,:,13970364.0,,13999084.0,


Exclude the observations and variables that will not be used in the analysis.

In [77]:
# Partner and enterprise-size breakdowns are not examined in this research:
# keep only the WORLD partner / TOTAL size-class rows, then drop the columns
# that have become constant (freq is 'A' for every observation).
keep_rows = (df['partner'] == 'WORLD') & (df['sizeclas'] == 'TOTAL')
df = df.loc[keep_rows].drop(columns=['freq', 'sizeclas', 'partner'])

# Rename the country code column.
# Raw string: the label contains a literal backslash, and '\T' is an invalid
# escape sequence in a normal string literal (warning today, error in future).
df = df.rename(columns={r'geo\TIME_PERIOD': 'code'})

df


Unnamed: 0,unit,stk_flow,nace_r2,code,2013_value,2013_flag,2014_value,2014_flag,2015_value,2015_flag,2016_value,2016_flag,2017_value,2017_flag,2018_value,2018_flag,2019_value,2019_flag
2998,THS_EUR,EXP,A_B,AT,16173.0,,,:,17206.0,,33066.0,,22770.0,,112952.0,,140664.0,
2999,THS_EUR,EXP,A_B,BE,,:,149565.0,,,:,,:,,:,,:,,:
3000,THS_EUR,EXP,A_B,CZ,18035.0,,,:,,:,,:,,:,,:,,:
3001,THS_EUR,EXP,A_B,DK,215102.0,,250536.0,,,: c,359291.0,,,:,,:,,:
3002,THS_EUR,EXP,A_B,EE,6259.0,,9565.0,,8361.0,,11011.0,,6876.0,,12031.0,,16406.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3583,THS_EUR,IMP,UNK,LU,6298265.0,,5965785.0,,5522177.0,,5535916.0,,6153912.0,,8855939.0,,,:
3584,THS_EUR,IMP,UNK,NL,45630056.0,,58631521.0,,92582687.0,,57554140.0,,63163970.0,,69427476.0,,,:
3585,THS_EUR,IMP,UNK,NO,20288301.0,,,:,17044913.0,,20406737.0,,,:,,:,21375144.0,
3586,THS_EUR,IMP,UNK,PL,,:,,:,,:,11753681.0,,,:,13970364.0,,0.0,


Reshape the data from wide form (one value/flag column pair per year) into long form (one row per country, activity, flow, and year).

In [78]:
# Reshape from wide (one '<year>_value' / '<year>_flag' column pair per year)
# to long (one row per nace_r2 / stk_flow / unit / code / year).
id_cols = ['nace_r2', 'stk_flow', 'unit', 'code']
key_cols = id_cols + ['year']

df_temp = df.melt(id_vars=id_cols, var_name='Cols')

# Column labels look like '2013_value': first four characters are the year,
# everything after the underscore says whether the cell is a value or a flag.
df_temp['year'] = df_temp['Cols'].str[:4]
df_temp['Cols'] = df_temp['Cols'].str[5:]

# Split the melted rows into values and flags, then put them side by side.
values = df_temp.loc[df_temp['Cols'] == 'value'].drop(columns='Cols')
flags = (
    df_temp.loc[df_temp['Cols'] == 'flag']
    .drop(columns='Cols')
    .rename(columns={'value': 'flag'})
)
df = values.merge(flags, on=key_cols, how='outer')

# melt() stacked numbers and flag strings into one object column; restore a
# numeric dtype so that arithmetic and aggregations behave as expected.
df['value'] = pd.to_numeric(df['value'])

del df_temp, values, flags
df

Unnamed: 0,nace_r2,stk_flow,unit,code,value,year,flag
0,A_B,EXP,THS_EUR,AT,16173.0,2013,
1,A_B,EXP,THS_EUR,BE,,2013,:
2,A_B,EXP,THS_EUR,CZ,18035.0,2013,
3,A_B,EXP,THS_EUR,DK,215102.0,2013,
4,A_B,EXP,THS_EUR,EE,6259.0,2013,
...,...,...,...,...,...,...,...
2725,UNK,IMP,THS_EUR,LU,,2019,:
2726,UNK,IMP,THS_EUR,NL,,2019,:
2727,UNK,IMP,THS_EUR,NO,21375144.0,2019,
2728,UNK,IMP,THS_EUR,PL,0.0,2019,


Set the country code, year, and NACE activity as the index.

In [79]:
# Index the long table by country code, year, and NACE activity.
# NOTE(review): stk_flow stays a regular column, so an index entry may repeat
# (one row per flow direction) - confirm this is intended.
index_cols = ['code', 'year', 'nace_r2']
df = df.set_index(index_cols)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,stk_flow,unit,value,flag
code,year,nace_r2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AT,2013,A_B,EXP,THS_EUR,16173.0,
BE,2013,A_B,EXP,THS_EUR,,:
CZ,2013,A_B,EXP,THS_EUR,18035.0,
DK,2013,A_B,EXP,THS_EUR,215102.0,
EE,2013,A_B,EXP,THS_EUR,6259.0,
...,...,...,...,...,...,...
LU,2019,UNK,IMP,THS_EUR,,:
NL,2019,UNK,IMP,THS_EUR,,:
NO,2019,UNK,IMP,THS_EUR,21375144.0,
PL,2019,UNK,IMP,THS_EUR,0.0,


---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## <span style="color:#f1c232">Data analysis</span>

To understand how different patterns of service outsourcing affect the distribution of labor earnings, we start our analysis by understanding the levels.