Data Wrangling das Bolsas de Doutorado pelo Mundo

Valor das bolsas coletado em 3 de Julho de 2022

André Luzardo, 4 de Julho de 2022

In [1]:
import pandas as pd
from scipy import stats

In [2]:
# Load the four raw inputs from CSV files in the working directory.
# The scholarship table is parsed with ',' treated as a thousands separator.
bolsas = pd.read_csv('BolsasGlassdoor.csv', thousands=',')
# The remaining three tables are read with pandas' default parsing options.
big_mac, citations, DollarPPP = (
    pd.read_csv(csv_name)
    for csv_name in (
        'big-mac-source-data.csv',
        'scimagojr country rank 1996-2020.csv',
        'DollarPPP.csv',
    )
)

In [3]:
# Use 'bolsas' as the main dataframe and attach selected Big Mac index columns.
# Only the big_mac rows dated '2022-01-01' are used.
# Columns are selected with a list, not a set: a set has no defined order (so the
# resulting column order could vary between runs) and pandas >= 2.0 rejects set
# indexers with a TypeError.
big_mac_cols = ['dollar_ex', 'GDP_dollar', 'local_price', 'iso_a3']
big_mac_2022 = big_mac.loc[big_mac['date'] == '2022-01-01', big_mac_cols]
# Default (inner) join: rows of 'bolsas' whose country_code has no iso_a3 match are dropped.
# NOTE: this cell rebinds 'bolsas', so it must be run exactly once per kernel session.
bolsas = pd.merge(
    bolsas,
    big_mac_2022,
    left_on='country_code',
    right_on='iso_a3',
).rename(columns={'local_price': 'bigmac_lcu'})

In [4]:
# Attach the PPP value from DollarPPP, using only the rows where TIME == 2021.
# Columns are selected with a list, not a set: a set has no defined order and
# pandas >= 2.0 rejects set indexers with a TypeError.
ppp_2021 = DollarPPP.loc[DollarPPP['TIME'] == 2021, ['LOCATION', 'Value']]
# Left join: every row of 'bolsas' is kept; 'ppp' is NaN where no LOCATION matches.
bolsas = pd.merge(
    bolsas,
    ppp_2021,
    left_on='country_code',
    right_on='LOCATION',
    how='left',
).rename(columns={'Value': 'ppp'})

In [5]:
# Attach citation metrics from 'citations', joined on the country name.
# Country names are rewritten on both sides before the join so that they match.
# NOTE: DataFrame.replace with a flat dict rewrites matching values in every column,
# not only in the country-name column.
# 'New Zeland' is presumably the spelling found in the input data - keep it as written.
bolsas_renamed = bolsas.replace(
    to_replace={'Britain': 'United Kingdom', 'New Zeland': 'New Zealand'}
)
# Columns are selected with a list, not a set: a set has no defined order and
# pandas >= 2.0 rejects set indexers with a TypeError.
citation_cols = ['Country', 'H index', 'Citations per document']
citations_renamed = citations[citation_cols].replace(
    to_replace={'Russian Federation': 'Russia'}
)
# Left join: every row of 'bolsas' is kept; citation columns are NaN where no name matches.
bolsas = pd.merge(
    bolsas_renamed,
    citations_renamed,
    left_on='país',
    right_on='Country',
    how='left',
)

In [6]:
# Remove the right-hand join keys (iso_a3, LOCATION) that duplicate country_code,
# and 'país', which was the join key for the 'Country' column.
redundant_cols = ['iso_a3', 'LOCATION', 'país']
bolsas = bolsas.drop(columns=redundant_cols)
# Index the table by country code.
bolsas = bolsas.set_index('country_code')

In [7]:
# Preview the first rows of the merged table instead of dumping the whole frame.
bolsas.head(10)

Unnamed: 0_level_0,bolsa_lcu,dollar_ex,GDP_dollar,bigmac_lcu,ppp,Country,H index,Citations per document
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ARG,1158648,105.0165,8571.94,450.0,,Argentina,477,17
AUS,30000,1.419346,52905.4,6.4,1.438979,Australia,1115,23
AUT,27912,0.892061,48592.71,3.8,0.770831,Austria,740,24
BEL,28200,0.892061,44688.46,4.55,0.742723,Belgium,886,26
BRA,32664,5.31,6822.56,22.9,2.53,Brazil,649,13
GBR,16815,0.745351,40394.13,3.59,0.692802,United Kingdom,1618,25
CAN,25567,1.2718,43294.8,6.77,1.253066,Canada,1299,26
CHL,9380028,798.6,12992.98,3100.0,430.349555,Chile,441,17
CHN,43764,6.36535,10511.34,24.4,4.18,China,1010,10
CZE,299460,21.6802,22942.86,89.0,12.919712,Czech Republic,524,13


In [8]:
# Express the annual scholarship amount and the Big Mac price in dollars by
# dividing each local-currency column by the 'dollar_ex' exchange rate.
usd_rate = bolsas['dollar_ex']
lcu_to_usd_cols = {'bolsa_lcu': 'bolsa_dollar', 'bigmac_lcu': 'bigmac_dollar'}
for lcu_col, usd_col in lcu_to_usd_cols.items():
    bolsas[usd_col] = bolsas[lcu_col] / usd_rate

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
bolsas.describe()

Unnamed: 0,bolsa_lcu,dollar_ex,GDP_dollar,bigmac_lcu,ppp,H index,Citations per document,bolsa_dollar,bigmac_dollar
count,51.0,51.0,51.0,51.0,40.0,51.0,51.0,51.0,51.0
mean,5434967.0,625.271216,29933.91549,1555.934706,278.730479,720.333333,18.176471,19747.537814,4.033228
std,27607920.0,2814.276934,22200.757795,6664.946661,1046.864921,424.903267,6.757828,14352.69244,1.198246
min,10908.0,0.745351,1254.86,2.8,0.540124,222.0,5.0,2000.0,1.743792
25%,26283.5,0.892061,10173.345,4.7,0.742414,433.0,12.0,9030.027404,3.115802
50%,45606.0,4.1855,25548.77,17.0,1.61616,630.0,18.0,15550.512,3.997974
75%,339156.0,21.1646,45452.03,72.0,9.766887,930.0,23.5,28163.432325,4.904375
max,192000000.0,14382.0,87366.6,34000.0,4738.64,2577.0,31.0,57941.801583,6.981365


In [10]:
# Write the wrangled table to 'bolsas.csv' in the working directory.
# The country_code index is written as the first column (to_csv writes the index by default).
bolsas.to_csv('bolsas.csv')