In [17]:
# Imports
import pandas as pd
import numpy as np
from unidecode import unidecode

In [100]:
# ISO Countries names
countries = pd.read_csv(r'countries_ISO.txt')
countries = countries.rename(
    columns={'alpha-2': 'alpha_2', 'alpha-3': 'alpha_3'})

In [101]:
countries[countries['name'].str.contains('Russ')]

Unnamed: 0,name,alpha_2,alpha_3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
183,Russian Federation,RU,RUS,643,ISO 3166-2:RU,Europe,Eastern Europe,,150.0,151.0,


In [117]:
# Data frame with the total number of patents granted per country/year
df_pat = pd.read_csv(r'total_patent_grants.csv', header=7).fillna(0)
df_pat = df_pat.rename(columns={'Origin (Code)': 'alpha_2'})
df_pat

Unnamed: 0,Origin,alpha_2,Office,Type,1980,1981,1982,1983,1984,1985,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Albania,AL,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,3.0,12.0,6.0,0.0,12.0,4.0,0.0,7.0
1,Algeria,DZ,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,41.0,0.0,0.0,79.0,64.0,85.0,35.0,34.0,49.0,104.0
2,Andorra,AD,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,11.0,12.0,14.0,18.0
3,Angola,AO,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0
4,Argentina,AR,Total,Total,1591.0,820.0,740.0,538.0,513.0,0.0,...,354.0,410.0,407.0,375.0,377.0,355.0,290.0,368.0,439.0,501.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Yemen,YE,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,14.0,11.0,8.0,2.0,0.0,24.0,9.0,24.0,6.0,1.0
170,Yugoslavia,YU,Total,Total,79.0,95.0,55.0,126.0,157.0,207.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,Zaire,ZR,Total,Total,3.0,8.0,6.0,6.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172,Zambia,ZM,Total,Total,0.0,0.0,2.0,0.0,0.0,10.0,...,2.0,2.0,6.0,4.0,3.0,12.0,5.0,159.0,4.0,0.0


In [118]:
# Including ISO names in the patents data frame
df_pat = pd.merge(df_pat, countries[['name', 'alpha_2']], how='left', on='alpha_2')
df_pat

Unnamed: 0,Origin,alpha_2,Office,Type,1980,1981,1982,1983,1984,1985,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,name
0,Albania,AL,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,3.0,12.0,6.0,0.0,12.0,4.0,0.0,7.0,Albania
1,Algeria,DZ,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,79.0,64.0,85.0,35.0,34.0,49.0,104.0,Algeria
2,Andorra,AD,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,11.0,12.0,14.0,18.0,Andorra
3,Angola,AO,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,Angola
4,Argentina,AR,Total,Total,1591.0,820.0,740.0,538.0,513.0,0.0,...,410.0,407.0,375.0,377.0,355.0,290.0,368.0,439.0,501.0,Argentina
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Yemen,YE,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,11.0,8.0,2.0,0.0,24.0,9.0,24.0,6.0,1.0,Yemen
170,Yugoslavia,YU,Total,Total,79.0,95.0,55.0,126.0,157.0,207.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
171,Zaire,ZR,Total,Total,3.0,8.0,6.0,6.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
172,Zambia,ZM,Total,Total,0.0,0.0,2.0,0.0,0.0,10.0,...,2.0,6.0,4.0,3.0,12.0,5.0,159.0,4.0,0.0,Zambia


In [120]:
# Missing names in Patents
miss_pat = df_pat.fillna('-')
miss_pat = miss_pat[miss_pat.name == '-']
miss_pat

Unnamed: 0,Origin,alpha_2,Office,Type,1980,1981,1982,1983,1984,1985,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,name
43,Czechoslovakia,CS,Total,Total,6768.0,5447.0,6116.0,6200.0,6267.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
62,German Democratic Republic,DD,Total,Total,4455.0,5713.0,4125.0,10709.0,11402.0,11487.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
108,Namibia,0,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,13.0,16.0,14.0,4.0,8.0,9.0,0.0,-
145,Soviet Union,SU,Total,Total,92909.0,96544.0,89305.0,72635.0,62755.0,73282.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
170,Yugoslavia,YU,Total,Total,79.0,95.0,55.0,126.0,157.0,207.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
171,Zaire,ZR,Total,Total,3.0,8.0,6.0,6.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-


In [121]:
# Dealing with missing country name in Patents
missing = {
    'German Democratic Republic': 'Germany',
    'Soviet Union': 'Russian Federation'
}
for k in missing.keys():
    i = df_pat[df_pat.Origin == k].index[0]
    df_pat.iloc[i, 46] = missing[k]


In [122]:
# Group values by names
df_pat = df_pat.drop(columns=['Origin', 'alpha_2', 'Office', 'Type']).dropna()
df_pat = df_pat.groupby(['name']).sum().reset_index()

In [131]:
# Data frame with the total population per country/year
df_pop = pd.read_csv(r'population.csv', header=2).drop(
    columns=['Indicator Name', 'Indicator Code']).rename(
        columns={'Country Code': 'alpha_3'})

df_pop

Unnamed: 0,Country Name,alpha_3,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,59291.0,59522.0,...,102880.0,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,
1,Africa Eastern and Southern,AFE,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,158313235.0,...,567891875.0,583650827.0,600008150.0,616377331.0,632746296.0,649756874.0,667242712.0,685112705.0,702976832.0,
2,Afghanistan,AFG,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,9783147.0,10010030.0,...,31541209.0,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,
3,Africa Western and Central,AFW,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,110798486.0,113319950.0,...,387204553.0,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,
4,Angola,AGO,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,5787044.0,5827503.0,...,26147002.0,27128337.0,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,947000.0,966000.0,994000.0,1022000.0,1050000.0,1078000.0,1106000.0,1135000.0,...,1818117.0,1812771.0,1788196.0,1777557.0,1791003.0,1797085.0,1788878.0,1790133.0,1786038.0,
262,"Yemen, Rep.",YEM,5542459.0,5646668.0,5753386.0,5860197.0,5973803.0,6097298.0,6228430.0,6368014.0,...,26984002.0,27753304.0,28516545.0,29274002.0,30034389.0,30790513.0,31546691.0,32284046.0,32981641.0,
263,South Africa,ZAF,16520441.0,16989464.0,17503133.0,18042215.0,18603097.0,19187194.0,19789771.0,20410677.0,...,53873616.0,54729551.0,55876504.0,56422274.0,56641209.0,57339635.0,58087055.0,58801927.0,59392255.0,
264,Zambia,ZMB,3119430.0,3219451.0,3323427.0,3431381.0,3542764.0,3658024.0,3777680.0,3901288.0,...,15234976.0,15737793.0,16248230.0,16767761.0,17298054.0,17835893.0,18380477.0,18927715.0,19473125.0,


In [130]:
# Including ISO names in the population data frame
df_pop = pd.merge(
    df_pop, countries[['name', 'alpha_3']],
    how='left', 
    on='alpha_3'
    )

Unnamed: 0,Country Name,alpha_3,1960,1961,1962,1963,1964,1965,1966,1967,...,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66,name
0,Aruba,ABW,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,59291.0,59522.0,...,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,-,Aruba
1,Africa Eastern and Southern,AFE,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,158313235.0,...,583650827.0,600008150.0,616377331.0,632746296.0,649756874.0,667242712.0,685112705.0,702976832.0,-,-
2,Afghanistan,AFG,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,9783147.0,10010030.0,...,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,-,Afghanistan
3,Africa Western and Central,AFW,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,110798486.0,113319950.0,...,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,-,-
4,Angola,AGO,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,5787044.0,5827503.0,...,27128337.0,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,-,Angola
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,947000.0,966000.0,994000.0,1022000.0,1050000.0,1078000.0,1106000.0,1135000.0,...,1812771.0,1788196.0,1777557.0,1791003.0,1797085.0,1788878.0,1790133.0,1786038.0,-,-
262,"Yemen, Rep.",YEM,5542459.0,5646668.0,5753386.0,5860197.0,5973803.0,6097298.0,6228430.0,6368014.0,...,27753304.0,28516545.0,29274002.0,30034389.0,30790513.0,31546691.0,32284046.0,32981641.0,-,Yemen
263,South Africa,ZAF,16520441.0,16989464.0,17503133.0,18042215.0,18603097.0,19187194.0,19789771.0,20410677.0,...,54729551.0,55876504.0,56422274.0,56641209.0,57339635.0,58087055.0,58801927.0,59392255.0,-,South Africa
264,Zambia,ZMB,3119430.0,3219451.0,3323427.0,3431381.0,3542764.0,3658024.0,3777680.0,3901288.0,...,15737793.0,16248230.0,16767761.0,17298054.0,17835893.0,18380477.0,18927715.0,19473125.0,-,Zambia


In [29]:
# Missing names in Patents
df_pop = df_pop.fillna('-')
miss_pop = miss_pop[miss_pop.name == '-']
miss_pop

Unnamed: 0,Country Name,alpha_3,1960,1961,1962,1963,1964,1965,1966,1967,...,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66,name
1,Africa Eastern and Southern,AFE,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,158313235.0,...,583650827.0,600008150.0,616377331.0,632746296.0,649756874.0,667242712.0,685112705.0,702976832.0,-,-
3,Africa Western and Central,AFW,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,110798486.0,113319950.0,...,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,-,-
7,Arab World,ARB,93359407.0,95760348.0,98268683.0,100892507.0,103618568.0,106444103.0,109394536.0,112499764.0,...,397922915.0,406501999.0,415077960.0,423664839.0,432545676.0,441467739.0,449228296.0,456520777.0,-,-
36,Central Europe and the Baltics,CEB,91401764.0,92232738.0,93009498.0,93840016.0,94715795.0,95440988.0,96146336.0,97043270.0,...,103496179.0,103257886.0,102994278.0,102740078.0,102538451.0,102398537.0,102180124.0,101430997.0,-,-
38,Channel Islands,CHI,109186.0,110225.0,111281.0,112410.0,113596.0,114832.0,116116.0,117430.0,...,160912.0,162190.0,163721.0,165215.0,167259.0,169410.0,171113.0,172683.0,-,-
49,Caribbean small states,CSS,4209141.0,4289429.0,4366420.0,4443544.0,4520592.0,4596245.0,4670465.0,4743053.0,...,7181044.0,7224602.0,7265272.0,7303634.0,7374650.0,7424102.0,7444768.0,7481877.0,-,-
61,East Asia & Pacific (excluding high income),EAP,896482332.0,896012881.0,907880207.0,931136006.0,954010411.0,977517019.0,1004358015.0,1030478704.0,...,2034317097.0,2049809214.0,2065223450.0,2080968782.0,2094573278.0,2106439246.0,2116424876.0,2123673456.0,-,-
62,Early-demographic dividend,EAR,979461502.0,1004319366.0,1029962253.0,1056327420.0,1083430197.0,1110603410.0,1137875812.0,1166092667.0,...,3122586392.0,3166642585.0,3210110979.0,3252529883.0,3294298709.0,3335463995.0,3375134276.0,3411889059.0,-,-
63,East Asia & Pacific,EAS,1043333636.0,1045203037.0,1059600211.0,1085398906.0,1110819272.0,1136927045.0,1166227679.0,1194567141.0,...,2278232287.0,2294507020.0,2310721864.0,2327134580.0,2341387076.0,2353862247.0,2363940425.0,2370204347.0,-,-
64,Europe & Central Asia (excluding high income),ECA,255726092.0,259951519.0,264183560.0,268409373.0,272634858.0,276765568.0,280229181.0,283724914.0,...,388842371.0,391695432.0,394321096.0,396482489.0,398076771.0,399592320.0,400811771.0,401575218.0,-,-


In [132]:
# Dropping missing names since they are consolidated data, not countries
df_pop = df_pop.dropna()