In [1]:
# Imports
import pandas as pd
import numpy as np
from unidecode import unidecode

In [2]:
# Data frame with the total number of patents granted per country/year
df_pat = pd.read_csv(r'total_patent_grants.csv', header=7).fillna(0)

df_pat

Unnamed: 0,Origin,Origin (Code),Office,Type,1980,1981,1982,1983,1984,1985,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Albania,AL,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,3.0,12.0,6.0,0.0,12.0,4.0,0.0,7.0
1,Algeria,DZ,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,41.0,0.0,0.0,79.0,64.0,85.0,35.0,34.0,49.0,104.0
2,Andorra,AD,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,11.0,12.0,14.0,18.0
3,Angola,AO,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0
4,Argentina,AR,Total,Total,1591.0,820.0,740.0,538.0,513.0,0.0,...,354.0,410.0,407.0,375.0,377.0,355.0,290.0,368.0,439.0,501.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Yemen,YE,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,14.0,11.0,8.0,2.0,0.0,24.0,9.0,24.0,6.0,1.0
170,Yugoslavia,YU,Total,Total,79.0,95.0,55.0,126.0,157.0,207.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,Zaire,ZR,Total,Total,3.0,8.0,6.0,6.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172,Zambia,ZM,Total,Total,0.0,0.0,2.0,0.0,0.0,10.0,...,2.0,2.0,6.0,4.0,3.0,12.0,5.0,159.0,4.0,0.0


In [3]:
# Data frame with the total population per country/year
df_pop = pd.read_csv(r'population.csv', header=2).drop(
    columns=['Indicator Name', 'Indicator Code']).rename(
        columns={'Country Name': 'Country'})

df_pop.head()

Unnamed: 0,Country,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,59291.0,59522.0,...,102880.0,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,
1,Africa Eastern and Southern,AFE,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,158313235.0,...,567891875.0,583650827.0,600008150.0,616377331.0,632746296.0,649756874.0,667242712.0,685112705.0,702976832.0,
2,Afghanistan,AFG,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,9783147.0,10010030.0,...,31541209.0,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,
3,Africa Western and Central,AFW,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,110798486.0,113319950.0,...,387204553.0,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0,478185907.0,
4,Angola,AGO,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,5787044.0,5827503.0,...,26147002.0,27128337.0,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0,34503774.0,


In [4]:
# Creating an standard name for the countries
countries = []
for w in [unidecode(x) for x in np.unique(df_pat.Origin.values)]:
    if w in [unidecode(x) for x in np.unique(df_pop.Country.values)]:
        countries.append([w, w])
    else:
         for k in np.unique(df_pop.Country.values):
            if k in w:
                countries.append([w, k])   
countries = pd.DataFrame(countries, columns=['Origin', 'Country'])        
countries

Unnamed: 0,Origin,Country
0,Albania,Albania
1,Algeria,Algeria
2,Andorra,Andorra
3,Angola,Angola
4,Argentina,Argentina
...,...,...
149,United States of America,United States
150,Uruguay,Uruguay
151,Uzbekistan,Uzbekistan
152,Zambia,Zambia


In [5]:
df_pat = pd.merge(df_pat, countries, on='Origin', how='outer').fillna('-')
#df_pat = df_pat.drop(columns=['Country_x', 'Country_y'])
df_pat

Unnamed: 0,Origin,Origin (Code),Office,Type,1980,1981,1982,1983,1984,1985,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Country
0,Albania,AL,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,3.0,12.0,6.0,0.0,12.0,4.0,0.0,7.0,Albania
1,Algeria,DZ,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,79.0,64.0,85.0,35.0,34.0,49.0,104.0,Algeria
2,Andorra,AD,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,11.0,12.0,14.0,18.0,Andorra
3,Angola,AO,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,Angola
4,Argentina,AR,Total,Total,1591.0,820.0,740.0,538.0,513.0,0.0,...,410.0,407.0,375.0,377.0,355.0,290.0,368.0,439.0,501.0,Argentina
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,Zaire,ZR,Total,Total,3.0,8.0,6.0,6.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
172,Zambia,ZM,Total,Total,0.0,0.0,2.0,0.0,0.0,10.0,...,2.0,6.0,4.0,3.0,12.0,5.0,159.0,4.0,0.0,Zambia
173,Zimbabwe,ZW,Total,Total,24.0,9.0,19.0,16.0,17.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zimbabwe
174,Cote d'Ivoire,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,Cote d'Ivoire


In [6]:
df_pat[df_pat.Country == '-']

Unnamed: 0,Origin,Origin (Code),Office,Type,1980,1981,1982,1983,1984,1985,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Country
9,Bahamas,BS,Total,Total,0.0,1.0,2.0,0.0,4.0,2.0,...,90.0,154.0,208.0,212.0,98.0,0.0,0.0,0.0,0.0,-
37,Congo,CG,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,17.0,51.0,17.0,102.0,69.0,17.0,68.0,170.0,-
39,Côte d'Ivoire,CI,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,374.0,323.0,221.0,443.0,528.0,289.0,375.0,425.0,-
43,Czechoslovakia,CS,Total,Total,6768.0,5447.0,6116.0,6200.0,6267.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
44,Czech Republic,CZ,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,879.0,976.0,1103.0,1310.0,1438.0,1391.0,1510.0,1346.0,1247.0,-
45,Democratic People's Republic of Korea,KP,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5259.0,4870.0,4724.0,4256.0,0.0,-
46,Democratic Republic of the Congo,CD,Total,Total,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,30.0,30.0,16.0,0.0,-
51,Egypt,EG,Total,Total,10.0,8.0,3.0,6.0,7.0,7.0,...,127.0,128.0,140.0,126.0,148.0,240.0,50.0,115.0,128.0,-
62,German Democratic Republic,DD,Total,Total,4455.0,5713.0,4125.0,10709.0,11402.0,11487.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
76,Iran (Islamic Republic of),IR,Total,Total,36.0,65.0,23.0,9.0,4.0,9.0,...,3414.0,2923.0,0.0,3155.0,3726.0,3057.0,2580.0,3444.0,0.0,-


In [13]:
df_pop[df_pop['Country'].str.contains('Tur')]

Unnamed: 0,Country,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
228,Turks and Caicos Islands,TCA,5604.0,5625.0,5633.0,5634.0,5642.0,5650.0,5652.0,5662.0,...,33594.0,34985.0,36538.0,38246.0,39844.0,41487.0,43080.0,44276.0,45114.0,
235,Turkmenistan,TKM,1602052.0,1658569.0,1716868.0,1776387.0,1836621.0,1897095.0,1957557.0,2018024.0,...,5560095.0,5663152.0,5766431.0,5868561.0,5968383.0,6065066.0,6158420.0,6250438.0,6341855.0,
244,Turkiye,TUR,27510980.0,28255002.0,29033647.0,29827877.0,30612821.0,31374536.0,32172785.0,33026490.0,...,76576117.0,78112073.0,79646178.0,81019394.0,82089826.0,82809304.0,83481684.0,84135428.0,84775404.0,
