In [22]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"



def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


In [23]:
class CacheQuery(object):
    def __init__(self, endpoint_url="https://query.wikidata.org/sparql"):
        self.endpoint_url = endpoint_url
        self.cache = {}
    def query(self, query):
        if query not in self.cache:
            user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
        # TODO adjust user agent; see https://w.wiki/CX6
            sparql = SPARQLWrapper(self.endpoint_url, agent=user_agent)
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            self.cache[query] = sparql.query().convert()
        return self.cache[query]
    

In [206]:
wiki = CacheQuery()

In [231]:
import munch
query = '''
SELECT DISTINCT ?country ?countryLabel ?population (GROUP_CONCAT(DISTINCT ?langCode;separator=", ") AS ?langCodes) 
WHERE
{
  ?country wdt:P31 wd:Q3624078;
           wdt:P1082 ?population.
  OPTIONAL {
  ?country wdt:P37 ?lang .
  ?lang wdt:P424 ?langCode .
    }
#   BIND("pt" as ?desiredLang).
#   FILTER(STRSTARTS(?langCode, ?desiredLang)).
#   ?lang wdt:P31/wdt:P279* wd:Q1860 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
GROUP BY ?country ?countryLabel ?population
ORDER BY DESC(?population)
'''

results = wiki.query(query)

countries = []
for x in results['results']['bindings']:
    a = munch.Munch()
    a.code = x['country']['value'].split('/')[-1]
    a.label = munch.Munch()
    a.label.en = x['countryLabel']['value']
    a.lang_codes = x['langCodes']['value']
    a.primary_lang = x['langCodes']['value'].split(',')[0].split('-')[0]
    a.population = x['population']['value']
    countries.append(a)
countries[0].primary_lang='zh'

In [298]:
# countries[0].label
countries[0].label.en = 'China'
countries[2].label.en = 'United States'
countries[15].label.en = 'Congo'
countries[65].label.en = 'the Netherlands'
countries[113].label.en = 'Denmark'
countries[117].label.en = 'Congo'

In [241]:
roberta_languages = ['fr', 'en', 'de', 'af', 'sq', 'am', 'ar', 'hy', 'az', 'eu', 'be', 'bn', 'bs', 'br', 'bg', 'my', 'ca', 'zh', 'hr', 'cs', 'da', 'nl', 'eo', 'et', 'fi', 'gl', 'ka', 'el', 'gu', 'ha', 'he', 'hi', 'hu', 'is', 'id', 'ga', 'it', 'ja', 'jv', 'kn', 'kk', 'km', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'lt', 'mk', 'mg', 'ms', 'ml', 'mr', 'mn', 'ne', 'no', 'ps', 'fa', 'pl', 'pt', 'pa', 'ro', 'ru', 'gd', 'sr', 'sd', 'si', 'sk', 'sl', 'so', 'es', 'su', 'sw', 'sv', 'ta', 'te', 'th', 'tr', 'uk', 'ur', 'ug', 'uz', 'vi', 'cy', 'xh', 'yi']
def extract_roberta(input_dict, verbose=False):
    ret = {}
    for x in roberta_languages:
        if x in input_dict:
            if x == 'zh':
                ret['zh'] = input_dict.get('zh-cn', input_dict.get('zh'))
            else:
                ret[x] = input_dict[x]
        if x not in input_dict:
            keys = [a for a in input_dict.keys() if a.startswith(x)]
            if keys:
                ret[x] = input_dict[keys[0]]
            else:
                pass
                if verbose:
                    print(x)
    return ret

In [235]:
print(roberta_languages)

['fr', 'en', 'de', 'af', 'sq', 'am', 'ar', 'hy', 'az', 'eu', 'be', 'bn', 'bs', 'br', 'bg', 'my', 'ca', 'zh', 'hr', 'cs', 'da', 'nl', 'eo', 'et', 'fi', 'gl', 'ka', 'el', 'gu', 'ha', 'he', 'hi', 'hu', 'is', 'id', 'ga', 'it', 'ja', 'jv', 'kn', 'kk', 'km', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'lt', 'mk', 'mg', 'ms', 'ml', 'mr', 'mn', 'ne', 'no', 'ps', 'fa', 'pl', 'pt', 'pa', 'ro', 'ru', 'gd', 'sr', 'sd', 'si', 'sk', 'sl', 'so', 'es', 'su', 'sw', 'sv', 'ta', 'te', 'th', 'tr', 'uk', 'ur', 'ug', 'uz', 'vi', 'cy', 'xh', 'yi']


In [294]:
query = '''
SELECT DISTINCT ?country ?countryLabel ?lang 
{
  ?country rdfs:label ?countryLabel .
  FILTER(?country in (wd:%s)).
  BIND(lang(?countryLabel) as ?lang)
}
'''
for country in countries:
    print(country.label.en)
    results = wiki.query(query % country.code)
    langval = {}
    for x in results["results"]["bindings"]:
#         print(x)
        lang = x['lang']['value']
        val = x['countryLabel']['value']
        langval[lang]= val
    roberta_labels = extract_roberta(langval)
    for x, v in roberta_labels.items():
        country.label[x] = v

    

People's Republic of China
India
United States of America
Indonesia
Brazil
Pakistan
Nigeria
Bangladesh
Russia
Mexico
Japan
Ethiopia
Philippines
Egypt
Vietnam
Democratic Republic of the Congo
Germany
Turkey
Iran
France
United Kingdom
Thailand
Italy
South Africa
Tanzania
Myanmar
South Korea
Colombia
Kenya
Spain
Argentina
Uganda
Ukraine
Algeria
Sudan
Poland
Iraq
Canada
Morocco
Afghanistan
Saudi Arabia
Uzbekistan
Malaysia
Angola
Mozambique
Nepal
Peru
Venezuela
Yemen
Ghana
Madagascar
North Korea
Australia
Ivory Coast
Cameroon
Niger
Sri Lanka
Romania
Burkina Faso
Malawi
Mali
Kazakhstan
Syria
Chile
Guatemala
Kingdom of the Netherlands
Zambia
Ecuador
Zimbabwe
Cambodia
Senegal
Chad
Guinea
South Sudan
Rwanda
Tunisia
Cuba
Belgium
Benin
Bolivia
Somalia
Haiti
Burundi
Greece
Czech Republic
Portugal
Jordan
Dominican Republic
Sweden
Azerbaijan
Hungary
Belarus
United Arab Emirates
Honduras
Israel
Tajikistan
Austria
Switzerland
Papua New Guinea
Togo
Sierra Leone
Serbia
Bulgaria
Laos
Paraguay
Libya
El Sa

In [299]:
query = '''
SELECT DISTINCT ?city ?cityLabel ?population
{
  ?city wdt:P31/wdt:P279* wd:Q515 .
  ?city wdt:P1082 ?population .
  ?city wdt:P17 ?country .
  ?city rdfs:label ?cityLabel .
  FILTER(lang(?cityLabel)="en").
  FILTER(?country in (wd:%s)).
}
ORDER BY DESC(?population) LIMIT 100
'''
for country in countries:
    country.cities = []
    results = wiki.query(query % country.code)
    print(country.label.en)
    for x in results["results"]["bindings"]:
        a = munch.Munch()
        a.code = x['city']['value'].split('/')[-1]
        a.label = munch.Munch()
        a.label.en = x['cityLabel']['value']
        a.population = x['population']['value']
        country.cities.append(a)
    

China
India
United States
Indonesia
Brazil
Pakistan
Nigeria
Bangladesh
Russia
Mexico
Japan
Ethiopia
Philippines
Egypt
Vietnam
Congo
Germany
Turkey
Iran
France
United Kingdom
Thailand
Italy
South Africa
Tanzania
Myanmar
South Korea
Colombia
Kenya
Spain
Argentina
Uganda
Ukraine
Algeria
Sudan
Poland
Iraq
Canada
Morocco
Afghanistan
Saudi Arabia
Uzbekistan
Malaysia
Angola
Mozambique
Nepal
Peru
Venezuela
Yemen
Ghana
Madagascar
North Korea
Australia
Ivory Coast
Cameroon
Niger
Sri Lanka
Romania
Burkina Faso
Malawi
Mali
Kazakhstan
Syria
Chile
Guatemala
the Netherlands
Zambia
Ecuador
Zimbabwe
Cambodia
Senegal
Chad
Guinea
South Sudan
Rwanda
Tunisia
Cuba
Belgium
Benin
Bolivia
Somalia
Haiti
Burundi
Greece
Czech Republic
Portugal
Jordan
Dominican Republic
Sweden
Azerbaijan
Hungary
Belarus
United Arab Emirates
Honduras
Israel
Tajikistan
Austria
Switzerland
Papua New Guinea
Togo
Sierra Leone
Serbia
Bulgaria
Laos
Paraguay
Libya
El Salvador
Nicaragua
Kyrgyzstan
Lebanon
Singapore
Turkmenistan
Denmark
Den

In [300]:
# countries[0].cities[0].label

In [301]:
query = '''
SELECT DISTINCT ?city ?cityLabel ?lang 
{
  ?city rdfs:label ?cityLabel .
  FILTER(?city in (wd:%s)).
  BIND(lang(?cityLabel) as ?lang)
}
'''
for country in countries:
    print(country.label.en)
    for city in country.cities:
        results = wiki.query(query % city.code)
        langval = {}
        for x in results["results"]["bindings"]:
            lang = x['lang']['value']
            val = x['cityLabel']['value']
            langval[lang]= val
        roberta_labels = extract_roberta(langval)
        for x, v in roberta_labels.items():
            city.label[x] = v
#         for x in roberta_languages:
#             if x in langval:
#                 if x == 'zh':
#                     city.label['zh'] = langval.get('zh-cn', langval.get('zh'))
#                 city.label[x] = langval[x]
#             if x not in langval:
#                 keys = [a for a in langval.keys() if a.startswith(x)]
#                 if keys:
#                     city.label[keys[0]] = langval[keys[0]]

    

China
India
United States
Indonesia
Brazil
Pakistan
Nigeria
Bangladesh
Russia
Mexico
Japan
Ethiopia
Philippines
Egypt
Vietnam
Congo
Germany
Turkey
Iran
France
United Kingdom
Thailand
Italy
South Africa
Tanzania
Myanmar
South Korea
Colombia
Kenya
Spain
Argentina
Uganda
Ukraine
Algeria
Sudan
Poland
Iraq
Canada
Morocco
Afghanistan
Saudi Arabia
Uzbekistan
Malaysia
Angola
Mozambique
Nepal
Peru
Venezuela
Yemen
Ghana
Madagascar
North Korea
Australia
Ivory Coast
Cameroon
Niger
Sri Lanka
Romania
Burkina Faso
Malawi
Mali
Kazakhstan
Syria
Chile
Guatemala
the Netherlands
Zambia
Ecuador
Zimbabwe
Cambodia
Senegal
Chad
Guinea
South Sudan
Rwanda
Tunisia
Cuba
Belgium
Benin
Bolivia
Somalia
Haiti
Burundi
Greece
Czech Republic
Portugal
Jordan
Dominican Republic
Sweden
Azerbaijan
Hungary
Belarus
United Arab Emirates
Honduras
Israel
Tajikistan
Austria
Switzerland
Papua New Guinea
Togo
Sierra Leone
Serbia
Bulgaria
Laos
Paraguay
Libya
El Salvador
Nicaragua
Kyrgyzstan
Lebanon
Singapore
Turkmenistan
Denmark
Den

nationalities

In [61]:
results

{'head': {'vars': ['nat', 'country']}, 'results': {'bindings': []}}

In [65]:
query = '''
SELECT DISTINCT ?nat ?country
{
  ?country wdt:P1549 ?nat .
  FILTER(lang(?nat)="en").
  FILTER(?country in (%s)).
}'''

nationality = {}
for country in countries:
    print(country.label.en)
    nat = query % ('wd:%s' % country.code)
    results = wiki.query(nat)
    for x in results["results"]["bindings"]:
        country.nationality = x['nat']['value']

China
India
United States
Indonesia
Brazil
Pakistan
Nigeria
Bangladesh
Russia
Mexico
Japan
Ethiopia
Philippines
Egypt
Vietnam
Congo
Germany
Turkey
Iran
France
United Kingdom
Thailand
Italy
South Africa
Tanzania
Myanmar
South Korea
Colombia
Kenya
Spain
Argentina
Uganda
Ukraine
Algeria
Sudan
Poland
Iraq
Canada
Morocco
Afghanistan
Saudi Arabia
Uzbekistan
Malaysia
Angola
Mozambique
Nepal
Peru
Venezuela
Yemen
Ghana
Madagascar
North Korea
Australia
Ivory Coast
Cameroon
Niger
Sri Lanka
Romania
Burkina Faso
Malawi
Mali
Kazakhstan
Syria
Chile
Guatemala
the Netherlands
Zambia
Ecuador
Zimbabwe
Cambodia
Senegal
Chad
Guinea
South Sudan
Rwanda
Tunisia
Cuba
Belgium
Benin
Bolivia
Somalia
Haiti
Burundi
Greece
Czech Republic
Portugal
Jordan
Dominican Republic
Sweden
Azerbaijan
Hungary
Belarus
United Arab Emirates
Honduras
Israel
Tajikistan
Austria
Switzerland
Papua New Guinea
Togo
Sierra Leone
Serbia
Bulgaria
Laos
Paraguay
Libya
El Salvador
Nicaragua
Kyrgyzstan
Lebanon
Singapore
Turkmenistan
Denmark
Den

In [91]:
[(i, x.label.en) for i, x in enumerate(countries) if 'nationality' not in x]

[(20, 'United Kingdom'),
 (32, 'Ukraine'),
 (39, 'Afghanistan'),
 (40, 'Saudi Arabia'),
 (174, 'Orange Free State'),
 (184, 'Orange Free State')]

In [96]:
countries[20].nationality = 'English'
countries[32].nationality = 'Ukrainian'
countries[39].nationality = 'Afghan'
countries[40].nationality = 'Saudi'
countries[174].nationality = 'Orange'
countries[184].nationality = 'Orange'
countries[94].nationality = 'Israeli'

In [2]:
import pickle

In [250]:
pickle.dump(countries, open('/home/marcotcr/tmp/country_stuff.pkl', 'wb'))

First names

In [436]:
query = '''
SELECT ?name ?nameLabel ?count
WITH {
  SELECT ?name ?nameLabel (count(?person) AS ?count) WHERE {
    ?person wdt:P735 ?name .  
    ?person wdt:P27 %s . 
    ?person wdt:P21 %s .
  }
  GROUP BY ?name ?nameLabel
  ORDER BY DESC(?count)
  LIMIT 100
} AS %%results
WHERE {
  INCLUDE %%results
  SERVICE wikibase:label { bd:serviceParam wikibase:language "%s,en". }
}
ORDER BY DESC(?count)
'''
import collections
male_names = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
female_names = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
male_by_country = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
female_by_country= collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
for country in countries:
    print(country.label.en)
    print(country.primary_lang)
    male = query % ('wd:%s' % country.code, 'wd:Q6581097', country.primary_lang)
    results = wiki.query(male)
    for x in results["results"]["bindings"]:
#         print(x)
        male_names[country.primary_lang][x['nameLabel']['value']] += int(x['count']['value'])
        male_by_country[country.label.en.replace(' ', '_')][x['nameLabel']['value']] += int(x['count']['value'])
    female = query % ('wd:%s' % country.code, 'wd:Q6581072', country.primary_lang)
    results = wiki.query(female)
    for x in results["results"]["bindings"]:
        female_names[country.primary_lang][x['nameLabel']['value']] += int(x['count']['value'])
        female_by_country[country.label.en.replace(' ', '_')][x['nameLabel']['value']] += int(x['count']['value'])

China
zh
India
hi
United States
en
Indonesia
id
Brazil
pt
Pakistan
ur
Nigeria
en
Bangladesh
bn
Russia
ru
Mexico
es
Japan
ja
Ethiopia
am
Philippines
en
Egypt
ar
Vietnam
vi
Congo
fr
Germany
de
Turkey
tr
Iran
fa
France
fr
United Kingdom
en
Thailand
th
Italy
it
South Africa
en
Tanzania
en
Myanmar
my
South Korea
ko
Colombia
es
Kenya
en
Spain
es
Argentina
es
Uganda
en
Ukraine
uk
Algeria
ar
Sudan
en
Poland
pl
Iraq
ar
Canada
fr
Morocco
ar
Afghanistan
uz
Saudi Arabia
ar
Uzbekistan
uz
Malaysia
ms
Angola
pt
Mozambique
pt
Nepal
ne
Peru
es
Venezuela
es
Yemen
ar
Ghana
en
Madagascar
fr
North Korea
ko
Australia
en
Ivory Coast
fr
Cameroon
fr
Niger
fr
Sri Lanka
ta
Romania
ro
Burkina Faso
fr
Malawi
en
Mali
fr
Kazakhstan
ru
Syria
ar
Chile
es
Guatemala
es
the Netherlands
nl
Zambia
en
Ecuador
es
Zimbabwe
en
Cambodia
km
Senegal
fr
Chad
fr
Guinea
fr
South Sudan
en
Rwanda
fr
Tunisia
ar
Cuba
es
Belgium
fr
Benin
fr
Bolivia
es
Somalia
so
Haiti
fr
Burundi
fr
Greece
el
Czech Republic
cs
Portugal
pt
Jordan
ar
Domini

First names in english

In [27]:
query = '''
SELECT ?name ?nameLabel ?count
WITH {
  SELECT ?name ?nameLabel (count(?person) AS ?count) WHERE {
    ?person wdt:P735 ?name .  
    ?person wdt:P27 %s . 
    ?person wdt:P21 %s .
  }
  GROUP BY ?name ?nameLabel
  ORDER BY DESC(?count)
  LIMIT 100
} AS %%results
WHERE {
  INCLUDE %%results
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY DESC(?count)
'''
import collections
male_by_country_e = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
female_by_country_e= collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
for country in countries:
    print(country.label.en)
    male = query % ('wd:%s' % country.code, 'wd:Q6581097')
    results = wiki.query(male)
    for x in results["results"]["bindings"]:
#         print(x)
        male_by_country_e[country.label.en.replace(' ', '_')][x['nameLabel']['value']] += int(x['count']['value'])
    female = query % ('wd:%s' % country.code, 'wd:Q6581072')
    results = wiki.query(female)
    for x in results["results"]["bindings"]:
        female_by_country_e[country.label.en.replace(' ', '_')][x['nameLabel']['value']] += int(x['count']['value'])

China
India
United States
Indonesia
Brazil
Pakistan
Nigeria
Bangladesh
Russia
Mexico
Japan
Ethiopia
Philippines
Egypt
Vietnam
Congo
Germany
Turkey
Iran
France
United Kingdom
Thailand
Italy
South Africa
Tanzania
Myanmar
South Korea
Colombia
Kenya
Spain
Argentina
Uganda
Ukraine
Algeria
Sudan
Poland
Iraq
Canada
Morocco
Afghanistan
Saudi Arabia
Uzbekistan
Malaysia
Angola
Mozambique
Nepal
Peru
Venezuela
Yemen
Ghana
Madagascar
North Korea
Australia
Ivory Coast
Cameroon
Niger
Sri Lanka
Romania
Burkina Faso
Malawi
Mali
Kazakhstan
Syria
Chile
Guatemala
the Netherlands
Zambia
Ecuador
Zimbabwe
Cambodia
Senegal
Chad
Guinea
South Sudan
Rwanda
Tunisia
Cuba
Belgium
Benin
Bolivia
Somalia
Haiti
Burundi
Greece
Czech Republic
Portugal
Jordan
Dominican Republic
Sweden
Azerbaijan
Hungary
Belarus
United Arab Emirates
Honduras
Israel
Tajikistan
Austria
Switzerland
Papua New Guinea
Togo
Sierra Leone
Serbia
Bulgaria
Laos
Paraguay
Libya
El Salvador
Nicaragua
Kyrgyzstan
Lebanon
Singapore
Turkmenistan
Denmark
Den

Last names

In [435]:
query = '''
SELECT ?name ?nameLabel ?count
WITH {
  SELECT ?name ?nameLabel (count(?person) AS ?count) WHERE {
    ?person wdt:P734 ?name .  
    ?person wdt:P27 %s . 
  }
  GROUP BY ?name ?nameLabel
  ORDER BY DESC(?count)
  LIMIT 100
} AS %%results
WHERE {
  INCLUDE %%results
  SERVICE wikibase:label { bd:serviceParam wikibase:language "%s,en". }
}
ORDER BY DESC(?count)
'''
last_names = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
last_by_country = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
for country in countries:
    print(country.label.en)
    print(country.primary_lang)
    last = query % ('wd:%s' % country.code, country.primary_lang)
    results = wiki.query(last)
    for x in results["results"]["bindings"]:
#         print(x)
        last_names[country.primary_lang][x['nameLabel']['value']] += int(x['count']['value'])
        last_by_country[country.label.en.replace(' ', '_')][x['nameLabel']['value']] += int(x['count']['value'])

China
zh
India
hi
United States
en
Indonesia
id
Brazil
pt
Pakistan
ur
Nigeria
en
Bangladesh
bn
Russia
ru
Mexico
es
Japan
ja
Ethiopia
am
Philippines
en
Egypt
ar
Vietnam
vi
Congo
fr
Germany
de
Turkey
tr
Iran
fa
France
fr
United Kingdom
en
Thailand
th
Italy
it
South Africa
en
Tanzania
en
Myanmar
my
South Korea
ko
Colombia
es
Kenya
en
Spain
es
Argentina
es
Uganda
en
Ukraine
uk
Algeria
ar
Sudan
en
Poland
pl
Iraq
ar
Canada
fr
Morocco
ar
Afghanistan
uz
Saudi Arabia
ar
Uzbekistan
uz
Malaysia
ms
Angola
pt
Mozambique
pt
Nepal
ne
Peru
es
Venezuela
es
Yemen
ar
Ghana
en
Madagascar
fr
North Korea
ko
Australia
en
Ivory Coast
fr
Cameroon
fr
Niger
fr
Sri Lanka
ta
Romania
ro
Burkina Faso
fr
Malawi
en
Mali
fr
Kazakhstan
ru
Syria
ar
Chile
es
Guatemala
es
the Netherlands
nl
Zambia
en
Ecuador
es
Zimbabwe
en
Cambodia
km
Senegal
fr
Chad
fr
Guinea
fr
South Sudan
en
Rwanda
fr
Tunisia
ar
Cuba
es
Belgium
fr
Benin
fr
Bolivia
es
Somalia
so
Haiti
fr
Burundi
fr
Greece
el
Czech Republic
cs
Portugal
pt
Jordan
ar
Domini

Last names in english

In [37]:
query = '''
SELECT ?name ?nameLabel ?count
WITH {
  SELECT ?name ?nameLabel (count(?person) AS ?count) WHERE {
    ?person wdt:P734 ?name .  
    ?person wdt:P27 %s . 
  }
  GROUP BY ?name ?nameLabel
  ORDER BY DESC(?count)
  LIMIT 100
} AS %%results
WHERE {
  INCLUDE %%results
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY DESC(?count)
'''
last_by_country_e = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
for country in countries:
    print(country.label.en)
    last = query % ('wd:%s' % country.code)
    results = wiki.query(last)
    for x in results["results"]["bindings"]:
#         print(x)
        last_by_country_e[country.label.en.replace(' ', '_')][x['nameLabel']['value']] += int(x['count']['value'])

China
India
United States
Indonesia
Brazil
Pakistan
Nigeria
Bangladesh
Russia
Mexico
Japan
Ethiopia
Philippines
Egypt
Vietnam
Congo
Germany
Turkey
Iran
France
United Kingdom
Thailand
Italy
South Africa
Tanzania
Myanmar
South Korea
Colombia
Kenya
Spain
Argentina
Uganda
Ukraine
Algeria
Sudan
Poland
Iraq
Canada
Morocco
Afghanistan
Saudi Arabia
Uzbekistan
Malaysia
Angola
Mozambique
Nepal
Peru
Venezuela
Yemen
Ghana
Madagascar
North Korea
Australia
Ivory Coast
Cameroon
Niger
Sri Lanka
Romania
Burkina Faso
Malawi
Mali
Kazakhstan
Syria
Chile
Guatemala
the Netherlands
Zambia
Ecuador
Zimbabwe
Cambodia
Senegal
Chad
Guinea
South Sudan
Rwanda
Tunisia
Cuba
Belgium
Benin
Bolivia
Somalia
Haiti
Burundi
Greece
Czech Republic
Portugal
Jordan
Dominican Republic
Sweden
Azerbaijan
Hungary
Belarus
United Arab Emirates
Honduras
Israel
Tajikistan
Austria
Switzerland
Papua New Guinea
Togo
Sierra Leone
Serbia
Bulgaria
Laos
Paraguay
Libya
El Salvador
Nicaragua
Kyrgyzstan
Lebanon
Singapore
Turkmenistan
Denmark
Den

In [437]:
mnames = extract_roberta(male_names);
fnames = extract_roberta(female_names);
lnames = extract_roberta(last_names)


In [438]:
# male_by_country['Brazil']

In [38]:
def valid_name(name):
    if ('.' in name and len(name)<3) or any([x.isdigit() for x in name]):
        print('F: %s' % name)
        return False
    return True

In [440]:
for country in male_by_country:
    male_by_country[country] =  [a[0] for a in sorted(male_by_country[country].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]
for country in female_by_country:
    female_by_country[country] =  [a[0] for a in sorted(female_by_country[country].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]
for country in last_by_country:
    last_by_country[country] =  [a[0] for a in sorted(last_by_country[country].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]

F: K.
F: P.
F: S.
F: M.
F: A.
F: R.
F: V.
F: T.
F: N.
F: C.
F: B.
F: G.
F: D.
F: J.
F: H.
F: E.
F: O.
F: L.
F: U.
F: Y.
F: J.
F: A.
F: M.
F: A.
F: M.
F: Q28790124
F: A.
F: S.
F: B.
F: C.
F: A.
F: Q66937416
F: K.
F: Q76885428
F: Q66936752
F: Q66937269
F: Q57079574
F: R.
F: S.
F: Q89128080
F: A.
F: M.
F: Q60691248
F: H.
F: Q16628866
F: K.
F: S.
F: M.
F: V.
F: G.
F: P.
F: T.
F: A.
F: D.
F: B.
F: C.
F: E.
F: S.
F: M.
F: K.
F: A.
F: P.
F: C.
F: T.
F: D.
F: H.
F: V.
F: R.
F: J.
F: G.
F: W.
F: N.
F: E.
F: B.
F: U.
F: Y.
F: L.
F: I.
F: J.
F: Q65298329
F: Н.
F: S.
F: B.
F: S.
F: V.
F: K.
F: Q4247853
F: S.
F: E.
F: A.
F: Z.
F: K.
F: A.
F: Q45763546
F: Q65767684
F: Y.
F: E.
F: S.
F: J.
F: C.
F: R.
F: A.
F: E.
F: Q64785309
F: Q64785142
F: A.
F: S.
F: C.
F: B.
F: D.
F: M.
F: S.
F: P.
F: B.
F: K.
F: R.
F: T.
F: V.
F: A.
F: C.
F: N.
F: Q25863245
F: Q74550176
F: Q69838313
F: N.
F: Q63115200
F: S.
F: Q45321546
F: Q94579019
F: Q66361368
F: Q56425071
F: F.
F: Q58009691
F: Q70611432
F: Q50940212
F: Q74421

In [39]:
for country in male_by_country_e:
    male_by_country_e[country] =  [a[0] for a in sorted(male_by_country_e[country].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]
for country in female_by_country_e:
    female_by_country_e[country] =  [a[0] for a in sorted(female_by_country_e[country].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]
for country in last_by_country_e:
    last_by_country_e[country] =  [a[0] for a in sorted(last_by_country_e[country].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]


F: K.
F: S.
F: M.
F: P.
F: A.
F: R.
F: V.
F: T.
F: C.
F: N.
F: B.
F: G.
F: D.
F: J.
F: H.
F: E.
F: O.
F: L.
F: U.
F: Y.
F: J.
F: A.
F: M.
F: A.
F: M.
F: Q28790124
F: A.
F: M.
F: S.
F: B.
F: K.
F: J.
F: Q105688322
F: Q66936752
F: Q66937269
F: Q104597043
F: R.
F: Q89128080
F: S.
F: A.
F: Q60691248
F: Q22977313
F: Q28790124
F: Q22977313
F: H.
F: Q22977313
F: Q16628866
F: K.
F: S.
F: M.
F: V.
F: G.
F: P.
F: T.
F: A.
F: D.
F: B.
F: C.
F: Q22977313
F: Q28790124
F: H.
F: E.
F: Q69508563
F: Q69509056
F: Q69513144
F: Q69507241
F: Q69507811
F: Q69509665
F: Q69507851
F: Q69509707
F: Q69508592
F: Q69512349
F: Q69511774
F: Q69512359
F: Q69513494
F: Q69510784
F: Q69511120
F: Q69508466
F: Q69509301
F: Q69512835
F: Q69511013
F: Q69509840
F: Q69510545
F: Q91247505
F: Q65178181
F: S.
F: M.
F: K.
F: A.
F: P.
F: C.
F: T.
F: D.
F: H.
F: V.
F: R.
F: J.
F: G.
F: W.
F: N.
F: E.
F: B.
F: U.
F: Y.
F: L.
F: I.
F: Q65557977
F: Q65557984
F: Q22977313
F: Q98663505
F: Q55098138
F: Q98276047
F: Q101985549
F: J.
F: Q1

In [441]:
for lang in mnames:
    mnames[lang] =  [a[0] for a in sorted(mnames[lang].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]
for lang in fnames:
    fnames[lang] =  [a[0] for a in sorted(fnames[lang].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]
for lang in lnames:
    lnames[lang] =  [a[0] for a in sorted(lnames[lang].items(), key=lambda x:-x[1]) if valid_name(a[0])][:100]

F: J.
F: Y.
F: E.
F: D.
F: J.
F: S.
F: A.
F: E.
F: V.
F: K.
F: B.
F: C.
F: Q60691248
F: Z.
F: Q65767684
F: A.
F: S.
F: B.
F: C.
F: M.
F: R.
F: K.
F: P.
F: S.
F: M.
F: A.
F: R.
F: V.
F: T.
F: N.
F: C.
F: B.
F: G.
F: D.
F: J.
F: H.
F: E.
F: O.
F: L.
F: U.
F: Y.
F: A.
F: M.
F: Q45763546
F: K.
F: S.
F: M.
F: V.
F: G.
F: P.
F: T.
F: A.
F: D.
F: B.
F: C.
F: S.
F: E.
F: Q64785309
F: Q64785142
F: Q65298329
F: Н.
F: S.
F: M.
F: K.
F: A.
F: P.
F: C.
F: T.
F: D.
F: H.
F: V.
F: R.
F: J.
F: G.
F: W.
F: N.
F: E.
F: B.
F: U.
F: Y.
F: L.
F: I.
F: Q57079574
F: R.
F: S.
F: Q89128080
F: A.
F: M.
F: Q28790124
F: H.
F: Q16628866
F: A.
F: Q66937416
F: K.
F: Q76885428
F: Q66936752
F: Q66937269
F: Q94579019
F: Q47092720
F: Q51272256
F: Q88229179
F: J.
F: Q69886076
F: Q94368759
F: Z.
F: M.
F: Q64657910
F: C.
F: Q65189565
F: Q48800342
F: Q74550176
F: Q69838313
F: N.
F: Q63115200
F: S.
F: Q55979066
F: Q62972673
F: Q50940212
F: Q74421605
F: Q51545569
F: Q45321546
F: Q52328234
F: Q64692658
F: Q65767947
F: Q5531270

In [442]:
male_by_country = munch.Munch(dict(male_by_country))
female_by_country = munch.Munch(dict(female_by_country))
last_by_country = munch.Munch(dict(last_by_country))

In [42]:
male_by_country_e = munch.Munch(dict(male_by_country_e))
female_by_country_e = munch.Munch(dict(female_by_country_e))
last_by_country_e = munch.Munch(dict(last_by_country_e))

In [443]:
data = munch.Munch({'countries': countries,
                   'mnames': mnames,
                   'fnames': fnames,
                    'lnames': lnames,
                   'male_by_country': male_by_country,
                   'last_by_country': last_by_country,
                   'female_by_country': female_by_country
                   'male_by_country_english': male_by_country_e,
                   'last_by_country_english': last_by_country_e,
                   'female_by_country_english': female_by_country_e
                   })

In [104]:
import pickle
pickle.dump(data, open('/home/marcotcr/tmp/country_stuff.pkl', 'wb'))
pickle.dump(wiki, open('/home/marcotcr/tmp/wikidata_queries.pkl', 'wb'))

In [24]:
data = pickle.load(open('/home/marcotcr/tmp/country_stuff.pkl', 'rb'))
wiki = pickle.load(open('/home/marcotcr/tmp/wikidata_queries.pkl', 'rb'))

In [107]:
a = pickle.load(open('/home/marcotcr/work/checklist/checklist/data/wikidata.pkl', 'rb'))

In [115]:
for k in data.countries[0]:
    print(k, a.countries[0][k] == data.countries[0][k])

code True
label True
lang_codes True
primary_lang True
population True
cities True


KeyError: 'nationality'