In [1]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from time import sleep
from random import randint
import pandas as pd
from collections import Counter
import numpy

# Module-level caches shared by the geocoding helpers defined in this notebook.
# Maps an address query string to the raw geocoder result for queries that resolved.
solved_queries_places = {}
# Query strings for which the geocoder returned no result; used to avoid retrying them.
unsolved_queries_places = []
# Maps a (lat, lon) pair to the country name obtained by reverse geocoding.
solved_queries_lat_lon = {}

# Nominatim client used for both forward and reverse lookups.
# NOTE(review): a personal e-mail address is hardcoded as the user agent — consider
# moving it to configuration / an environment variable before sharing the notebook.
geolocator = Nominatim(user_agent="matemmatem2000@gmail.com")
# Forward geocoding wrapped in a rate limiter with a minimum delay of one second.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [4]:
def get_raw_address(ad):
    """Geocode a single address fragment and record the outcome.

    Waits a random 1-2 seconds, then queries the module-level ``geocode``
    callable. A successful lookup is stored in ``solved_queries_places``
    under ``ad``; a failed one appends ``ad`` to ``unsolved_queries_places``.

    Args:
        ad: The address fragment to look up.

    Returns:
        The ``raw`` attribute of the geocoder result, or ``None`` when the
        geocoder returned nothing.
    """
    print("\t part:", ad)
    # Random pause between 1.00 and 2.00 seconds before each request.
    pause_seconds = randint(100, 200) / 100
    sleep(pause_seconds)
    result = geocode(ad)
    if result is None:
        # Remember the miss so callers can skip this query next time.
        unsolved_queries_places.append(ad)
        return None
    solved_queries_places[ad] = result.raw
    return result.raw

def get_addresses_list(add):
    """Resolve every comma-separated fragment of a place description.

    The input is split on commas; each fragment is stripped of surrounding
    whitespace, empty fragments are dropped, and duplicates are removed while
    keeping first-seen order (so results are reproducible between runs).
    Each remaining fragment is served from ``solved_queries_places`` when
    cached, skipped when listed in ``unsolved_queries_places``, and otherwise
    looked up through ``get_raw_address``.

    Args:
        add: A string, or a ``numpy.ndarray`` / list / tuple of strings that
            is joined with commas first. Non-string entries (e.g. NaN from
            missing cells) are ignored; a non-string scalar such as NaN or
            ``None`` is treated as an empty description.

    Returns:
        A list of raw geocoder results, one per fragment that resolved.
        Empty when nothing resolved.
    """
    if isinstance(add, str):
        text = add
    elif isinstance(add, (numpy.ndarray, list, tuple)):
        # Missing cells show up as float NaN; joining them would raise TypeError.
        text = ",".join(item for item in add if isinstance(item, str))
    else:
        # NaN / None / other scalars carry no address information.
        text = ""
    print("   place: ",text)
    found_addresses = []
    # dict.fromkeys de-duplicates like a set but keeps a deterministic order.
    fragments = dict.fromkeys(part.strip() for part in text.split(','))
    for ad in fragments:
        if not ad:
            # Never send an empty query to the geocoder.
            continue
        if ad in solved_queries_places:
            found_addresses.append(solved_queries_places[ad])
        elif ad not in unsolved_queries_places:
            loc = get_raw_address(ad)
            if loc is not None:
                found_addresses.append(loc)
    return found_addresses

def get_countries_list(found_addresses):
    """Reverse-geocode each address to a country name.

    Results are cached in ``solved_queries_lat_lon`` keyed by the
    ``(lat, lon)`` pair, so each coordinate that yields a country is queried
    only once. Every uncached lookup is preceded by a random 1-2 second pause.

    Args:
        found_addresses: Iterable of mappings that each provide ``'lat'`` and
            ``'lon'`` entries.

    Returns:
        A list with one entry per input address: the country name, or
        ``None`` when the reverse lookup returned nothing or its result had
        no country.
    """
    countries = []
    for address in found_addresses:
        latid, longit = address['lat'], address['lon']
        key = (latid, longit)
        if key in solved_queries_lat_lon:
            countries.append(solved_queries_lat_lon[key])
            continue
        sleep(randint(1*100,2*100)/100)
        # f-string instead of "+" so non-string coordinates do not raise TypeError.
        location = geolocator.reverse(f"{latid},{longit}", language='en')
        new_country = None
        if location is not None:
            # A reverse result may lack 'address' or 'country'; treat that
            # as "unknown" rather than failing with KeyError.
            new_country = (location.raw.get('address') or {}).get('country')
        countries.append(new_country)
        if new_country is not None:
            solved_queries_lat_lon[key] = new_country
    return countries

def choose_country(txt):
    """Pick the most frequent country among the geocoded parts of ``txt``.

    ``txt`` is resolved with ``get_addresses_list`` and the resulting
    addresses are mapped to countries with ``get_countries_list``. ``None``
    entries (addresses whose country could not be determined) are ignored so
    that they can never be selected as the winning "country".

    Args:
        txt: Place description accepted by ``get_addresses_list``.

    Returns:
        The most common country name, or the placeholder string ``"FIXME"``
        when no country could be determined.
    """
    addresses_list = get_addresses_list(txt)
    countries = [
        country
        for country in (get_countries_list(addresses_list) or [])
        if country is not None
    ]
    if not countries:
        return "FIXME"
    return Counter(countries).most_common(1)[0][0]

def find_countries_by_university(df):
    """Add a 'Countries' column derived from each row's 'Where' value.

    Rows are processed in order; for every row the index label and the
    selected country are printed as progress output.

    Args:
        df: DataFrame with a 'Where' column. It is modified in place.

    Returns:
        The same DataFrame, now carrying the 'Countries' column.
    """
    def _resolve(row_label, record):
        # One row: announce it, pick its country, report the choice.
        print("Starting: ", row_label)
        country = choose_country(record['Where'])
        print("selected country: ",country)
        return country

    df['Countries'] = [_resolve(label, record) for label, record in df.iterrows()]
    return df

def fix_missing_countries_by_address(df):
    """Retry rows whose country is still 'FIXME' using their 'Address' values.

    For each distinct 'Where' value among the 'FIXME' rows, the distinct
    non-missing addresses of those rows are passed to ``choose_country`` and
    the result is written to 'Countries' for every row with that 'Where'.

    Args:
        df: DataFrame with 'Where', 'Address' and 'Countries' columns. It is
            modified in place.

    Returns:
        The same DataFrame with 'Countries' updated where a retry was made.
    """
    df_missing = df.loc[df['Countries'] == 'FIXME', ['Where', 'Address']]
    # A missing 'Where' can never match the equality mask below, so skip it
    # instead of spending a geocoder query on it.
    for uni in df_missing['Where'].dropna().unique():
        print("fixing: ",uni)
        uni_addresses = df_missing.loc[df_missing['Where'] == uni, 'Address']
        # Drop missing addresses (they cannot be joined into a query string) and
        # hand over a plain object ndarray, the array type get_addresses_list joins.
        addresses_of_this_uni = numpy.asarray(uni_addresses.dropna().unique(), dtype=object)
        selected_country = choose_country(addresses_of_this_uni)
        print("\t\t selected country: ",selected_country)
        df.loc[df['Where'] == uni, 'Countries'] = selected_country
    return df

def add_countries_to_df(df):
    """Run both country-resolution stages on ``df`` and return the result.

    First ``find_countries_by_university`` is applied, then its output is
    passed through ``fix_missing_countries_by_address``.
    """
    return fix_missing_countries_by_address(find_countries_by_university(df))

In [5]:
# Load the input table (the first CSV column becomes the index) and add the
# 'Countries' column to it.
# NOTE(review): this call issues the throttled geocoding requests made by the
# helper functions, so it can take a long time on large inputs.
df = pd.read_csv("results_2022_1.csv",index_col=0)
df = add_countries_to_df(df)

Starting:  1
   place:  Mines Paris - PSL
	 part: Mines Paris - PSL
selected country:  FIXME
Starting:  2
   place:  University Politehnica of Bucarest
	 part: University Politehnica of Bucarest
selected country:  Romania
Starting:  3
   place:  University Politehnica of Bucarest
selected country:  Romania
Starting:  4
   place:  Czech Technical University in Prague
	 part: Czech Technical University in Prague
selected country:  Czechia
Starting:  5
   place:  Czech Technical University in Prague
selected country:  Czechia
Starting:  6
   place:  Czech Technical University in Prague
selected country:  Czechia
Starting:  7
   place:  Czech Technical University in Prague
selected country:  Czechia
Starting:  8
   place:  Czech Technical University in Prague
selected country:  Czechia
Starting:  9
   place:  Czech Technical University in Prague
selected country:  Czechia
Starting:  10
   place:  Czech Technical University in Prague
selected country:  Czechia
Starting:  11
   place:  Czech

In [6]:
# Write the table, including the added 'Countries' column, to a new CSV file.
df.to_csv("results_2022_1_all.csv")