### Подключаем пакет

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display

## Группировка и агрегирование

In [2]:
# Source data is described at https://openflights.org/data.html
url_airport = 'https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat'
url_routers = 'https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat'

# First look at both files with the default read_csv settings. In the rendered
# output the first record of each file shows up as the column labels, which is
# why the files are re-read with explicit column names afterwards.
df_raw_airport = pd.read_csv(url_airport)
df_raw_routers = pd.read_csv(url_routers)
display(df_raw_airport.head(n=3), df_raw_routers.head(n=3))

Unnamed: 0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.081689834590001,145.391998291,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
0,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
1,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
2,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports


Unnamed: 0,2B,410,AER,2965,KZN,2990,Unnamed: 6,0,CR2
0,2B,410,ASF,2966,KZN,2990,,0,CR2
1,2B,410,ASF,2966,MRV,2962,,0,CR2
2,2B,410,CEK,2968,KZN,2990,,0,CR2


In [3]:
# Column labels for airports.dat, in file order.
columns_airport = [
    'Airport ID', 'Name', 'City', 'Country',
    'IATA', 'ICAO',
    'Latitude', 'Longitude', 'Altitude',
    'Timezone', 'DST', 'Tz database time zone',
    'Type', 'Source',
]

# Column labels for routes.dat, in file order.
columns_routers = [
    'Airline', 'Airline ID',
    'Source airport', 'Source airport ID',
    'Destination airport', 'Destination airport ID',
    'Codeshare', 'Stops', 'Equipment',
]

# Re-read both files with explicit labels so the first record stays a data row.
# NOTE(review): the literal '\N' markers visible in later outputs are kept as
# plain strings here — confirm whether they should be parsed as missing values.
df_raw_airport = pd.read_csv(url_airport, header=None, names=columns_airport)
df_raw_routers = pd.read_csv(url_routers, header=None, names=columns_routers)
display(df_raw_airport.head(n=3), df_raw_routers.head(n=3))

Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database time zone,Type,Source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports


Unnamed: 0,Airline,Airline ID,Source airport,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2


In [4]:
# For every departure airport code, count the non-null values in each other column.
df_raw_routers.groupby(by='Source airport').count()

Unnamed: 0_level_0,Airline,Airline ID,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
Source airport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAE,9,9,9,9,9,0,9,9
AAL,20,20,20,20,20,5,20,20
AAN,2,2,2,2,2,0,2,2
AAQ,3,3,3,3,3,0,3,3
AAR,8,8,8,8,8,4,8,8
...,...,...,...,...,...,...,...,...
ZUH,60,60,60,60,60,10,60,60
ZUM,2,2,2,2,2,0,2,2
ZVK,3,3,3,3,3,0,3,3
ZYI,15,15,15,15,15,1,15,15


In [5]:
# Rows per departure airport, largest first. The 'Source airport ID' column is
# selected before aggregating, so only that one column is counted instead of
# counting every column and then discarding all but one.
(
    df_raw_routers
    .groupby('Source airport')['Source airport ID']
    .count()
    .sort_values(ascending=False)
)

Source airport
ATL    915
ORD    558
PEK    535
LHR    527
CDG    524
      ... 
FRD      1
RUA      1
FOE      1
RUS      1
ESD      1
Name: Source airport ID, Length: 3409, dtype: int64

In [6]:
# Ten departure airports with the most rows in the routes table. The ID column
# is selected before aggregating so that only one column is counted.
airport_largest = (
    df_raw_routers
    .groupby('Source airport')['Source airport ID']
    .count()
    .nlargest(n=10)
)
# The Series name becomes the column label when this is turned into a DataFrame.
airport_largest.name = 'FLY'
airport_largest

Source airport
ATL    915
ORD    558
PEK    535
LHR    527
CDG    524
FRA    497
LAX    492
DFW    469
JFK    456
AMS    453
Name: FLY, dtype: int64

In [7]:
# Attach airport details to the top-10 counts: the index of airport_largest
# (airport codes) is matched against the 'IATA' column of the airports table.
largest_airport = (
    pd.DataFrame(airport_largest)
    .merge(
        df_raw_airport[['Airport ID', 'IATA', 'Name', 'Country']],
        left_index=True,
        right_on='IATA',
    )
    .reset_index(drop=True)
    .loc[:, ['Airport ID', 'Name', 'Country', 'IATA', 'FLY']]
)
largest_airport

Unnamed: 0,Airport ID,Name,Country,IATA,FLY
0,3682,Hartsfield Jackson Atlanta International Airport,United States,ATL,915
1,3830,Chicago O'Hare International Airport,United States,ORD,558
2,3364,Beijing Capital International Airport,China,PEK,535
3,507,London Heathrow Airport,United Kingdom,LHR,527
4,1382,Charles de Gaulle International Airport,France,CDG,524
5,340,Frankfurt am Main Airport,Germany,FRA,497
6,3484,Los Angeles International Airport,United States,LAX,492
7,3670,Dallas Fort Worth International Airport,United States,DFW,469
8,3797,John F Kennedy International Airport,United States,JFK,456
9,580,Amsterdam Airport Schiphol,Netherlands,AMS,453


In [8]:
# pip install a-world-of-countries
import awoc


In [9]:
# Lookup object from the awoc package; the cells below use it for every
# country -> continent query.
my_world = awoc.AWOC()
# Try the lookup on a single country name.
my_world.get_country_continent_name('Australia')


'Oceania'

In [26]:
# Demonstration of the failure: the lookup raises NameError for a country name it
# does not recognise. The error is caught and printed so the cell still shows the
# problem without stopping a "Restart Kernel -> Run All" execution.
try:
    display(df_raw_airport['Country'].map(my_world.get_country_continent_name))
except NameError as err:
    print(f'{type(err).__name__}: {err}')

#### Ошибка вызвана тем, что есть страна, которая не обрабатывается корректно.
#### Пакет не знает страну "Cote D'ivoire".

NameError: The specified Country Name "Cote D'ivoire" does not exist.

In [27]:
# Safe lookup: returns NaN when the country name is not recognised.

def country_to_continent(name):
    """Return the continent for the country ``name``, or NaN if the lookup fails.

    The lookup is delegated to ``my_world.get_country_continent_name``. A
    ``NameError`` raised by that call is caught and ``np.nan`` is returned
    instead, so the function can be mapped over a whole column.
    """
    try:
        return my_world.get_country_continent_name(name)
    except NameError:
        return np.nan
        
# Quick check: a misspelled country name should give NaN rather than raise.
a = country_to_continent('RusSsia')
a


nan

In [12]:
# Build the working frame as a new object (df_raw_airport is left untouched) with
# a 'Continent' column derived from 'Country'; unrecognised countries become NaN.
df_airport = df_raw_airport.assign(
    Continent=df_raw_airport['Country'].map(country_to_continent)
)
df_airport.head()

Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database time zone,Type,Source,Continent
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports,Oceania
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports,Oceania
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports,Oceania
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports,Oceania
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports,Oceania


In [13]:
# Rows whose country could not be resolved to a continent.
df_airport.loc[df_airport['Continent'].isna()]

Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database time zone,Type,Source,Continent
250,253,Port Bouet Airport,Abidjan,Cote d'Ivoire,ABJ,DIAP,5.261390,-3.926290,21,0,N,Africa/Abidjan,airport,OurAirports,
251,254,Bouaké Airport,Bouake,Cote d'Ivoire,BYK,DIBK,7.738800,-5.073670,1230,0,N,Africa/Abidjan,airport,OurAirports,
252,255,Daloa Airport,Daloa,Cote d'Ivoire,DJO,DIDL,6.792810,-6.473190,823,0,N,Africa/Abidjan,airport,OurAirports,
253,256,Korhogo Airport,Korhogo,Cote d'Ivoire,HGO,DIKO,9.387180,-5.556660,1214,0,N,Africa/Abidjan,airport,OurAirports,
254,257,Man Airport,Man,Cote d'Ivoire,MJC,DIMN,7.272070,-7.587360,1089,0,N,Africa/Abidjan,airport,OurAirports,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6926,11292,Kahemba Airport,Kahemba,Congo (Kinshasa),\N,FZCF,-7.333000,19.017000,3425,1,U,\N,airport,OurAirports,
7240,12003,Meiktila Air Base,Meiktila,Burma,\N,VYML,20.886400,95.892792,699,\N,\N,\N,airport,OurAirports,
7410,13212,Odienne Airport,Odienne,Cote d'Ivoire,KEO,DIOD,9.500000,-7.567000,1365,0,U,\N,airport,OurAirports,
7421,13260,Oyo Ollombo Airport,Oyo,Congo (Brazzaville),OLL,FCOD,-1.226666,15.910000,1073,1,N,\N,airport,OurAirports,


In [14]:
# Try the alternative spelling 'Ivory Coast' for "Cote d'Ivoire".
country_to_continent('Ivory Coast')

'Africa'

In [15]:
# Spelling used in the airports data; check whether the lookup resolves it.
country_to_continent('Congo (Kinshasa)')

nan

In [16]:
# Spelling used in the airports data; check whether the lookup resolves it.
country_to_continent("Congo (Brazzaville)")

nan

In [17]:
# Candidate replacement name for 'Congo (Kinshasa)'.
country_to_continent('Democratic Republic of the Congo')

'Africa'

In [18]:
# Candidate replacement name for 'Congo (Brazzaville)'.
country_to_continent('Republic of the Congo')

'Africa'

In [19]:
# Candidate replacement name for 'Burma'.
country_to_continent('Myanmar')

'Asia'

In [20]:
# Country spellings found in the airports data (keys) mapped to the names that
# are passed to the continent lookup instead (values).
dict_country_error = {
    "Cote d'Ivoire": 'Ivory Coast',
    'Congo (Kinshasa)': 'Democratic Republic of the Congo',
    'Congo (Brazzaville)': 'Republic of the Congo',
    'Burma': 'Myanmar',
    'Cocos (Keeling) Islands': 'Cocos Islands',
}

dict_country_error

{"Cote d'Ivoire": 'Ivory Coast',
 'Congo (Kinshasa)': 'Democratic Republic of the Congo',
 'Congo (Brazzaville)': 'Republic of the Congo',
 'Burma': 'Myanmar',
 'Cocos (Keeling) Islands': 'Cocos Islands'}

In [21]:
# Replace the country spellings listed in dict_country_error, then recompute the
# continent for every row from the corrected 'Country' column. The order matters:
# the second statement reads the column written by the first.
df_airport['Country'] = df_airport['Country'].replace(dict_country_error)
df_airport['Continent'] = df_airport['Country'].map(country_to_continent)

# Rows that still have no continent after the renaming.
df_airport[df_airport['Continent'].isna()]

Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database time zone,Type,Source,Continent
2087,2174,Jerusalem Airport,Jerusalem,West Bank,\N,OJJR,31.8647,35.2192,2485,2,U,Asia/Jerusalem,airport,OurAirports,
2157,2253,Henderson Field,Midway,Midway Islands,MDY,PMDY,28.2017,-177.380997,13,-11,U,Pacific/Midway,airport,OurAirports,
2637,2774,Cayenne-Rochambeau Airport,Cayenne,French Guiana,CAY,SOCA,4.81981,-52.360401,26,-3,U,America/Cayenne,airport,OurAirports,
2638,2775,Saint-Georges-de-l'Oyapock Airport,St.-georges Oyapock,French Guiana,OYP,SOOG,3.8976,-51.8041,46,-3,U,America/Cayenne,airport,OurAirports,
2734,2878,Martinique Aimé Césaire International Airport,Fort-de-france,Martinique,FDF,TFFF,14.591,-61.003201,16,-4,U,America/Martinique,airport,OurAirports,
2735,2879,L'Espérance Airport,St. Martin,Guadeloupe,SFG,TFFG,18.099899,-63.047199,7,-4,U,\N,airport,OurAirports,
2736,2881,Pointe-à-Pitre Le Raizet,Pointe-a-pitre,Guadeloupe,PTP,TFFR,16.265301,-61.531799,36,-4,U,America/Guadeloupe,airport,OurAirports,
2738,2883,Cyril E. King Airport,St. Thomas,Virgin Islands,STT,TIST,18.337299,-64.973396,23,-4,U,America/St_Thomas,airport,OurAirports,
2739,2884,Henry E Rohlsen Airport,St. Croix Island,Virgin Islands,STX,TISX,17.7019,-64.798599,74,-4,U,America/St_Thomas,airport,OurAirports,
3165,3359,Norfolk Island International Airport,Norfolk Island,Norfolk Island,NLK,YSNF,-29.041599,167.938995,371,11,U,Pacific/Norfolk,airport,OurAirports,


In [22]:
# Distinct values in the 'Continent' column (NaN marks rows still unresolved).
df_airport['Continent'].unique()

array(['Oceania', 'North America', 'Europe', 'Africa', 'South America',
       'Asia', 'Antarctica', nan], dtype=object)

In [23]:
# Distinct country names of the rows that still have no continent.
df_airport.loc[df_airport['Continent'].isna(), 'Country'].unique()

array(['West Bank', 'Midway Islands', 'French Guiana', 'Martinique',
       'Guadeloupe', 'Virgin Islands', 'Norfolk Island', 'Johnston Atoll',
       'Svalbard', 'Wake Island'], dtype=object)

In [24]:
# Manual continent assignment for the territories that the country lookup could
# not resolve (keys are values of the 'Country' column).
dict_teritor_error = {
    'West Bank': 'Asia',
    'Midway Islands': 'Oceania',
    'French Guiana': 'South America',
    'Martinique': 'North America',
    'Guadeloupe': 'North America',
    'Virgin Islands': 'North America',
    'Norfolk Island': 'Oceania',
    'Johnston Atoll': 'Oceania',
    'Svalbard': 'Europe',
    'Wake Island': 'Oceania',
}

In [25]:
# Fill the continents that are still missing from the manual territory table;
# rows that already have a continent keep their value.
df_airport['Continent'] = df_airport['Continent'].fillna(
    value=df_airport['Country'].map(dict_teritor_error)
)