In [59]:
import pandas as pd
import numpy as np
from pathlib import Path

In [60]:
current_path = Path.cwd()
data_path = current_path.parent.joinpath('data')

# Countries

In [10]:
df_countries_raw = pd.read_csv(data_path.joinpath('df_countries.csv'), sep=';', header=0, na_values=["", np.nan], keep_default_na=False)
df_countries_raw.head()

Unnamed: 0,CountryCode,Names.Name.@LanguageCode,Names.Name.$
0,AD,EN,Andorra
1,AE,EN,United Arab Emirates
2,AF,EN,Afghanistan
3,AG,EN,"Antigua And Barbuda, Leeward Islands"
4,AI,EN,"Anguilla, Leeward Islands"


In [11]:
df_countries_raw.columns = ['CountryCode', 'LanguageCode', 'CountryName']
df_countries_raw.head()

Unnamed: 0,CountryCode,LanguageCode,CountryName
0,AD,EN,Andorra
1,AE,EN,United Arab Emirates
2,AF,EN,Afghanistan
3,AG,EN,"Antigua And Barbuda, Leeward Islands"
4,AI,EN,"Anguilla, Leeward Islands"


In [12]:
df_countries_clean = df_countries_raw[['CountryCode', 'CountryName']]
df_countries_clean.head()

Unnamed: 0,CountryCode,CountryName
0,AD,Andorra
1,AE,United Arab Emirates
2,AF,Afghanistan
3,AG,"Antigua And Barbuda, Leeward Islands"
4,AI,"Anguilla, Leeward Islands"


In [13]:
df_countries_clean.shape

(238, 2)

In [14]:
df_countries_clean.isna().sum()

CountryCode    0
CountryName    0
dtype: int64

In [15]:
df_countries_clean.loc[df_countries_clean['CountryCode'].isna()]

Unnamed: 0,CountryCode,CountryName


In [16]:
df_countries_clean.drop_duplicates(subset=["CountryCode"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_countries_clean.drop_duplicates(subset=["CountryCode"], inplace=True)


In [19]:
print("Shape df_countries_clean", df_countries_clean.shape)
df_countries_clean.to_csv(data_path.joinpath('df_countries_clean.csv'), sep=';', header=True, index=False)

Shape df_countries_clean (238, 2)


# Cities

In [61]:
df_cities_raw = pd.read_csv(data_path.joinpath('df_cities.csv'), sep=';', header=0, na_values=["", " ",np.nan], keep_default_na=False)
df_cities_raw.head()

Unnamed: 0,CityCode,CountryCode,UtcOffset,TimeZoneId,Names.Name.@LanguageCode,Names.Name.$,Airports.AirportCode
0,AAA,PF,-10:00,Pacific/Tahiti,EN,Anaa,AAA
1,AAB,AU,+10:00,Australia/Brisbane,EN,Arrabury,AAB
2,AAC,EG,+02:00,Africa/Cairo,EN,El Arish,AAC
3,AAD,SO,+03:00,Africa/Mogadishu,EN,Adado,AAD
4,AAE,DZ,+01:00,Africa/Algiers,EN,Annaba,AAE


In [62]:
df_cities_raw.columns = ['CityCode', 'CountryCode', 'UtcOffset', 'TimeZoneId','LanguageCode', 'CityName', 'AirportCode']
df_cities_raw.head()

Unnamed: 0,CityCode,CountryCode,UtcOffset,TimeZoneId,LanguageCode,CityName,AirportCode
0,AAA,PF,-10:00,Pacific/Tahiti,EN,Anaa,AAA
1,AAB,AU,+10:00,Australia/Brisbane,EN,Arrabury,AAB
2,AAC,EG,+02:00,Africa/Cairo,EN,El Arish,AAC
3,AAD,SO,+03:00,Africa/Mogadishu,EN,Adado,AAD
4,AAE,DZ,+01:00,Africa/Algiers,EN,Annaba,AAE


In [63]:
df_cities_clean = df_cities_raw[['CityCode', 'CountryCode', 'UtcOffset', 'TimeZoneId', 'CityName']]
df_cities_clean.head()

Unnamed: 0,CityCode,CountryCode,UtcOffset,TimeZoneId,CityName
0,AAA,PF,-10:00,Pacific/Tahiti,Anaa
1,AAB,AU,+10:00,Australia/Brisbane,Arrabury
2,AAC,EG,+02:00,Africa/Cairo,El Arish
3,AAD,SO,+03:00,Africa/Mogadishu,Adado
4,AAE,DZ,+01:00,Africa/Algiers,Annaba


In [64]:
df_cities_clean.isna().sum()

CityCode       0
CountryCode    0
UtcOffset      0
TimeZoneId     0
CityName       0
dtype: int64

In [65]:
df_cities_clean.loc[df_cities_clean['CountryCode'].isna()]

Unnamed: 0,CityCode,CountryCode,UtcOffset,TimeZoneId,CityName


In [66]:
df_cities_clean.loc[df_cities_clean.duplicated()]

Unnamed: 0,CityCode,CountryCode,UtcOffset,TimeZoneId,CityName


In [67]:
print("Shape df_cities_clean", df_cities_clean.shape)
df_cities_clean.to_csv(data_path.joinpath('df_cities_clean.csv'), sep=';', header=True, index=False)

Shape df_cities_clean (10694, 5)


# Airports

In [33]:
df_airports_raw = pd.read_csv(data_path.joinpath('df_airports.csv'), sep=';', header=0, na_values=["",np.nan], keep_default_na=False)
df_airports_raw.head()

Unnamed: 0,AirportCode,CityCode,CountryCode,LocationType,UtcOffset,TimeZoneId,Position.Coordinate.Latitude,Position.Coordinate.Longitude,Names.Name.@LanguageCode,Names.Name.$
0,AAA,AAA,PF,Airport,-10:00,Pacific/Tahiti,-17.3525,-145.51,EN,Anaa
1,AAB,AAB,AU,Airport,+10:00,Australia/Brisbane,-26.6911,141.0472,EN,Arrabury
2,AAC,AAC,EG,Airport,+02:00,Africa/Cairo,31.0733,33.8358,EN,El Arish International
3,AAD,AAD,SO,Airport,+03:00,Africa/Mogadishu,6.0961,46.6375,EN,Adado
4,AAE,AAE,DZ,Airport,+01:00,Africa/Algiers,36.8222,7.8092,EN,Annaba Rabah Bitat


In [34]:
df_airports_raw.columns = ['AirportCode', 'CityCode', 'CountryCode', 'LocationType', 'UtcOffset', 'TimeZoneId', 'Latitude', 'Longitude', 'LanguageCode', 'AirportName']
df_airports_raw.head()

Unnamed: 0,AirportCode,CityCode,CountryCode,LocationType,UtcOffset,TimeZoneId,Latitude,Longitude,LanguageCode,AirportName
0,AAA,AAA,PF,Airport,-10:00,Pacific/Tahiti,-17.3525,-145.51,EN,Anaa
1,AAB,AAB,AU,Airport,+10:00,Australia/Brisbane,-26.6911,141.0472,EN,Arrabury
2,AAC,AAC,EG,Airport,+02:00,Africa/Cairo,31.0733,33.8358,EN,El Arish International
3,AAD,AAD,SO,Airport,+03:00,Africa/Mogadishu,6.0961,46.6375,EN,Adado
4,AAE,AAE,DZ,Airport,+01:00,Africa/Algiers,36.8222,7.8092,EN,Annaba Rabah Bitat


In [35]:
df_airports_clean = df_airports_raw[['AirportCode', 'CityCode', 'CountryCode', 'LocationType', 'UtcOffset', 'TimeZoneId', 'Latitude', 'Longitude', 'AirportName']]
df_airports_clean.head()

Unnamed: 0,AirportCode,CityCode,CountryCode,LocationType,UtcOffset,TimeZoneId,Latitude,Longitude,AirportName
0,AAA,AAA,PF,Airport,-10:00,Pacific/Tahiti,-17.3525,-145.51,Anaa
1,AAB,AAB,AU,Airport,+10:00,Australia/Brisbane,-26.6911,141.0472,Arrabury
2,AAC,AAC,EG,Airport,+02:00,Africa/Cairo,31.0733,33.8358,El Arish International
3,AAD,AAD,SO,Airport,+03:00,Africa/Mogadishu,6.0961,46.6375,Adado
4,AAE,AAE,DZ,Airport,+01:00,Africa/Algiers,36.8222,7.8092,Annaba Rabah Bitat


In [36]:
df_airports_clean.isna().sum()

AirportCode     0
CityCode        0
CountryCode     0
LocationType    0
UtcOffset       0
TimeZoneId      0
Latitude        0
Longitude       0
AirportName     2
dtype: int64

In [37]:
df_airports_clean.loc[df_airports_clean['AirportName'].isna()]

Unnamed: 0,AirportCode,CityCode,CountryCode,LocationType,UtcOffset,TimeZoneId,Latitude,Longitude,AirportName
2966,GOM,GOM,CD,Airport,+02:00,Africa/Lubumbashi,-1.6708,29.2383,
2987,GPS,GPS,EC,Airport,-06:00,Pacific/Galapagos,-0.4539,-90.2658,


In [38]:
df_airports_clean.loc[df_airports_clean.duplicated()]

Unnamed: 0,AirportCode,CityCode,CountryCode,LocationType,UtcOffset,TimeZoneId,Latitude,Longitude,AirportName


In [39]:
df_airports_clean.drop_duplicates(subset=["AirportCode"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airports_clean.drop_duplicates(subset=["AirportCode"], inplace=True)


In [40]:
print("Shape df_airports_clean", df_airports_clean.shape)
df_airports_clean.to_csv(data_path.joinpath('df_airports_clean.csv'), sep=';', header=True, index=False)

Shape df_airports_clean (11676, 9)


# Airlines

In [41]:
df_airlines_raw = pd.read_csv(data_path.joinpath('df_airlines.csv'), sep=';', header=0, na_values=["",np.nan], keep_default_na=False)
df_airlines_raw.head()

Unnamed: 0,AirlineID,AirlineID_ICAO,Names.Name.@LanguageCode,Names.Name.$
0,0A,GNT,EN,Amber Air
1,0B,BMS,EN,Blue Air
2,0D,DWT,EN,Darwin Airline Sa
3,0J,PJZ,EN,Premium Jet Ag
4,0K,KRT,EN,Aircompany Kokshetau


In [42]:
df_airlines_raw.columns = ['AirlineID', 'AirlineID_ICAO', 'LanguageCode', 'AirlineName']
df_airlines_raw.head()

Unnamed: 0,AirlineID,AirlineID_ICAO,LanguageCode,AirlineName
0,0A,GNT,EN,Amber Air
1,0B,BMS,EN,Blue Air
2,0D,DWT,EN,Darwin Airline Sa
3,0J,PJZ,EN,Premium Jet Ag
4,0K,KRT,EN,Aircompany Kokshetau


In [43]:
df_airlines_clean = df_airlines_raw[['AirlineID', 'AirlineID_ICAO', 'AirlineName']]
df_airlines_clean.head()

Unnamed: 0,AirlineID,AirlineID_ICAO,AirlineName
0,0A,GNT,Amber Air
1,0B,BMS,Blue Air
2,0D,DWT,Darwin Airline Sa
3,0J,PJZ,Premium Jet Ag
4,0K,KRT,Aircompany Kokshetau


In [46]:
df_airlines_clean.isna().sum()

AirlineID           0
AirlineID_ICAO    122
AirlineName         0
dtype: int64

In [45]:
df_airlines_clean.loc[df_airlines_clean['AirlineID'].isna()]

Unnamed: 0,AirlineID,AirlineID_ICAO,AirlineName


In [47]:
df_airlines_clean.loc[df_airlines_clean.duplicated()]

Unnamed: 0,AirlineID,AirlineID_ICAO,AirlineName


In [48]:
df_airlines_clean.drop_duplicates(subset=['AirlineID'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_airlines_clean.drop_duplicates(subset=['AirlineID'], inplace=True)


In [49]:
print("Shape df_airlines_clean", df_airlines_clean.shape)
df_airlines_clean.to_csv(data_path.joinpath('df_airlines_clean.csv'), sep=';', header=True, index=False)

Shape df_airlines_clean (1127, 3)


# Aircrafts

In [50]:
df_aircrafts_raw = pd.read_csv(data_path.joinpath('df_aircrafts.csv'), sep=';', header=0, na_values=["",np.nan], keep_default_na=False)
df_aircrafts_raw.head()

Unnamed: 0,AircraftCode,AirlineEquipCode,Names.Name.@LanguageCode,Names.Name.$
0,100,F100,EN,Fokker 100
1,141,B461,EN,BAE Systems 146-100 Passenger
2,142,B462,EN,BAE Systems 146-200 Passenger
3,143,B463,EN,BAE Systems 146-300 Passenger
4,14X,B461,EN,BAE Systems 146-100 Freighter


In [51]:
df_aircrafts_raw.columns = ['AircraftCode', 'AirlineEquipCode', 'LanguageCode', 'AircraftName']
df_aircrafts_raw.head()

Unnamed: 0,AircraftCode,AirlineEquipCode,LanguageCode,AircraftName
0,100,F100,EN,Fokker 100
1,141,B461,EN,BAE Systems 146-100 Passenger
2,142,B462,EN,BAE Systems 146-200 Passenger
3,143,B463,EN,BAE Systems 146-300 Passenger
4,14X,B461,EN,BAE Systems 146-100 Freighter


In [52]:
df_aircrafts_clean = df_aircrafts_raw[['AircraftCode', 'AirlineEquipCode', 'AircraftName']]
df_aircrafts_clean.head()

Unnamed: 0,AircraftCode,AirlineEquipCode,AircraftName
0,100,F100,Fokker 100
1,141,B461,BAE Systems 146-100 Passenger
2,142,B462,BAE Systems 146-200 Passenger
3,143,B463,BAE Systems 146-300 Passenger
4,14X,B461,BAE Systems 146-100 Freighter


In [53]:
df_aircrafts_clean.isna().sum()

AircraftCode        0
AirlineEquipCode    1
AircraftName        1
dtype: int64

In [54]:
df_aircrafts_clean.loc[df_aircrafts_clean.duplicated()]

Unnamed: 0,AircraftCode,AirlineEquipCode,AircraftName


In [83]:
df_aircrafts_clean.drop_duplicates(subset=['AircraftCode'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aircrafts_clean.drop_duplicates(subset=['AircraftCode'], inplace=True)


In [55]:
print("Shape of df_aircrafts_clean", df_aircrafts_clean.shape)
df_aircrafts_clean.to_csv(data_path.joinpath('df_aircrafts_clean.csv'), sep=';', header=True, index=False)

Shape of df_aircrafts_clean (380, 3)


# Value Counts

In [56]:
lst_df = [df_aircrafts_clean, df_airlines_clean, df_airports_clean, df_cities_clean, df_countries_clean]

In [57]:
for item in lst_df:
    display(item.value_counts())
    print("\n\n")

AircraftCode  AirlineEquipCode  AircraftName                                
100           F100              Fokker 100                                      1
DHR           DH2T              De Havilland (Bombardier) DHC-2 Turbo Beaver    1
EC5           EC55              Eurocopter EC155                                1
EC3           EC30              Eurocopter EC130                                1
EA5           EA50              Eclipse 500                                     1
                                                                               ..
A81           A148              Antonov AN148-100                               1
A5F           A225              Antonov An-225                                  1
A58           ZZZZ              Antonov An-158                                  1
A4F           A124              Antonov An-124 Ruslan                           1
YS1           YS11              NAMC YS-11                                      1
Length: 379, dtype: i






AirlineID  AirlineID_ICAO  AirlineName                   
0A         GNT             Amber Air                         1
PB         SPR             Pal Airlines                      1
OX         OEA             Orient Thai Airlines              1
OY         OAE             Omni Air International            1
OZ         AAR             Asiana Airlines                   1
                                                            ..
EW         EWG             Eurowings                         1
EX         SDO             Aerolineas Santo Domingo_ S.A.    1
EXN        EXN             Exin                              1
EXT        EXT             Night Express_ Frankfurt          1
ZY         CCO             China Air Cargo                   1
Length: 1005, dtype: int64






AirportCode  CityCode  CountryCode  LocationType  UtcOffset  TimeZoneId           Latitude  Longitude  AirportName                   
AAA          AAA       PF           Airport       -10:00     Pacific/Tahiti       -17.3525  -145.5100  Anaa                              1
RKO          RKO       ID           Airport       +07:00     Asia/Pontianak       -2.0833    99.6833   Sipora                            1
RKR          RKR       US           Airport       -06:00     America/Chicago       35.0500  -94.6167   Poteau Robert S Kerr              1
RKS          RKS       US           Airport       -07:00     America/Denver        41.5947  -109.0656  Rock Springs                      1
RKT          RKT       AE           Airport       +04:00     Asia/Dubai            25.6136   55.9389   Ras Al Khaimah International      1
                                                                                                                                        ..
JGX          JGX       US       






CityCode  CountryCode  UtcOffset  TimeZoneId            CityName   
AAA       PF           -10:00     Pacific/Tahiti        Anaa           1
RAS       IR           +03:30     Asia/Tehran           Rasht          1
RAU       BD           +06:00     Asia/Dhaka            Rangpur        1
RAV       CO           -05:00     America/Bogota        Cravo Norte    1
RAW       PG           +11:00     Pacific/Bougainville  Arawa          1
                                                                      ..
JGS       CN           +08:00     Asia/Shanghai         Ji'an          1
JGX       US           -08:00     America/Los_Angeles   Glendale       1
JHB       MY           +08:00     Asia/Kuching          Johor Bahru    1
JHC       US           -05:00     America/New_York      Garden City    1
ZZU       MW           +02:00     Africa/Blantyre       Mzuzu          1
Length: 10693, dtype: int64






CountryCode  CountryName      
AD           Andorra              1
PA           Panama               1
NC           New Caledonia        1
NE           Niger                1
NF           Norfolk Island       1
                                 ..
GQ           Equatorial Guinea    1
GR           Greece               1
GT           Guatemala            1
GU           Guam                 1
ZW           Zimbabwe             1
Length: 238, dtype: int64






In [58]:
df_airports_clean[df_airports_clean["CityCode"] == "BER"]

Unnamed: 0,AirportCode,CityCode,CountryCode,LocationType,UtcOffset,TimeZoneId,Latitude,Longitude,AirportName
686,BER,BER,DE,Airport,+01:00,Europe/Berlin,52.3667,13.5033,Berlin/Brandenburg
3098,GWW,BER,DE,Airport,+01:00,Europe/Berlin,52.4833,13.1333,Berlin Royal Air Force Gatow
7376,QPP,BER,DE,Rail Station,+01:00,Europe/Berlin,52.525,13.3694,Berlin Hbf Rail Station
7519,QWB,BER,DE,Rail Station,+01:00,Europe/Berlin,52.5167,13.4167,Berlin Ostbhf Rail Station
7520,QWC,BER,DE,Rail Station,+01:00,Europe/Berlin,52.5081,13.3314,Berlin Zoo Rail Station
7522,QWE,BER,DE,Rail Station,+01:00,Europe/Berlin,53.0,14.0,Berlin Friedrichstrasse Rail Station
8575,SXF,BER,DE,Airport,+01:00,Europe/Berlin,52.3786,13.5206,Berlin/Schoenefeld
8811,THF,BER,DE,Airport,+01:00,Europe/Berlin,52.4736,13.4017,Berlin Tempelhof Apt
9161,TXL,BER,DE,Airport,+01:00,Europe/Berlin,52.5597,13.2878,Berlin/Tegel
