In [0]:
"""
Amadeus Challenge : Data Scientist Position
Author            : Angeloni Julien
Date Creation     : 10/08/2018

Second exercise - Top 10 arrival airports in the world in 2013

Data              : bookings.csv
"""

In [30]:
!pip install GeoBases3K



In [0]:
# LIBRARIES
import pandas as pd
from google.colab import drive
from GeoBases import GeoBase
import re

In [43]:
# IMPORT FILES
# Mount Google Drive under /content/gdrive so that files stored there
# (see the BOOKINGS path) can be read from the notebook
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [0]:
# GLOBAL VARIABLES
# Path of the bookings CSV file on the mounted Google Drive
BOOKINGS = '/content/gdrive/My Drive/Colab Notebooks/bookings.csv'
# GeoBases data source queried by get_name() to retrieve airport names
# NOTE(review): 'ori_por' presumably selects the points-of-reference dataset — confirm in the GeoBases docs
GEO_O = GeoBase(data='ori_por', verbose=False)

# WORK USING FUNCTIONS (DEVELOPED AFTER THE MAIN PART)

In [0]:
#METHOD PART

def get_df_cols(csvfilename,cols,separator):
  """
  Method to get a dataframe from a csv file with specified columns

  Malformed lines are skipped (with a warning) instead of aborting the load.

  @csvfilename : path (or file-like object) of the csv file to convert in dataframe
  @cols        : list of strings giving the names of the columns to keep
  @separator   : character used to delimit fields in the csv file

  @return      : a dataframe holding only the requested columns
  """
  import inspect

  read_options = dict(encoding='UTF8', sep=separator, usecols=cols)

  # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
  # of `on_bad_lines`; pick whichever keyword the installed pandas understands.
  # 'warn' mirrors error_bad_lines=False with its default warning behaviour.
  if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
    read_options['on_bad_lines'] = 'warn'
  else:
    read_options['error_bad_lines'] = False

  # Read the file given by the caller (not the global BOOKINGS constant)
  return pd.read_csv(csvfilename, **read_options)


def get_name(IATA_code):
  """
    Function to return the name of the airport linked to IATA_code

    @IATA_code : String object which is a IATA code; spaces are removed
                 before the lookup. Any non-string value (e.g. NaN coming
                 from a missing CSV field) is treated as an unknown code

    @return    : String object which is the name of the airport, or
                 "NOT FOUND IATA CODE" when the code cannot be resolved
  """

  # Non-string values have no .replace(); report them as unknown instead of
  # raising AttributeError in the middle of a DataFrame.apply
  if not isinstance(IATA_code, str):
    return "NOT FOUND IATA CODE"

  # If the IATA code exists in GEO_O, return the associated name
  try:
    return GEO_O.get(IATA_code.replace(" ",""), 'name')
  # Else we just specify that the IATA code could not be found
  except KeyError:
    return "NOT FOUND IATA CODE"
  

def get_airports_arrival_sorted(dataframe):
  """
  Method to total the data per arrival airport and rank the airports

  @dataframe : the dataframe containing at least the 'arr_port' and 'pax' columns

  @return    : a new dataframe indexed by 'arr_port', holding the per-airport
               sums and ordered by decreasing 'pax'
  """

  return (
    dataframe
    .groupby(['arr_port'])                   # one group per arrival airport
    .sum()                                   # aggregate each group by sum
    .sort_values(by='pax', ascending=False)  # biggest 'pax' total first
  )


def add_airports_name(dataframe):
  """
  Method to append an 'airport_name' column holding the full name of the
  airport matching each IATA code found in 'arr_port'

  @dataframe : the dataframe to enrich ('arr_port' is available as a column
               once the index has been reset)

  @return    : a new dataframe with the index reset and the extra column
  """

  # Work on the reset-index copy so 'arr_port' can be accessed as a column
  with_names = dataframe.reset_index()

  # Resolve every IATA code to its airport name
  airport_names = with_names['arr_port'].apply(get_name)
  with_names['airport_name'] = airport_names

  return with_names

def print_top_n_arrival_airport(dataframe,n):
  """
  Method to print the n arrival airports with the highest 'pax' totals,
  together with their full names

  @dataframe : the preformatted dataframe by columns containing the data
  @n         : the number of airports to show
  """

  # Rank the airports first, then attach the readable airport names
  ranked_airports = add_airports_name(get_airports_arrival_sorted(dataframe))

  top_rows = ranked_airports.head(n)
  print(top_rows)

In [46]:
#PRINT THE TOP TEN
# Only the arrival airport code and the 'pax' column (presumably the
# passenger count — TODO confirm) are loaded
cols = ["arr_port","pax"]

# Fields are passed as '^'-separated for the bookings file
dataframe = get_df_cols(BOOKINGS,cols,'^')

# Aggregate, sort, add airport names and print the first 10 rows
print_top_n_arrival_airport(dataframe,10)

   arr_port      pax                          airport_name
0  LHR       88809.0               London Heathrow Airport
1  MCO       70930.0         Orlando International Airport
2  LAX       70530.0     Los Angeles International Airport
3  LAS       69630.0       Mc Carran International Airport
4  JFK       66270.0  John F Kennedy International Airport
5  CDG       64490.0             Paris - Charles-de-Gaulle
6  BKK       59460.0                          Suvarnabhumi
7  MIA       58150.0           Miami International Airport
8  SFO       58000.0   San Francisco International Airport
9  DXB       55590.0           Dubai International Airport


# MAIN PART

In [0]:
# DATAFRAME STRUCTURE
# Load a single row only: enough to inspect the column names and dtypes
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 and removed in
# 2.0 (replaced by `on_bad_lines`) — adjust these calls when upgrading pandas
BOOKINGS_DF_STRUCT = pd.read_csv(BOOKINGS, error_bad_lines=False, encoding='UTF8', sep='^', nrows=1)

# Dataframe using suggested columns
BOOKINGS_DF_EX = pd.read_csv(BOOKINGS, error_bad_lines=False, encoding='UTF8', sep='^', usecols=['arr_port','pax'])

In [7]:
# List the column names and dtypes of the one-row sample loaded above
BOOKINGS_DF_STRUCT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 38 columns):
act_date               1 non-null object
source                 1 non-null object
pos_ctry               1 non-null object
pos_iata               1 non-null object
pos_oid                1 non-null object
rloc                   1 non-null object
cre_date               1 non-null object
duration               1 non-null int64
distance               1 non-null int64
dep_port               1 non-null object
dep_city               1 non-null object
dep_ctry               1 non-null object
arr_port               1 non-null object
arr_city               1 non-null object
arr_ctry               1 non-null object
lst_port               1 non-null object
lst_city               1 non-null object
lst_ctry               1 non-null object
brd_port               1 non-null object
brd_city               1 non-null object
brd_ctry               1 non-null object
off_port               1 non-null object

In [8]:
# CHECKING SELECTED COLUMNS
# Preview the first 10 rows of the frame restricted to 'arr_port' and 'pax'
BOOKINGS_DF_EX.head(10)

Unnamed: 0,arr_port,pax
0,LHR,-1.0
1,CLT,1.0
2,CLT,1.0
3,SVO,1.0
4,SVO,1.0
5,LGA,1.0
6,LGA,1.0
7,SIN,2.0
8,SIN,2.0
9,SIN,2.0


In [0]:
# CREATE DATAFRAME FROM BOOKINGS_DF_EX GROUPED BY arr_port
# The groups are aggregated by sum, so 'pax' is totalled per arrival airport
# and the resulting frame is indexed by arr_port
BOOKINGS_GROUP_BY_ARR_PORT = BOOKINGS_DF_EX.groupby(['arr_port']).sum()

In [10]:
# CHECKING RESULTS
# NOTE(review): this displays the whole grouped frame; consider .head() to
# keep the cell output short
BOOKINGS_GROUP_BY_ARR_PORT

Unnamed: 0_level_0,pax
arr_port,Unnamed: 1_level_1
AAB,30.0
AAE,810.0
AAL,810.0
AAN,20.0
AAQ,1650.0
AAR,500.0
ABA,90.0
ABB,40.0
ABE,880.0
ABI,150.0


In [11]:
# SORTED BY PAX
# Descending order; the sorted frame is only displayed, it is not assigned,
# so BOOKINGS_GROUP_BY_ARR_PORT keeps its original order
BOOKINGS_GROUP_BY_ARR_PORT.sort_values(by=['pax'], ascending=False)

Unnamed: 0_level_0,pax
arr_port,Unnamed: 1_level_1
LHR,88809.0
MCO,70930.0
LAX,70530.0
LAS,69630.0
JFK,66270.0
CDG,64490.0
BKK,59460.0
MIA,58150.0
SFO,58000.0
DXB,55590.0


In [12]:
# PRINT TOP 10 AIRPORT ARRIVALS
# Same descending sort on 'pax', limited to the first 10 rows
BOOKINGS_GROUP_BY_ARR_PORT.sort_values(by=['pax'], ascending=False).head(10)

Unnamed: 0_level_0,pax
arr_port,Unnamed: 1_level_1
LHR,88809.0
MCO,70930.0
LAX,70530.0
LAS,69630.0
JFK,66270.0
CDG,64490.0
BKK,59460.0
MIA,58150.0
SFO,58000.0
DXB,55590.0


## INSTALLATION OF GEOBASE

In [0]:
# Trying to find airport names with GeoBases

In [45]:
!pip install --upgrade setuptools

Collecting setuptools
[?25l  Downloading https://files.pythonhosted.org/packages/96/06/c8ee69628191285ddddffb277bd5abdf769166e7a14b867c2a172f0175b1/setuptools-40.4.3-py2.py3-none-any.whl (569kB)
[K    100% |████████████████████████████████| 573kB 5.6MB/s 
[31mtensorflow 1.11.0 has requirement setuptools<=39.1.0, but you'll have setuptools 40.4.3 which is incompatible.[0m
[?25hInstalling collected packages: setuptools
  Found existing installation: setuptools 39.1.0
    Uninstalling setuptools-39.1.0:
      Successfully uninstalled setuptools-39.1.0
Successfully installed setuptools-40.4.3


In [47]:
!easy_install -U setuptools

Searching for setuptools
Reading https://pypi.org/simple/setuptools/
Downloading https://files.pythonhosted.org/packages/96/06/c8ee69628191285ddddffb277bd5abdf769166e7a14b867c2a172f0175b1/setuptools-40.4.3-py2.py3-none-any.whl#sha256=ce4137d58b444bac11a31d4e0c1805c69d89e8ed4e91fde1999674ecc2f6f9ff
Best match: setuptools 40.4.3
Processing setuptools-40.4.3-py2.py3-none-any.whl
Installing setuptools-40.4.3-py2.py3-none-any.whl to /usr/local/lib/python3.6/dist-packages
writing requirements to /usr/local/lib/python3.6/dist-packages/setuptools-40.4.3-py3.6.egg/EGG-INFO/requires.txt
Adding setuptools 40.4.3 to easy-install.pth file
Installing easy_install script to /usr/local/bin
Installing easy_install-3.6 script to /usr/local/bin

Installed /usr/local/lib/python3.6/dist-packages/setuptools-40.4.3-py3.6.egg
Processing dependencies for setuptools
Finished processing dependencies for setuptools


In [50]:
!easy_install -U NeoBase

Searching for NeoBase
Reading https://pypi.org/simple/NeoBase/
Downloading https://files.pythonhosted.org/packages/45/00/88a47029daf5e9306143ad08a319c87b618049d9f886a917a27ee7de38f1/NeoBase-0.18.2.tar.gz#sha256=c8460cd0be9dd38e0baf1972ea456ba66692bdf91d24950d675a33b069489332
Best match: NeoBase 0.18.2
Processing NeoBase-0.18.2.tar.gz
Writing /tmp/easy_install-kmg68dt9/NeoBase-0.18.2/setup.cfg
Running NeoBase-0.18.2/setup.py -q bdist_egg --dist-dir /tmp/easy_install-kmg68dt9/NeoBase-0.18.2/egg-dist-tmp-h6v1gjig
creating /usr/local/lib/python3.6/dist-packages/NeoBase-0.18.2-py3.6.egg
Extracting NeoBase-0.18.2-py3.6.egg to /usr/local/lib/python3.6/dist-packages
Adding NeoBase 0.18.2 to easy-install.pth file
Installing NeoBase script to /usr/local/bin

Installed /usr/local/lib/python3.6/dist-packages/NeoBase-0.18.2-py3.6.egg
Processing dependencies for NeoBase
Finished processing dependencies for NeoBase


In [53]:
!easy_install --user -U GeoBases3K


Searching for GeoBases3K
Reading https://pypi.org/simple/GeoBases3K/
Downloading https://files.pythonhosted.org/packages/fb/8a/d216ad4cbf6880fd03efa378f52384c8437cb823b9d754b1017eeda14d03/GeoBases3K-5.0.16.zip#sha256=b1362be47d9b99c2f34cb4a8c349b7cde9dc13a1960d0b0b42713a53a878a2ba
Best match: GeoBases3K 5.0.16
Processing GeoBases3K-5.0.16.zip
Writing /tmp/easy_install-l96v1pgt/GeoBases3K-5.0.16/setup.cfg
Running GeoBases3K-5.0.16/setup.py -q bdist_egg --dist-dir /tmp/easy_install-l96v1pgt/GeoBases3K-5.0.16/egg-dist-tmp-k2nmp_du
/!\ Installing without "['OpenTrepWrapper>=0.6']"
error: Setup script exited with error: SandboxViolation: mkdir('/root/.zsh', 511) {}

The package setup script has attempted to modify files on your system
that are not within the EasyInstall build area, and has been aborted.

This package cannot be safely installed by EasyInstall, and may not
support alternate installation locations even if you run its setup
script by hand.  Please inform the package's author an

In [1]:
!pip install GeoBases3K

Collecting GeoBases3K
[?25l  Downloading https://files.pythonhosted.org/packages/f6/aa/409422515f10dd9e036e57dfa1137439d760165d3a3c9539cc36c7d72030/GeoBases3K-5.0.16.tar.gz (11.1MB)
[K    100% |████████████████████████████████| 11.1MB 790kB/s 
Collecting python_geohash (from GeoBases3K)
  Downloading https://files.pythonhosted.org/packages/9c/e2/1a3507af7c8f91f8a4975d651d4aeb6a846dfdf74713954186ade4205850/python-geohash-0.8.5.tar.gz
Collecting python_Levenshtein (from GeoBases3K)
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K    100% |████████████████████████████████| 51kB 18.8MB/s 
[?25hCollecting argparse (from GeoBases3K)
  Downloading https://files.pythonhosted.org/packages/f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl
Collecting colorama (from GeoBases3K)
  Downloading https://files.pythonhosted.org/

In [29]:
from GeoBases import GeoBase

#IMPORT DATA FROM GEOBASE
# Data source queried by get_name() below to retrieve airport names
geo_o = GeoBase(data='ori_por', verbose=False)


def get_name(IATA_code):
  """
    Function to return the name of the airport linked to IATA_code

    @IATA_code : String object which is a IATA code; spaces are removed
                 before the lookup. Any non-string value (e.g. NaN coming
                 from a missing CSV field) is treated as an unknown code

    @return    : String object which is the name of the airport, or
                 "NOT FOUND IATA CODE" when the code cannot be resolved
  """
  # Non-string values have no .replace(); report them as unknown instead of
  # raising AttributeError in the middle of a DataFrame.apply
  if not isinstance(IATA_code, str):
    return "NOT FOUND IATA CODE"

  try:
    return geo_o.get(IATA_code.replace(" ",""), 'name')
  # The lookup raises KeyError for codes missing from the data source
  except KeyError:
    return "NOT FOUND IATA CODE"
  


# Bring 'arr_port' back as a regular column so that a name can be looked up
# for every row (the grouped frame itself is left untouched)
new_df = BOOKINGS_GROUP_BY_ARR_PORT.reset_index()

# Add the airport name matching each IATA code
new_df['airport_name'] = new_df['arr_port'].apply(get_name)

new_df

Unnamed: 0,arr_port,pax,airport_name
0,AAB,30.0,Arrabury Airport
1,AAE,810.0,Rabah Bitat Annaba Airport
2,AAL,810.0,Aalborg Airport
3,AAN,20.0,Al Ain International Airport
4,AAQ,1650.0,Anapa Airport
5,AAR,500.0,Tirstrup Airport
6,ABA,90.0,Abakan International Airport
7,ABB,40.0,RAF Abingdon
8,ABE,880.0,Lehigh Valley International Airport
9,ABI,150.0,Abilene Regional Airport
