In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import sparse
from scipy import linalg
import scipy.sparse.linalg
from sklearn.cluster import KMeans

In [3]:
# Load the header-less route table, label its nine fields, and keep only
# the airline code and the two airport codes.
route_columns = ['Airline','AirlineID','SourceAirport','SourceAirportID','DestinationAirport','DestinationAirportID','Codeshare','Stops','Equipment']
routes = pd.read_csv('data/routes.dat', sep=',', header=None, encoding='utf-8')
routes.columns = route_columns
routes = routes[['Airline', 'SourceAirport', 'DestinationAirport']]
print(routes.head())
# Sanity check: True would mean some (airline, source, destination) row repeats.
print(routes.duplicated().any())

  Airline SourceAirport DestinationAirport
0      2B           AER                KZN
1      2B           ASF                KZN
2      2B           ASF                MRV
3      2B           CEK                KZN
4      2B           CEK                OVB
False


In [4]:
# Load the header-less alliance membership table: one row per airline,
# giving its alliance, IATA code and region.
alliance_columns = ['Alliance','IATA','Region']
alliances = pd.read_csv('data/alliances.dat', sep=',', header=None, encoding='utf-8')
alliances.columns = alliance_columns
print(alliances.head())
# Sanity check: True would mean a fully repeated row.
print(alliances.duplicated().any())

        Alliance IATA        Region
0  Star Alliance   JP        Europe
1  Star Alliance   A3        Europe
2  Star Alliance   AC  NorthAmerica
3  Star Alliance   CA          Asia
4  Star Alliance   NZ          Asia
False


In [5]:
# Load the header-less airline table and reduce it to a Name <-> IATA lookup.
airline_columns = ['Airline ID', 'Name', 'Alias',  'IATA', 'ICAO','Callsign','Country','Active']
airlines = pd.read_csv('data/airlines.dat', sep=',', header=None, encoding='utf-8')
airlines.columns = airline_columns
airlines = airlines[['Name', 'IATA']]

# Keep rows with a usable IATA code (present and not the '-' placeholder)
# and a non-missing name.
has_usable_code = airlines.IATA.notnull() & (airlines.IATA != '-')
has_name = airlines.Name.notnull()
airlines = airlines[has_usable_code & has_name]

# Remove exact duplicates, then keep the first row for each IATA code so the
# code can serve as a unique merge key.
airlines = airlines.drop_duplicates().drop_duplicates('IATA')
print(airlines.head())
print(airlines.duplicated(['IATA']).any())

# Distinct airline codes that appear in the route table, as a one-column
# frame keyed 'IATA' with a fresh 0..n-1 index.
airlineID = (
    routes[['Airline']]
    .rename(columns={'Airline': 'IATA'})
    .drop_duplicates()
    .reset_index(drop=True)
)
print(airlineID.head())
print(airlineID.duplicated().any())

# Right join: every row of `alliances` is kept, gaining Alliance/Region.
airlineID = airlineID.merge(alliances, on='IATA', how='right')
# Left join: attach the airline name where the IATA code is known.
airlineID = airlineID.merge(airlines, on='IATA', how='left')
# Materialise the positional index as an integer 'airlineID' column.
airlineID = airlineID.reset_index().rename(columns={'index': 'airlineID'})
print(airlineID.head())
print(airlineID.duplicated().any())

                    Name IATA
3          1Time Airline   1T
10           40-Mile Air   Q5
13      Ansett Australia   AN
14  Abacus International   1B
15       Abelag Aviation   W9
False
  IATA
0   2B
1   2G
2   2I
3   2J
4   2K
False
   airlineID IATA       Alliance        Region               Name
0          0   A3  Star Alliance        Europe    Aegean Airlines
1          1   AA      One World  NorthAmerica  American Airlines
2          2   AB      One World        Europe         Air Berlin
3          3   AC  Star Alliance  NorthAmerica         Air Canada
4          4   AF        SkyTeam        Europe         Air France
False


In [6]:
# Attach airline metadata to each route; the right join keeps every row of
# `airlineID` and drops routes whose airline is not listed there.
routesID = routes.merge(airlineID, left_on='Airline', right_on='IATA', how='right')

In [7]:
# Collect every airport code that occurs as a source or a destination.
source_airports = routesID[['SourceAirport']].rename(columns={'SourceAirport': 'Airport'})
dest_airports = routesID[['DestinationAirport']].rename(columns={'DestinationAirport': 'Airport'})

# De-duplicate, number the distinct codes 0..n-1, and index the table by
# airport code so it can be joined on the 'Airport' index level.
airport_codes = pd.concat([source_airports, dest_airports]).drop_duplicates().reset_index(drop=True)
airports = (
    airport_codes
    .reset_index()
    .rename(columns={'index': 'airportsID'})
    .set_index('Airport')
)
print(airports.head())
print(airports.duplicated().any())

         airportsID
Airport            
AOK               0
ARN               1
ATH               2
AUH               3
AXD               4
False


In [8]:
# Translate both airport codes of every route into integer ids by joining
# twice against `airports` (matched on its 'Airport' index level).
routesID = (
    routesID
    .merge(airports, left_on='SourceAirport', right_on='Airport', how='left')
    .rename(columns={'airportsID': 'SourceAirportID'})
    .merge(airports, left_on='DestinationAirport', right_on='Airport', how='left')
    .rename(columns={'airportsID': 'DestinationAirportID'})
)
print(routesID.head())

  Airline SourceAirport DestinationAirport  airlineID IATA       Alliance  \
0      A3           AOK                ATH          0   A3  Star Alliance   
1      A3           AOK                KSJ          0   A3  Star Alliance   
2      A3           AOK                RHO          0   A3  Star Alliance   
3      A3           ARN                KLX          0   A3  Star Alliance   
4      A3           ARN                SKG          0   A3  Star Alliance   

   Region             Name  SourceAirportID  DestinationAirportID  
0  Europe  Aegean Airlines                0                     2  
1  Europe  Aegean Airlines                0                    40  
2  Europe  Aegean Airlines                0                    63  
3  Europe  Aegean Airlines                1                    39  
4  Europe  Aegean Airlines                1                    64  


In [10]:
# Pair each route (suffix _x) with every route (suffix _y) that departs from
# the airport where the first one arrives, then keep only pairs operated by
# two different airlines.
route_legs = routesID.drop(columns=['Airline', 'SourceAirport', 'DestinationAirport'])
connections = route_legs.merge(
    route_legs,
    left_on='DestinationAirportID',
    right_on='SourceAirportID',
    how='inner',
)
different_airline = connections.airlineID_x != connections.airlineID_y
connections = connections[different_airline]
print(connections.head())

    airlineID_x IATA_x     Alliance_x Region_x           Name_x  \
65            0     A3  Star Alliance   Europe  Aegean Airlines   
66            0     A3  Star Alliance   Europe  Aegean Airlines   
67            0     A3  Star Alliance   Europe  Aegean Airlines   
68            0     A3  Star Alliance   Europe  Aegean Airlines   
69            0     A3  Star Alliance   Europe  Aegean Airlines   

    SourceAirportID_x  DestinationAirportID_x  airlineID_y IATA_y Alliance_y  \
65                  0                       2            1     AA  One World   
66                  0                       2            1     AA  One World   
67                  0                       2            4     AF    SkyTeam   
68                  0                       2            4     AF    SkyTeam   
69                  0                       2            4     AF    SkyTeam   

        Region_y             Name_y  SourceAirportID_y  DestinationAirportID_y  
65  NorthAmerica  American Airlines

In [11]:
# Weighted airline-to-airline adjacency matrix.
# Entry [a, b] is the number of connecting route pairs between airlines a and
# b, counted in both directions so the matrix is symmetric.
grouped = connections[['airlineID_x','airlineID_y']].groupby(['airlineID_x','airlineID_y'])
group_sizes = grouped.size()
n_airlines = len(airlineID)
adjacency_airlines = np.zeros((n_airlines, n_airlines))

# Scatter the per-pair counts straight from the group-size index instead of
# materialising each group's DataFrame just to read its key.
first_ids = np.asarray(group_sizes.index.get_level_values('airlineID_x'), dtype=np.intp)
second_ids = np.asarray(group_sizes.index.get_level_values('airlineID_y'), dtype=np.intp)
pair_counts = np.asarray(group_sizes)
np.add.at(adjacency_airlines, (first_ids, second_ids), pair_counts)
np.add.at(adjacency_airlines, (second_ids, first_ids), pair_counts)

# Remove every link between airlines of the same region (this also clears the
# diagonal). Row position i of `airlineID` is airline id i because the id
# column was created from a fresh positional index.
regions = np.asarray(airlineID['Region'])
same_region = regions[:, None] == regions[None, :]
adjacency_airlines[same_region] = 0

In [13]:
# Row-normalise the connection counts so each airline's outgoing weights sum
# to 1, then symmetrise by taking the larger of the two directed weights.
row_sums = adjacency_airlines.sum(axis=1, keepdims=True)
# Guarded division: an airline with no remaining connection has a zero row
# sum; leave its row at 0 instead of producing 0/0 = NaN.
adjacency = np.divide(
    adjacency_airlines,
    row_sums,
    out=np.zeros_like(adjacency_airlines, dtype=float),
    where=row_sums != 0,
)
# adjacency[i, j] = adjacency[j, i] = max of the two normalised weights.
adjacency = np.maximum(adjacency, adjacency.T)

In [14]:
# Combinatorial Laplacian L = D - A and symmetric normalised Laplacian
# D^{-1/2} L D^{-1/2} of the weighted airline graph.
degrees = np.sum(adjacency, axis=0)
degree_matrix = np.diag(degrees)
laplacian_combinatorial = degree_matrix - adjacency
# Guarded inverse: a zero-degree (isolated) node gets 0 instead of 1/0 = inf,
# which would otherwise turn into NaN (inf * 0) in the matrix products below.
inv_degrees = np.divide(
    1.0,
    degrees,
    out=np.zeros_like(degrees, dtype=float),
    where=degrees != 0,
)
sqrt_inv_degree_matrix = np.diag(np.sqrt(inv_degrees))
laplacian_normalized = sqrt_inv_degree_matrix @ laplacian_combinatorial @ sqrt_inv_degree_matrix

In [15]:
# Spectral decomposition of the normalised Laplacian.
# The matrix is symmetric, so use the symmetric solver `eigh`: it returns
# real eigenvalues and orthonormal eigenvectors, whereas the general `eig`
# can return complex values when round-off makes the matrix slightly
# asymmetric.
eigenvalues, eigenvectors = np.linalg.eigh(laplacian_normalized)
# `eigh` already returns ascending eigenvalues; the explicit sort keeps
# `sortID` available and the ordering contract obvious.
sortID = np.argsort(eigenvalues)
eigenvalues = eigenvalues[sortID]

# Column i of `eigenvectors` belongs to eigenvalues[i].
eigenvectors = eigenvectors[:, sortID]
print(eigenvalues)

[0.         0.6270841  0.70021003 0.73832356 0.76480485 0.81365115
 0.84317644 0.86224283 0.87383038 0.88286791 0.90362832 0.91246711
 0.91860535 0.92866357 0.9343869  0.94483274 0.94741903 0.95529884
 0.96186223 0.96820074 0.97023991 0.97502156 0.97980128 0.98744529
 0.98788189 0.99478748 0.99594489 0.99647469 0.99801135 1.00022948
 1.00536154 1.0068493  1.00801192 1.01138557 1.01386637 1.0205817
 1.02694718 1.03132827 1.03457913 1.03571588 1.04231021 1.04781818
 1.05381389 1.05980289 1.06473927 1.07671963 1.08398522 1.09210787
 1.09860066 1.11197817 1.12945366 1.13896309 1.15439311 1.17363824
 1.19445463 1.20579017 1.23735574 1.31425576 1.5204135  1.63738537]


In [21]:
# Spectral clustering: embed each airline with the first d eigenvectors and
# split the embedding into k groups with k-means (fixed seed).
k = 3; d = 3
H = eigenvectors[:, :d]
clusters3 = KMeans(n_clusters=k, random_state=0).fit_predict(H)

print("----- For k=",k," and d=",d," -----")
print("Number of elements in clusters :")
# Count members per label in one pass; labels are 0..k-1, reported 1-based.
cluster_sizes = np.bincount(clusters3, minlength=k)
for i in range(k):
    print("Cluster ",i+1,":",int(cluster_sizes[i]))

----- For k= 3  and d= 3  -----
Number of elements in clusters :
Cluster  1 : 24
Cluster  2 : 24
Cluster  3 : 12


In [22]:
# List the airlines assigned to each of the three clusters, one table per label.
member_columns = ['IATA', 'Alliance', 'Name']
for cluster_label in (0, 1, 2):
    print(airlineID[clusters3 == cluster_label][member_columns])

   IATA       Alliance                          Name
0    A3  Star Alliance               Aegean Airlines
2    AB      One World                    Air Berlin
3    AC  Star Alliance                    Air Canada
12   CA  Star Alliance                     Air China
18   ET  Star Alliance            Ethiopian Airlines
22   JP  Star Alliance                 Adria Airways
23   KE        SkyTeam                    Korean Air
27   LH  Star Alliance                     Lufthansa
28   LO  Star Alliance           LOT Polish Airlines
30   ME        SkyTeam          Middle East Airlines
33   MS  Star Alliance                      Egyptair
37   OK        SkyTeam                Czech Airlines
38   OS  Star Alliance             Austrian Airlines
40   OZ  Star Alliance               Asiana Airlines
43   RJ      One World               Royal Jordanian
44   RO        SkyTeam                         Tarom
45   S7      One World                   S7 Airlines
47   SK  Star Alliance  Scandinavian Airlines 