# DS8017 - Assignment 3

### Nujaimah Ahmed - 500955409

In [1]:
## directory holding the assignment datasets (connections.csv, airports_loc.csv);
## set the DS8017_DATADIR environment variable to run the notebook on another
## machine, otherwise the author's local folder is used
import os

datadir = os.environ.get("DS8017_DATADIR", "/Users/nujaimah/Desktop/Assignment3")

In [2]:
import igraph as ig
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mode
import random

In [3]:
## grey-scale palette, ordered from the palest shade to the darkest
colors = [
    "gainsboro",
    "silver",
    "darkgray",
    "dimgray",
    "black",
]

## the three node sizes used in the plots (small, medium, large)
node_sizes = [
    6,
    9,
    12,
]

## Question 1)

In [4]:
## load the airport-to-airport connection list
df = pd.read_csv(f"{datadir}/connections.csv")

## each row becomes one directed edge; the value after the two endpoints
## is stored as the edge attribute "weight"
g_airport = ig.Graph.TupleList(
    list(map(tuple, df.to_numpy())),
    directed=True,
    edge_attrs=["weight"],
)

## preview the first few edges
df.head()

Unnamed: 0,orig_airport,dest_airport,total_passengers
0,SFO,LAX,1442105
1,LAX,SFO,1438639
2,MCO,ATL,1436625
3,ATL,MCO,1424069
4,LAX,JFK,1277731


In [5]:
## load the per-airport attributes that will be attached to the vertices
Attr = pd.read_csv(f"{datadir}/airports_loc.csv")

## row number in Attr of every airport code, then the Attr row belonging to
## each vertex, listed in the graph's own vertex order
lookup = {code: row for row, code in enumerate(Attr["airport"])}
l = [lookup[name] for name in g_airport.vs["name"]]

## attach the attributes; "layout" holds one (lon, lat) tuple per vertex
g_airport.vs["layout"] = [(Attr.loc[i, "lon"], Attr.loc[i, "lat"]) for i in l]
g_airport.vs["state"] = [Attr.loc[i, "state"] for i in l]
g_airport.vs["city"] = [Attr.loc[i, "city"] for i in l]

## preview the first few attribute rows
Attr.head()

Unnamed: 0,airport,lon,lat,state,city
0,ABE,-75.440804,40.6521,PA,Allentown
1,ABI,-99.6819,32.411301,TX,Abilene
2,ABQ,-106.609001,35.040199,NM,Albuquerque
3,ABR,-98.421799,45.4491,SD,Aberdeen
4,ABY,-84.194504,31.5355,GA,Albany


In [6]:
## default plotting attributes shared by all vertices and all edges
g_airport.vs["size"] = node_sizes[1]
g_airport.vs["color"] = colors[3]
g_airport.es["color"] = colors[0]
g_airport.es["arrow_size"] = 0.5

## report the size of the full airport graph
print(f"Airport graph: {g_airport.vcount()} nodes and {g_airport.ecount()} directed edges")

Airport graph: 464 nodes and 12000 directed edges


In [7]:
## build the smaller New York subgraph: keep only vertices whose state is "NY"
ny_vertices = [v for v in g_airport.vs if v["state"] == "NY"]
g_NY = g_airport.subgraph(ny_vertices)

## discard vertices left with degree 0, i.e. without any in-state connection
connected_vertices = [v for v in g_NY.vs if v.degree() > 0]
g_NY = g_NY.subgraph(connected_vertices)

## remove self-loops (if any) while keeping multi-edges
g_NY = g_NY.simplify(loops=True, multiple=False)
print(f"{g_NY.vcount()} nodes and {g_NY.ecount()} directed edges")

13 nodes and 50 directed edges


In [8]:
## scale every edge weight of the NY subgraph by the largest edge weight
edge_weights = g_NY.es["weight"]
max_weight = np.max(edge_weights)
g_NY.es["normalized_weight"] = [w / max_weight for w in edge_weights]

## degree centrality for directed or undirected graphs (weighted if requested)
def degree_centrality(g, weights=None):
    """Return the (optionally weighted) degree centrality of every vertex.

    Parameters
    ----------
    g : graph object exposing ``vcount()``, ``is_directed()`` and
        ``strength(mode=..., weights=...)`` (an igraph ``Graph`` in this notebook).
    weights : edge-attribute name (or list of weights) forwarded to
        ``g.strength``; ``None`` is forwarded unchanged.

    Returns
    -------
    list of float, one value per vertex in vertex order.
        Directed graph: (in-strength + out-strength) / (2 * (n - 1)).
        Undirected graph: strength / (n - 1).
        Graphs with fewer than two vertices have no possible neighbour, so
        every vertex gets 0.0 (an empty graph yields an empty list).
    """
    n = g.vcount()
    ## with n < 2 the denominator (n - 1) would be zero
    if n < 2:
        return [0.0] * n

    if g.is_directed():
        in_strength = g.strength(mode="in", weights=weights)
        out_strength = g.strength(mode="out", weights=weights)
        ## n - 1 = max number of neighbours a node could have;
        ## divide by 2 because both in- and out-degrees are counted
        return [
            (s_in + s_out) / (2 * (n - 1))
            for s_in, s_out in zip(in_strength, out_strength)
        ]

    ## undirected graph: every edge is counted once, so no division by 2
    return [s / (n - 1) for s in g.strength(weights=weights)]


## closeness centrality; unreachable pairs are given distance n (number of nodes)
def closeness_centrality(g):
    """Return the closeness centrality of every vertex of ``g``.

    Distances come from ``g.distances(mode="all")`` with no weights passed.
    An infinite distance (the two vertices lie in different connected
    components) is replaced by ``n``, the number of vertices, which is one
    possible convention for graphs with several components.

    Parameters
    ----------
    g : graph object exposing ``vcount()`` and ``distances(mode=...)``
        (an igraph ``Graph`` in this notebook).

    Returns
    -------
    list of float, one value per vertex: (n - 1) / (sum of distances from
    that vertex). Graphs with fewer than two vertices would give a zero
    distance sum, so every vertex gets 0.0 (an empty graph yields an
    empty list).
    """
    n = g.vcount()
    ## a lone vertex has distance sum 0, which would divide by zero below
    if n < 2:
        return [0.0] * n

    ## pairwise distance matrix, forced to float so inf entries are representable
    D = np.array(g.distances(mode="all"), dtype=float)
    D[np.isinf(D)] = n
    return ((n - 1) / D.sum(axis=1)).tolist()

In [None]:
## compute several centrality measures for the New York subgraph g_NY
df_central = pd.DataFrame(
    {
        "airport": g_NY.vs()["name"],
        "degree": degree_centrality(g_NY, weights="normalized_weight"),
        "pagerank": g_NY.pagerank(weights="weight"),
        "authority": g_NY.authority_score(weights="weight"),
        "hub": g_NY.hub_score(weights="weight"),
        "between": g_NY.betweenness(),
        "harmonic": g_NY.harmonic_centrality(),
        "closeness": closeness_centrality(g_NY),
        "eccentricity": g_NY.eccentricity(),
    }
)

## normalize the betweenness values, not normalized in igraph.
## In a directed graph a node can lie between (n - 1)(n - 2) ordered
## source/target pairs; an undirected graph has half as many (unordered)
## pairs, so the factor of 2 applies only in the undirected case.
n = g_NY.vcount()
max_between = (n - 1) * (n - 2)
if not g_NY.is_directed():
    max_between = max_between / 2
## with n <= 2 there are no intermediate nodes: betweenness is already 0
## everywhere and dividing would be a division by zero
if max_between > 0:
    df_central["between"] = df_central["between"] / max_between

## sort w.r.t. degree centrality, look at top airports
df_central = df_central.sort_values(by="degree", ascending=False)
df_central.head(5)

### Question 1 Analysis

**TODO:** write up the Question 1 analysis (interpretation of the centrality measures computed above).

## Question 2)

## Question 3)

## Question 5)