# EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

%matplotlib inline

In [2]:
DATA_PATH = "data/2008.csv"

# Read the whole file, then narrow it down to June only so that the rest of
# the exploration works on a single month of flights.
df = pd.read_csv(DATA_PATH)
df = df.loc[df["Month"] == 6]

### Some Cleaning

Let's keep only "valid" flights, i.e. those that weren't cancelled and have known departure and arrival times.

In [3]:
# A flight counts as valid when it was not cancelled and both its departure
# and arrival times are present.
valid_flight_mask = (
    df["Cancelled"].eq(0)
    & df["ArrTime"].notna()
    & df["DepTime"].notna()
)

print(f"{valid_flight_mask.mean():.1%} of flights are valid")
df = df.loc[valid_flight_mask]

97.8% of flights are valid


### Time

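Departure and arrival times are clock times encoded as `hhmm` numbers (e.g. `1330` means 13:30), so the last two digits are minutes rather than hundredths of an hour.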

In [4]:
# Summary statistics of the two raw clock-time columns.
df.loc[:, ["DepTime", "ArrTime"]].describe()

Unnamed: 0,DepTime,ArrTime
count,595458.0,595458.0
mean,1338.366889,1466.740123
std,490.269416,526.678842
min,1.0,1.0
25%,925.0,1053.0
50%,1326.0,1503.0
75%,1738.0,1911.0
max,2400.0,2400.0


In [5]:
# Fraction of flights whose arrival clock time is not later than the departure clock time.
through_midnight = df["ArrTime"].le(df["DepTime"]).mean()
print(f"In {through_midnight:.1%} of cases ArrTime <= DepTime")

In 4.1% of cases ArrTime <= DepTime


We will stick to the following assumptions:
- `ArrTime <= DepTime` $\Rightarrow$ arrival happens on the next day
- `ArrTime > DepTime` $\Rightarrow$ departure and arrival both happen on the same day

We'll convert all time-related columns to Unix timestamps (in seconds) for convenience.

In [6]:
def hhmm_to_seconds(hhmm):
    """Convert clock times encoded as HHMM numbers to seconds since midnight.

    The last two digits are minutes and the leading digits are hours, so
    1330 means 13:30 and maps to 13 * 3600 + 30 * 60 seconds. Works
    element-wise on a pandas Series as well as on a plain number.
    """
    hours, minutes = hhmm // 100, hhmm % 100
    return hours * 3600 + minutes * 60


# Midnight of each flight's calendar date, as whole seconds since the Unix epoch.
# Dividing a timedelta by one second is independent of the datetime resolution,
# unlike casting to integer nanoseconds and dividing by 10**9.
dates = pd.to_datetime(df.rename(columns={'DayofMonth': 'Day'})[['Year', 'Month', 'Day']])
dateTs = (dates - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

# DepTime / ArrTime are HHMM clock values, so they have to be split into hours
# and minutes: dividing by 100 would read 1330 as 13.30 h (13:18), not 13:30.
# NOTE(review): these clock times are presumably local to each airport - confirm;
# if so, depTs / arrTs are not true UTC instants across time zones.
df['depTs'] = dateTs + hhmm_to_seconds(df.DepTime)
# An arrival clock time that is not later than the departure one is assumed to
# fall on the next day, hence the extra 86400 seconds.
df['arrTs'] = dateTs + hhmm_to_seconds(df.ArrTime) + 86400 * (df.ArrTime <= df.DepTime).astype(int)

### Geography

In [7]:
# Distinct airport codes seen as a departure point, as an arrival point, and overall.
origins = set(df["Origin"].unique())
destinations = set(df["Dest"].unique())
nodes = origins | destinations
print(f"{len(origins)} unique origins\n{len(destinations)} unique destinations\n{len(nodes)} unique points")

293 unique origins
293 unique destinations
293 unique points
