# Tube commute
Data analysis of the London Underground journey history based on the Oyster card data from the [Transport For London website](https://oyster.tfl.gov.uk/oyster/journeyHistory.do).
You can subscribe and get files containing all the trip details periodically emailed to you.

The goal of this project is to provide an easy way to explore TFL data. Once you have loaded the script in R, you can explore the data interactively. An example session could look like this:

In [None]:
# Load the project script. The helpers called below (load_folder,
# extract_commute_leg, merge_labeled) are not defined in this notebook, so they
# are presumably defined there, along with the ggplot2 import - TODO confirm.
source("./commute.r")

In [None]:
# Set the size at which the notebook renders plots (repr.plot.* options);
# the values are presumably in inches - TODO confirm.
options(repr.plot.width=7, repr.plot.height=4)

# Preparation of the data

In [None]:
# Load the journey history from the local data folder (presumably every
# exported file found in it - confirm against load_folder in commute.r).
tfl_data <- load_folder('~/Documents/Data/TFL/')
# Station names used as the two ends of the commute.
home <- "Tottenham Hale [London Underground]"
work <- "Warren Street"
# Extract each leg of the commute: home -> work and work -> home.
# NOTE(review): the last two arguments are presumably the earliest start hour
# (7 / 16) and the length of the accepted time window in minutes (60*3) -
# confirm against extract_commute_leg in commute.r.
commute.to <- extract_commute_leg(tfl_data, home, work, 7, 60*3)
commute.from <- extract_commute_leg(tfl_data, work, home, 16, 60*3)
# Combine both legs into a single data frame. The plots below read its Date,
# Start.Time, Duration and Direction columns.
commute.labeled <- merge_labeled(commute.to, commute.from)

In [None]:
# Sort each data frame by its Date column, in ascending order.
commute.to <- commute.to[order(commute.to$Date),]
commute.from <- commute.from[order(commute.from$Date),]
commute.labeled <- commute.labeled[order(commute.labeled$Date),]

In [None]:
# Preview the first rows of the combined data, then show a per-column summary.
head(commute.labeled)
summary(commute.labeled)

# Visualizations

In [None]:
# Scatter plot of the ride start time against the date, coloured by commute
# direction, with one linear trend line per direction.
#
# Args:
#   commute.labeled: data frame with the columns Date, Start.Time and
#     Direction. Start.Time is divided by 60 before plotting, so it is shown
#     as an hour of the day.
#
# Returns:
#   A ggplot object.
start_time_from_vs_to <- function(commute.labeled) {
  chart <- ggplot(
    commute.labeled,
    aes(x = Date, y = Start.Time / 60, colour = Direction)
  ) +
    geom_point() +
    # The trailing `+` matters: without it the scale below is evaluated as a
    # separate statement and never attached to the chart.
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    # Start.Time / 60 is numeric, so the axis needs a continuous scale with
    # one tick per hour rather than a discrete one.
    scale_y_continuous(
      name = "Ride start time (hour of the day)",
      breaks = 7:19
    ) +
    # Always show the 7 to 19 range without dropping rides outside of it.
    expand_limits(y = c(7, 19))
  return(chart)
}
start_time_from_vs_to(commute.labeled)

In [None]:
# Heatmap of the ride duration against the touch in time, with one panel per
# commute direction.
#
# Args:
#   commute.labeled: data frame with the columns Start.Time, Duration and
#     Direction. Start.Time is labelled as h:mm from a number of minutes and
#     Duration is labelled in minutes.
#
# Returns:
#   A ggplot object.
visualise_time_vs_duration <- function(commute.labeled) {
  # Turn a number of minutes into an "h:mm" axis label.
  as_clock_label <- function(minutes) {
    hours_part <- as.integer(minutes %/% 60)
    minutes_part <- as.integer(minutes %% 60)
    sprintf("%d:%02d", hours_part, minutes_part)
  }

  # A scatter plot would hide data: the more rides there are, the more likely
  # it is that several of them share the same touch in time and duration and
  # end up drawn on top of each other.
  # A heatmap avoids that, but it only reads well when the touch in time is
  # grouped into 5 minute bins; finer bins leave too many empty cells.
  # The ride duration keeps its full granularity (1 minute bins).
  touch_in_breaks <- seq(420, 1200, 30)

  # Blend the facet strips into the white background.
  facet_theme <- theme(
    strip.background = element_rect(colour = "white", fill = "white"),
    legend.position = "bottom"
  )

  ggplot(commute.labeled, aes(Start.Time, Duration)) +
    geom_bin2d(drop = TRUE, binwidth = c(5, 1), show.legend = FALSE) +
    facet_grid(. ~ Direction, scales = "free") +
    scale_x_continuous(
      "Touch in time",
      breaks = touch_in_breaks,
      labels = as_clock_label
    ) +
    scale_y_continuous(
      "Ride duration",
      labels = scales::unit_format(unit = "min")
    ) +
    scale_fill_continuous(
      "Count",
      low = "white",
      high = "black",
      limits = c(0, NA)
    ) +
    theme_bw() +
    facet_theme
}
visualise_time_vs_duration(commute.labeled)