### Ultrahack 2016 / MyData
    HSL: Kaupunkipyörien kysyntä 

In [1]:
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(geosphere))
suppressMessages(library(randomForest))
suppressMessages(library(parallel))

In [2]:
bikedata <- read.table('/users/ikonhen/omat/ultrahack/hsl_bike_data.csv', 
                       header = T, sep = '\t', stringsAsFactors = F)
str(bikedata)

'data.frame':	6167988 obs. of  9 variables:
 $ timestamp  : chr  "20160426T141326Z" "20160426T141326Z" "20160426T141326Z" "20160426T172701Z" ...
 $ name       : chr  "A21 Varsapuistikko" "B03 Haapaniemenkatu" "B08 Sörnäisten metroasema" "A21 Varsapuistikko" ...
 $ operative  : chr  "True" "True" "True" "True" ...
 $ style      : chr  "" "" "" "" ...
 $ lat        : num  60.2 60.2 60.2 60.2 60.2 ...
 $ lon        : num  24.9 25 25 24.9 25 ...
 $ total_slots: int  28 18 24 28 18 24 28 18 24 28 ...
 $ free_slots : int  28 18 24 28 18 23 28 18 22 28 ...
 $ avl_bikes  : int  0 0 0 0 0 1 0 0 2 0 ...


** Aggregoidaan aineisto tuntitasolle **

In [3]:
bikedata %>%

    tbl_df %>%

    mutate(station = substr(name, 1 ,3),
           month = as.numeric(substr(timestamp, 5, 6)),
           dayofmonth = as.numeric(substr(timestamp, 7, 8)),
           dayofweek = weekdays(as.Date(substr(timestamp, 1, 8), format = '%Y%m%d')),
           hour = as.numeric(substr(timestamp, 10, 11)),
           minute = as.numeric(substr(timestamp, 12, 13))) %>%

    select(-name, -timestamp, -operative, -style, -minute) %>%

    group_by(station, month, lat, lon, total_slots, dayofmonth, dayofweek, hour) %>%

    summarise(avg_availability = round(mean(avl_bikes)),
              max_availability = max(avl_bikes)) %>%

    mutate(hour_capacity_mean = round(avg_availability / total_slots)*100,
           hour_capacity_max = round(max_availability / total_slots)*100) -> bikedata

invisible(gc())
head(bikedata, 3)

Unnamed: 0,station,month,lat,lon,total_slots,dayofmonth,dayofweek,hour,avg_availability,max_availability,hour_capacity_mean,hour_capacity_max
1,A01,4,60.155411,24.950391,30,28,Torstai,5,0,0,0,0
2,A01,4,60.155411,24.950391,30,28,Torstai,8,0,0,0,0
3,A01,4,60.155411,24.950391,30,28,Torstai,9,0,0,0,0


** Haetaan asemille koordinaattitiedot ja lähimmät naapuriasemat **

In [4]:
station_info <- data.frame(station = unique(bikedata$station),
                           min_neighbor_dist = 0,
                           neighbor_1 = 'empty',
                           neighbor_2 = 'empty',
                           neighbor_3 = 'empty',
                           latitude = 0, 
                           longitude = 0)

factors <- sapply(station_info, is.factor)
station_info[factors] <- lapply(station_info[factors], as.character)                 

for (station in station_info$station) {
    stnumber <- grep(station, station_info$station)
    station_info[stnumber, c('latitude')] <- max(bikedata$lat[bikedata$station == station])
    station_info[stnumber, c('longitude')] <- max(bikedata$lon[bikedata$station == station])
}

head(station_info, 3)

Unnamed: 0,station,min_neighbor_dist,neighbor_1,neighbor_2,neighbor_3,latitude,longitude
1,A01,0,empty,empty,empty,60.155411,24.950391
2,A02,0,empty,empty,empty,60.159715,24.955212
3,A03,0,empty,empty,empty,60.158172,24.944808


In [5]:
for (station in station_info$station) {
    stnumber <- grep(station, station_info$station)
    other_stations <- station_info$station[-stnumber]
    distance <- c()
    results <- data.frame()
    i <- 1
    for (comparison in other_stations) {
        tryCatch({
            cmnumber <- grep(comparison, station_info$station)
            station_point <- station_info[stnumber, c('latitude', 'longitude')]
            comparison_point <- station_info[cmnumber, c('latitude', 'longitude')]
            distance[i] <- distHaversine(station_point, comparison_point)
            i <- i + 1
            }, error = function(e) { cat(paste("ERROR :", conditionMessage(e), station, comparison), "\n") })
    }
    results <- data.frame(station_1 = station, station_2 = other_stations, distance = distance)
    results <- results[order(distance), ]
    station_info[stnumber, c('min_neighbor_dist')] <- results[1, 3]
    station_info[stnumber, c('neighbor_1')] <- as.character(results[1, 2])
    station_info[stnumber, c('neighbor_2')] <- as.character(results[2, 2])
    station_info[stnumber, c('neighbor_3')] <- as.character(results[3, 2])
}

head(station_info, 3)

Unnamed: 0,station,min_neighbor_dist,neighbor_1,neighbor_2,neighbor_3,latitude,longitude
1,A01,681.115288302423,A03,A02,A07,60.155411,24.950391
2,A02,690.446191559871,A01,A13,A11,60.159715,24.955212
3,A03,396.897591474481,A04,A06,A07,60.158172,24.944808


** Mallinnusaineisto kasaan **

In [8]:
training_data <- left_join(bikedata, station_info[, c('station', 'min_neighbor_dist', 'neighbor_1', 
                                                      'neighbor_2', 'neighbor_3')],
                           by = 'station', all.x = T) 

str(training_data)

Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	134445 obs. of  16 variables:
 $ station           : chr  "A01" "A01" "A01" "A01" ...
 $ month             : num  4 4 4 4 4 4 4 4 4 4 ...
 $ lat               : num  60.2 60.2 60.2 60.2 60.2 ...
 $ lon               : num  25 25 25 25 25 ...
 $ total_slots       : int  30 30 30 30 30 30 30 30 30 30 ...
 $ dayofmonth        : num  28 28 28 28 28 28 29 29 29 29 ...
 $ dayofweek         : chr  "Torstai" "Torstai" "Torstai" "Torstai" ...
 $ hour              : num  5 8 9 10 11 13 5 11 12 13 ...
 $ avg_availability  : num  0 0 0 0 0 0 0 0 0 0 ...
 $ max_availability  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ hour_capacity_mean: num  0 0 0 0 0 0 0 0 0 0 ...
 $ hour_capacity_max : num  0 0 0 0 0 0 0 0 0 0 ...
 $ min_neighbor_dist : num  681 681 681 681 681 ...
 $ neighbor_1        : chr  "A03" "A03" "A03" "A03" ...
 $ neighbor_2        : chr  "A02" "A02" "A02" "A02" ...
 $ neighbor_3        : chr  "A07" "A07" "A07" "A07" ...
 - attr(*, "vars")=L

In [None]:
#täs menee joku elämä, vois vähä nopeuttaa looppia
#BR, 
#atk- spesialisti

training_data$neighbor_1_cap <- apply(training_data, 1, function(rowlookup) {
    
    neighbor_1 <- rowlookup[['neighbor_1']]
    neighbor_2 <- rowlookup[['neighbor_2']]
    neighbor_3 <- rowlookup[['neighbor_3']]
    month <- rowlookup[['month']]
    dayofmonth <- rowlookup[['dayofmonth']]
    dayofweek <- rowlookup[['dayofweek']]
    hour <- as.numeric(rowlookup[['hour']]) - 1
    
    lag_availability_1 <-training_data$avg_availability[training_data$station == neighbor_1 &
                                                        training_data$month == month &
                                                        training_data$dayofmonth == dayofmonth &
                                                        training_data$dayofweek == dayofweek &
                                                        training_data$hour == hour]
    
    return(lag_availability_1)
})