In [1]:
library(tidyverse)
library(lubridate)
library(magrittr)

investigate_id <- function(id, t) {
  # Plot the bid history of a single eBay auction so it is easier to inspect.
  #
  # Args:
  #   id: auction (item) number, matched against the `item_id` column of `t`.
  #   t:  tibble of bids; this function reads the columns `item_id`,
  #       `bid_time`, `bid_price`, `buy_it_now` and `reserve_price`.
  #
  # Returns:
  #   Invisibly, the rows of `t` belonging to the auction, newest bid first.
  #   As a side effect the plot is printed.

  # `!!id` forces the function argument, so a column named `id` in `t`
  # cannot shadow it inside filter().
  auctions <- t %>%
    filter(item_id == !!id) %>%
    arrange(desc(bid_time))

  # With no matching rows max()/min() below would operate on empty input,
  # so bail out early and hand back the (empty) result.
  if (nrow(auctions) == 0) {
    warning("No bids found for item #", id, call. = FALSE)
    return(invisible(auctions))
  }

  bid_times <- auctions$bid_time

  # Name the colours explicitly instead of relying on the alphabetical
  # ordering of the legend labels.
  line_colours <- c(
    "BIN price" = "blue",
    "highest price" = "red",
    "reserve price" = "green"
  )

  p <- ggplot(auctions, aes(x = bid_time, y = bid_price)) +
    geom_hline(
      aes(yintercept = mean(buy_it_now), colour = "BIN price"),
      alpha = 0.5,
      size = 2
    ) +
    geom_hline(aes(yintercept = mean(reserve_price), colour = "reserve price")) +
    geom_hline(aes(yintercept = max(bid_price), colour = "highest price")) +
    scale_colour_manual(values = line_colours) +
    geom_point() +
    # Breaks on a datetime scale must be date-times: put one tick at every
    # distinct bid time rather than at the integer positions 1..n.
    scale_x_datetime(
      breaks = sort(unique(bid_times)),
      limits = range(bid_times)
    ) +
    labs(
      title = paste0("Investigation for the auction of item #", id),
      x = "Time of Bid",
      y = "Bid price"
    )
  print(p)

  invisible(auctions)
}

#R.Version()

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang


ERROR: Error: package or namespace load failed for ‘tidyverse’ in dyn.load(file, DLLpath = DLLpath, ...):
 unable to load shared object '/home/peters/miniconda3/envs/sage/lib/R/library/stringi/libs/stringi.so':
  libicui18n.so.58: cannot open shared object file: No such file or directory


In [None]:
# Read the raw, header-less, tab-separated bid data. Every column type is
# spelled out so nothing is left to readr's type guessing; columns get the
# default positional names X1..X17.
camera <- read_tsv(
  file = "camera.txt",
  col_names = FALSE,
  col_types = cols(
    X1 = col_integer(),
    X2 = col_integer(),
    X3 = col_character(),
    X4 = col_character(),
    X5 = col_double(),
    X6 = col_character(),
    X7 = col_double(),
    X8 = col_integer(),
    X9 = col_character(),
    X10 = col_double(),
    X11 = col_integer(),
    X12 = col_logical(),
    X13 = col_logical(),
    X14 = col_logical(),
    # X15-X17 hold timestamps; they are read as text and parsed with
    # lubridate in a later step
    X15 = col_character(),
    X16 = col_character(),
    X17 = col_character()
  )
)
# head(camera, n = 10)

In [None]:
# Give the positional columns meaningful names, parse the three timestamp
# columns, and drop every row without a usable bid time.
# X12, X13 and X14 keep their positional names.
camera_named <- camera %>%
  rename(
    bid_id = X1,
    item_id = X2,
    product = X3,
    model = X4,
    buy_it_now = X5,
    seller = X6,
    reserve_price = X7,
    seller_feedback = X8,
    bidder = X9,
    bid_price = X10,
    buyer_feedback = X11,
    bid_time = X15,
    start_time = X16,
    end_time = X17
  ) %>%
  mutate(
    # The timestamps are shifted by five years: per the original note they
    # should fall in 2005.
    # NOTE(review): `+ years(5)` yields NA for a 29 February whose target
    # year is not a leap year - confirm no such dates occur in the data.
    bid_time = ymd_hms(bid_time) + years(5),
    start_time = ymd_hms(start_time) + years(5),
    end_time = ymd_hms(end_time) + years(5)
  ) %>%
  # rows whose bid time is missing or unparseable are treated as invalid bids
  filter(!is.na(bid_time))
# head(camera_named, n = 10)

In [None]:
# Narrow the bid table down to the columns used by the per-auction analysis,
# then display it.
sub_camera <- select(
  camera_named,
  item_id, buy_it_now, bidder, bid_price,
  reserve_price, bid_time, start_time, end_time
)
sub_camera

In [None]:
# Keep only the auction, bidder, price and timing columns of the bid table.
sub_camera <- select(
  camera_named,
  item_id, buy_it_now, bidder, bid_price,
  reserve_price, bid_time, start_time, end_time
)


In [None]:
# One row per auction: the highest bid placed and the median buy-it-now
# price (stored under the name `selling_price`).
item_ids <- summarize(
  group_by(sub_camera, item_id),
  highest_bid = max(bid_price),
  selling_price = median(buy_it_now)
)
# table(item_ids$selling_price < item_ids$highest_bid)

# Auctions in which the highest bid exceeded that price.
ids_greater <- filter(item_ids, highest_bid > selling_price)
ids_greater

In [None]:
#filter(sub_camera,item_id == 324903140 )

In [None]:
#filter(camera_named,item_id == 324903140 )

In [None]:
#nrow(auctions) 

# 2) number of bidders: 18773, number of sellers: 1199
#    (figures as recorded in the original notes). Missing names are excluded
#    so that NA is not counted as a participant, consistent with the
#    per-auction count below.
n_distinct(camera_named$bidder, na.rm = TRUE)
n_distinct(camera_named$seller, na.rm = TRUE)

# 3) average number of bidders per trade: 9.46
camera_named %>%
  group_by(item_id) %>%
  summarize(n = n_distinct(bidder, na.rm = TRUE)) %>%
  summarize(mean = mean(n))

# The average number of bidders per trade is substantially higher than
# (number of bidders) / (number of trades) = 18773 / 4388 = 4.278259.
# This means some of the bidders are very active, i.e. they take part in
# many auctions.

# 4) how active are the bidders
#    Count the bids placed by each named bidder, print the table (most
#    active first) and draw a histogram of the per-bidder bid counts.
#    Recorded result: the most active bidder bids 154 times.
camera_named %>%
  filter(!is.na(bidder)) %>%
  count(bidder, sort = TRUE) %T>%
  print() %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 3)

# 5) bid time distribution: a very large share of the bids occurs near the
#    end of an auction and right after its start.
#    `bid_time_relative` is the elapsed time at the bid divided by the total
#    length of the auction.
ggplot(
  mutate(
    camera_named,
    bid_time_relative =
      as.duration(bid_time - start_time) / as.duration(end_time - start_time)
  ),
  aes(x = bid_time_relative)
) +
  geom_freqpoly()

# 6) transaction price (highest price) distribution:
#           for cameras, the prices are more diverse than expected, especially for the lower ranges
#           also, the distribution of the highest bids is very close to the distribution of the buy it now prices

# `trades` is not defined anywhere in the visible notebook. If no such table
# exists yet, derive one row per auction from `camera_named`.
# NOTE(review): this assumes `trades` is a per-auction summary with the
# columns `buy_it_now` and `highest_bid` used below - confirm against the
# original definition if one exists.
if (!exists("trades", mode = "list")) {
  trades <- camera_named %>%
    group_by(item_id) %>%
    summarize(
      buy_it_now = median(buy_it_now),
      highest_bid = max(bid_price)
    )
}

# red: buy-it-now prices, blue: highest bids
trades %>%
  ggplot() +
  geom_freqpoly(aes(x = buy_it_now), colour = 'red') +
  geom_freqpoly(aes(x = highest_bid), colour = 'blue')

