Skip to content

Commit

Permalink
new model
Browse files Browse the repository at this point in the history
  • Loading branch information
mkearney committed Feb 8, 2020
1 parent 6ac66f8 commit 49ef5d6
Show file tree
Hide file tree
Showing 7 changed files with 347 additions and 114 deletions.
201 changes: 200 additions & 1 deletion R/explain.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,5 +153,204 @@ feature_info <- c(
twt_min30 = "Tweet count min in 30-minute intervals",
twt_place = "Tweet location",
tweets = "Analyzed tweets (max 200)",
BIAS = "Intercept (y when x=0)"
BIAS = "Intercept (y when x=0)",
dtime001 = "Shortest (nth) time between tweets",
dtime002 = "Shortest (nth) time between tweets",
dtime003 = "Shortest (nth) time between tweets",
dtime004 = "Shortest (nth) time between tweets",
dtime005 = "Shortest (nth) time between tweets",
dtime006 = "Shortest (nth) time between tweets",
dtime007 = "Shortest (nth) time between tweets",
dtime008 = "Shortest (nth) time between tweets",
dtime009 = "Shortest (nth) time between tweets",
dtime010 = "Shortest (nth) time between tweets",
dtime011 = "Shortest (nth) time between tweets",
dtime012 = "Shortest (nth) time between tweets",
dtime013 = "Shortest (nth) time between tweets",
dtime014 = "Shortest (nth) time between tweets",
dtime015 = "Shortest (nth) time between tweets",
dtime016 = "Shortest (nth) time between tweets",
dtime017 = "Shortest (nth) time between tweets",
dtime018 = "Shortest (nth) time between tweets",
dtime019 = "Shortest (nth) time between tweets",
dtime020 = "Shortest (nth) time between tweets",
dtime021 = "Shortest (nth) time between tweets",
dtime022 = "Shortest (nth) time between tweets",
dtime023 = "Shortest (nth) time between tweets",
dtime024 = "Shortest (nth) time between tweets",
dtime025 = "Shortest (nth) time between tweets",
dtime026 = "Shortest (nth) time between tweets",
dtime027 = "Shortest (nth) time between tweets",
dtime028 = "Shortest (nth) time between tweets",
dtime029 = "Shortest (nth) time between tweets",
dtime030 = "Shortest (nth) time between tweets",
dtime031 = "Shortest (nth) time between tweets",
dtime032 = "Shortest (nth) time between tweets",
dtime033 = "Shortest (nth) time between tweets",
dtime034 = "Shortest (nth) time between tweets",
dtime035 = "Shortest (nth) time between tweets",
dtime036 = "Shortest (nth) time between tweets",
dtime037 = "Shortest (nth) time between tweets",
dtime038 = "Shortest (nth) time between tweets",
dtime039 = "Shortest (nth) time between tweets",
dtime040 = "Shortest (nth) time between tweets",
dtime041 = "Shortest (nth) time between tweets",
dtime042 = "Shortest (nth) time between tweets",
dtime043 = "Shortest (nth) time between tweets",
dtime044 = "Shortest (nth) time between tweets",
dtime045 = "Shortest (nth) time between tweets",
dtime046 = "Shortest (nth) time between tweets",
dtime047 = "Shortest (nth) time between tweets",
dtime048 = "Shortest (nth) time between tweets",
dtime049 = "Shortest (nth) time between tweets",
dtime050 = "Shortest (nth) time between tweets",
dtime051 = "Shortest (nth) time between tweets",
dtime052 = "Shortest (nth) time between tweets",
dtime053 = "Shortest (nth) time between tweets",
dtime054 = "Shortest (nth) time between tweets",
dtime055 = "Shortest (nth) time between tweets",
dtime056 = "Shortest (nth) time between tweets",
dtime057 = "Shortest (nth) time between tweets",
dtime058 = "Shortest (nth) time between tweets",
dtime059 = "Shortest (nth) time between tweets",
dtime060 = "Shortest (nth) time between tweets",
dtime061 = "Shortest (nth) time between tweets",
dtime062 = "Shortest (nth) time between tweets",
dtime063 = "Shortest (nth) time between tweets",
dtime064 = "Shortest (nth) time between tweets",
dtime065 = "Shortest (nth) time between tweets",
dtime066 = "Shortest (nth) time between tweets",
dtime067 = "Shortest (nth) time between tweets",
dtime068 = "Shortest (nth) time between tweets",
dtime069 = "Shortest (nth) time between tweets",
dtime070 = "Shortest (nth) time between tweets",
dtime071 = "Shortest (nth) time between tweets",
dtime072 = "Shortest (nth) time between tweets",
dtime073 = "Shortest (nth) time between tweets",
dtime074 = "Shortest (nth) time between tweets",
dtime075 = "Shortest (nth) time between tweets",
dtime076 = "Shortest (nth) time between tweets",
dtime077 = "Shortest (nth) time between tweets",
dtime078 = "Shortest (nth) time between tweets",
dtime079 = "Shortest (nth) time between tweets",
dtime080 = "Shortest (nth) time between tweets",
dtime081 = "Shortest (nth) time between tweets",
dtime082 = "Shortest (nth) time between tweets",
dtime083 = "Shortest (nth) time between tweets",
dtime084 = "Shortest (nth) time between tweets",
dtime085 = "Shortest (nth) time between tweets",
dtime086 = "Shortest (nth) time between tweets",
dtime087 = "Shortest (nth) time between tweets",
dtime088 = "Shortest (nth) time between tweets",
dtime089 = "Shortest (nth) time between tweets",
dtime090 = "Shortest (nth) time between tweets",
dtime091 = "Shortest (nth) time between tweets",
dtime092 = "Shortest (nth) time between tweets",
dtime093 = "Shortest (nth) time between tweets",
dtime094 = "Shortest (nth) time between tweets",
dtime095 = "Shortest (nth) time between tweets",
dtime096 = "Shortest (nth) time between tweets",
dtime097 = "Shortest (nth) time between tweets",
dtime098 = "Shortest (nth) time between tweets",
dtime099 = "Shortest (nth) time between tweets",
dtime100 = "Shortest (nth) time between tweets",
dtime101 = "Shortest (nth) time between tweets",
dtime102 = "Shortest (nth) time between tweets",
dtime103 = "Shortest (nth) time between tweets",
dtime104 = "Shortest (nth) time between tweets",
dtime105 = "Shortest (nth) time between tweets",
dtime106 = "Shortest (nth) time between tweets",
dtime107 = "Shortest (nth) time between tweets",
dtime108 = "Shortest (nth) time between tweets",
dtime109 = "Shortest (nth) time between tweets",
dtime110 = "Shortest (nth) time between tweets",
dtime111 = "Shortest (nth) time between tweets",
dtime112 = "Shortest (nth) time between tweets",
dtime113 = "Shortest (nth) time between tweets",
dtime114 = "Shortest (nth) time between tweets",
dtime115 = "Shortest (nth) time between tweets",
dtime116 = "Shortest (nth) time between tweets",
dtime117 = "Shortest (nth) time between tweets",
dtime118 = "Shortest (nth) time between tweets",
dtime119 = "Shortest (nth) time between tweets",
dtime120 = "Shortest (nth) time between tweets",
dtime121 = "Shortest (nth) time between tweets",
dtime122 = "Shortest (nth) time between tweets",
dtime123 = "Shortest (nth) time between tweets",
dtime124 = "Shortest (nth) time between tweets",
dtime125 = "Shortest (nth) time between tweets",
dtime126 = "Shortest (nth) time between tweets",
dtime127 = "Shortest (nth) time between tweets",
dtime128 = "Shortest (nth) time between tweets",
dtime129 = "Shortest (nth) time between tweets",
dtime130 = "Shortest (nth) time between tweets",
dtime131 = "Shortest (nth) time between tweets",
dtime132 = "Shortest (nth) time between tweets",
dtime133 = "Shortest (nth) time between tweets",
dtime134 = "Shortest (nth) time between tweets",
dtime135 = "Shortest (nth) time between tweets",
dtime136 = "Shortest (nth) time between tweets",
dtime137 = "Shortest (nth) time between tweets",
dtime138 = "Shortest (nth) time between tweets",
dtime139 = "Shortest (nth) time between tweets",
dtime140 = "Shortest (nth) time between tweets",
dtime141 = "Shortest (nth) time between tweets",
dtime142 = "Shortest (nth) time between tweets",
dtime143 = "Shortest (nth) time between tweets",
dtime144 = "Shortest (nth) time between tweets",
dtime145 = "Shortest (nth) time between tweets",
dtime146 = "Shortest (nth) time between tweets",
dtime147 = "Shortest (nth) time between tweets",
dtime148 = "Shortest (nth) time between tweets",
dtime149 = "Shortest (nth) time between tweets",
dtime150 = "Shortest (nth) time between tweets",
dtime151 = "Shortest (nth) time between tweets",
dtime152 = "Shortest (nth) time between tweets",
dtime153 = "Shortest (nth) time between tweets",
dtime154 = "Shortest (nth) time between tweets",
dtime155 = "Shortest (nth) time between tweets",
dtime156 = "Shortest (nth) time between tweets",
dtime157 = "Shortest (nth) time between tweets",
dtime158 = "Shortest (nth) time between tweets",
dtime159 = "Shortest (nth) time between tweets",
dtime160 = "Shortest (nth) time between tweets",
dtime161 = "Shortest (nth) time between tweets",
dtime162 = "Shortest (nth) time between tweets",
dtime163 = "Shortest (nth) time between tweets",
dtime164 = "Shortest (nth) time between tweets",
dtime165 = "Shortest (nth) time between tweets",
dtime166 = "Shortest (nth) time between tweets",
dtime167 = "Shortest (nth) time between tweets",
dtime168 = "Shortest (nth) time between tweets",
dtime169 = "Shortest (nth) time between tweets",
dtime170 = "Shortest (nth) time between tweets",
dtime171 = "Shortest (nth) time between tweets",
dtime172 = "Shortest (nth) time between tweets",
dtime173 = "Shortest (nth) time between tweets",
dtime174 = "Shortest (nth) time between tweets",
dtime175 = "Shortest (nth) time between tweets",
dtime176 = "Shortest (nth) time between tweets",
dtime177 = "Shortest (nth) time between tweets",
dtime178 = "Shortest (nth) time between tweets",
dtime179 = "Shortest (nth) time between tweets",
dtime180 = "Shortest (nth) time between tweets",
dtime181 = "Shortest (nth) time between tweets",
dtime182 = "Shortest (nth) time between tweets",
dtime183 = "Shortest (nth) time between tweets",
dtime184 = "Shortest (nth) time between tweets",
dtime185 = "Shortest (nth) time between tweets",
dtime186 = "Shortest (nth) time between tweets",
dtime187 = "Shortest (nth) time between tweets",
dtime188 = "Shortest (nth) time between tweets",
dtime189 = "Shortest (nth) time between tweets",
dtime190 = "Shortest (nth) time between tweets",
dtime191 = "Shortest (nth) time between tweets",
dtime192 = "Shortest (nth) time between tweets",
dtime193 = "Shortest (nth) time between tweets",
dtime194 = "Shortest (nth) time between tweets",
dtime195 = "Shortest (nth) time between tweets",
dtime196 = "Shortest (nth) time between tweets",
dtime197 = "Shortest (nth) time between tweets",
dtime198 = "Shortest (nth) time between tweets",
dtime199 = "Shortest (nth) time between tweets"
)
36 changes: 36 additions & 0 deletions R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,18 @@ preprocess_bot.data.table <- function(x, batch_size = 100, ...) {
}
## if no batches, process and return
preprocess_bot_init(x)
if (!is.factor(x[, user_id])) {
x[, user_id := factor(user_id)]
}
uid <- levels(x[, user_id])

if (is.null(batch_size) || isFALSE(batch_size) || length(uid) <= batch_size) {
x <- preprocess_bot_group(x)
if (!is_ids(ogusrs)) {
x <- x[match(tolower(ogusrs), tolower(x[, screen_name])), ]
} else {
x <- x[match(ogusrs, x[, user_id]), ]
}
attr(x, ".ogusrs") <- ogusrs
return(x)
}
Expand All @@ -115,6 +123,11 @@ preprocess_bot.data.table <- function(x, batch_size = 100, ...) {
x <- do.call("rbind", x)

## put original users info back
if (!is_ids(ogusrs)) {
x <- x[match(tolower(ogusrs), tolower(x[, screen_name])), ]
} else {
x <- x[match(ogusrs, x[, user_id]), ]
}
attr(x, ".ogusrs") <- ogusrs

## return
Expand Down Expand Up @@ -265,6 +278,28 @@ preprocess_bot_group <- function(data) {
usr_actyr <- NULL
tweets <- NULL

##----------------------------------------------------------------------------##
## DTIME (TIME BETWEEN TWEETS) ##
##----------------------------------------------------------------------------##
## create copy of timestamp info
m <- data.table::copy(data[, .(user_id, created_at)])
## calculate time (in minutes) between consecutive tweets for each user, then
## sort each user's gaps from shortest to longest (NA gaps sort last)
m <- m[, .(dtime = c(NA_real_, abs(as.numeric(diff(created_at), "mins")))), by = user_id][
order(user_id, dtime), .(dtime, varname = paste0("dtime", seq_len(.N))), by = user_id]
## create a complete all-NA template (dtime1 through dtime200) for every user
mna <- data.table::data.table(user_id = unique(m[, user_id]))[, .(dtime = NA_real_, varname = paste0("dtime", 1:200)), by = user_id]
## stack observed rows on top of the NA template, then drop template rows that
## duplicate an observed (user_id, varname) pair
m <- rbind(m, mna)[!duplicated(data.table::data.table(user_id, varname)), ]
## convert from long to wide for each user
m <- m[, {
structure(
as.list(dtime),
names = varname,
class = c("data.table", "data.frame")
)
}, by = user_id]
m <- m[, -"dtime200"]

##--------------------------------------------------------------------------##
## GROUP BY USER_ID ##
##--------------------------------------------------------------------------##
Expand Down Expand Up @@ -336,6 +371,7 @@ preprocess_bot_group <- function(data) {
]
data <- cbind(data[, -"usr_prfim"], model.matrix_(data[, .(usr_prfim)]))
data[, user_id := as.character(user_id)]
data <- merge(data, m)
data
}

Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
Loading

0 comments on commit 49ef5d6

Please sign in to comment.