Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
214 lines (176 sloc) 7.67 KB
## Register the column names that this file refers to through non-standard
## evaluation, so they are declared as known global variables.
utils::globalVariables(
  c(
    "account_created_at",
    "created_at",
    "favorite_count",
    "favourites_count",
    "followers_count",
    "friends_count",
    "is_quote",
    "is_retweet",
    "listed_count",
    "n",
    "n_tweets",
    "retweet_count",
    "statuses_count",
    "text",
    "user_id",
    "verified",
    "years_on_twitter",
    "description",
    "location",
    "name"
  )
)
## Sum of `x` with missing values dropped before adding.
sum_ <- function(x) {
  sum(x, na.rm = TRUE)
}
## Largest value of `x` with missing values dropped before comparing.
max_ <- function(x) {
  max(x, na.rm = TRUE)
}
## Arithmetic mean of `x` with missing values dropped before averaging.
mean_ <- function(x) {
  mean(x, na.rm = TRUE)
}
## Does each element of `x` match the regular expression `pat`?
## Matching uses grepl()'s defaults, so it is case-sensitive.
## NOTE(review): callers pass lowercase patterns such as "iphone"; confirm the
## matched values are lowercase too, otherwise those patterns never match.
grepl_ <- function(pat, x) {
  grepl(pattern = pat, x = x)
}
#' Extract user-level features from a data frame of tweets
#'
#' Builds text features from tweet text and from each user's description,
#' location and name, combines them with per-user activity and account
#' summaries, and returns one row per `user_id`.
#'
#' @param x Data frame of tweets. The code reads the columns `user_id`,
#'   `text`, `is_retweet`, `is_quote`, `retweet_count`, `favorite_count`,
#'   `favourites_count`, `created_at`, `source`, `verified`,
#'   `account_created_at`, `statuses_count`, `followers_count`,
#'   `friends_count`, `listed_count`, `description`, `location` and `name`.
#' @return A data frame with one row per `user_id`; every other column is the
#'   per-user mean (missing values ignored) of a feature.
#' @importFrom rlang .data
#' @noRd
extract_features_ytweets <- function(x) {
  ## prefix every column name except the leading id column
  with_prefix <- function(features, prefix) {
    names(features)[-1] <- paste0(prefix, names(features)[-1])
    features
  }
  ## matches of `pattern` in `src`, divided by `total`
  source_share <- function(pattern, src, total) {
    sum_(grepl_(pattern, src)) / total
  }

  ## retweets contribute neither text nor retweet counts
  x$text[x$is_retweet] <- NA_character_
  x$retweet_count[x$is_retweet] <- NA_integer_

  ## the first row seen for each user supplies the profile fields
  profiles <- dplyr::filter(x, !duplicated(.data$user_id))

  ## features of the tweet text (rows with missing text are left out)
  with_text <- x[!is.na(x$text), ]
  txt_df <- with_prefix(
    tf(dplyr::select(with_text, user_id = user_id, text = text)),
    "txt_"
  )
  ## NOTE(review): labelled "base64 version" by the original author, but it
  ## is computed from the same input as the "txt_" features -- confirm intent.
  b64_df <- with_prefix(
    tf(dplyr::select(with_text, user_id = user_id, text = text)),
    "b64_"
  )

  ## features of the profile description, location and name
  dsc_df <- with_prefix(
    tf(dplyr::select(profiles, user_id = user_id, text = description)),
    "dsc_"
  )
  loc_df <- with_prefix(
    tf(dplyr::select(profiles, user_id = user_id, text = location)),
    "loc_"
  )
  nm_df <- with_prefix(
    tf(dplyr::select(profiles, user_id = user_id, text = name)),
    "nm_"
  )

  ## bind the feature sets side by side, then attach profile features to the
  ## tweet-level rows by user
  tweet_feats <- cbind(txt_df, b64_df[-1])
  profile_feats <- cbind(dsc_df, loc_df[-1])
  profile_feats <- cbind(profile_feats, nm_df[-1])
  text_feats <- dplyr::left_join(tweet_feats, profile_feats, by = "user_id")

  ## per-user activity and account summaries; later entries reuse values
  ## computed by earlier ones, so the order below matters
  per_user <- dplyr::summarise(
    dplyr::group_by(x, user_id),
    n_sincelast = count_mean(since_last(.data$created_at)),
    n_timeofday = count_mean(hourofweekday(.data$created_at)),
    n = dplyr::n(),
    n_retweets = sum_(.data$is_retweet),
    n_quotes = sum_(.data$is_quote),
    retweet_count = mean_(c(0, .data$retweet_count)),
    favorite_count = mean_(c(0, .data$favorite_count)),
    favourites_count = max_(c(0, .data$favourites_count)),
    n_tweets = sum_(!.data$is_retweet & !.data$is_quote),
    iphone = source_share("iphone", .data$source, .data$n),
    webclient = source_share("web client", .data$source, .data$n),
    android = source_share("android", .data$source, .data$n),
    hootsuite = source_share("hootsuite", .data$source, .data$n),
    lite = source_share("twitter lite", .data$source, .data$n),
    ipad = source_share("for iPad", .data$source, .data$n),
    google = source_share("google", .data$source, .data$n),
    ifttt = source_share("IFTTT", .data$source, .data$n),
    facebook = source_share("facebook", .data$source, .data$n),
    verified = as.integer(.data$verified[1]),
    years_on_twitter = as.numeric(
      difftime(Sys.time(), .data$account_created_at[1], units = "days")
    ) / 365,
    ## the added 1 keeps the denominator away from zero
    tweets_per_year = .data$n_tweets / (1 + .data$years_on_twitter),
    statuses_count = max_(c(0, .data$statuses_count)),
    followers_count = max_(c(0, .data$followers_count)),
    friends_count = max_(c(0, .data$friends_count)),
    listed_count = max_(c(0, .data$listed_count)),
    ## +1 (or +.001) in the ratios below avoids NaN and division by zero
    tweets_to_followers = (.data$statuses_count + 1) /
      (.data$followers_count + 1),
    statuses_rate = (.data$statuses_count + 1) /
      (.data$years_on_twitter + .001),
    ff_ratio = (.data$followers_count + 1) /
      (.data$friends_count + .data$followers_count + 1)
  )

  ## `n` was only needed as the denominator of the source shares
  per_user <- per_user[setdiff(names(per_user), "n")]

  ## collapse to one row per user by averaging every feature
  combined <- dplyr::full_join(per_user, text_feats, by = "user_id")
  dplyr::summarise_all(
    dplyr::group_by(combined, user_id),
    mean,
    na.rm = TRUE
  )
}
## Keep only the columns of `data` that can be used for modelling: numeric,
## not entirely missing, not a list column with elements of length other than
## one, and with a variance greater than zero. Missing values are ignored when
## the variance is computed, so a column holding a few NAs no longer produces
## an NA column index.
model_columns_ <- function(data) {
  is_usable <- function(col) {
    if (all(is.na(col)) || any(lengths(col) != 1L) || !is.numeric(col)) {
      return(FALSE)
    }
    ## isTRUE() maps an undefined variance (fewer than two values) to FALSE
    isTRUE(stats::var(col, na.rm = TRUE) > 0)
  }
  data[vapply(data, is_usable, logical(1))]
}

## Fit a gradient boosted model predicting `bot` from every other usable
## column of `data`.
##
## @param data Data frame with a numeric `bot` column and feature columns.
##   Columns that fail the screening in `model_columns_()` are dropped.
## @param n_trees Number of trees passed to `gbm::gbm()` as `n.trees`.
## @return The fitted object returned by `gbm::gbm()`.
train_model <- function(data, n_trees = 1000) {
  data <- model_columns_(data)
  if (!"bot" %in% names(data)) {
    stop(
      "`data` must contain a numeric `bot` column that is not constant.",
      call. = FALSE
    )
  }
  ## set params and run model (~ . means use all other variables)
  gbm::gbm(
    bot ~ .,
    data = data,
    n.trees = n_trees,
    interaction.depth = 2,
    cv.folds = 3,
    train.fraction = 1.0,
    verbose = FALSE,
    distribution = "bernoulli",
    shrinkage = .1
  )
}
## Cross-tabulate predicted class (rows) against actual class (columns).
## Both dimensions always carry the levels FALSE and TRUE, so the result is
## 2 x 2 even when every prediction falls on one side of .5 or only one class
## is present. `bot` is expected to be logical or coded 0/1.
confusion_table_ <- function(pred, bot) {
  if (is.factor(bot) || is.character(bot)) {
    bot <- as.numeric(as.character(bot))
  }
  table(
    predicted = factor(pred > .5, levels = c(FALSE, TRUE)),
    bot = factor(as.logical(bot), levels = c(FALSE, TRUE))
  )
}

## Print the percent correct (for bots, for non-bots, and overall).
##
## @param data Data frame with a `bot` column and the predictors used by `m`.
## @param m Fitted gbm model; the number of trees used for prediction is
##   chosen by `gbm::gbm.perf()` with the "cv" method.
## @param n_trees Not used by this function; kept so existing calls still
##   work.
## @return Called for its messages; returns `NULL` invisibly.
percent_correct <- function(data, m, n_trees = 500) {
  best_iter <- gbm::gbm.perf(m, method = "cv", plot.it = FALSE)
  data$pred <- gbm::predict.gbm(
    m,
    newdata = data,
    n.trees = best_iter,
    type = "response"
  )
  x <- confusion_table_(data$pred, data$bot)
  ## bots predicted as bots, out of all bots
  pc <- round((x[2, 2]) / sum_(x[, 2]), 4)
  pc <- as.character(pc * 100)
  message(sprintf("The model was %s%% accurate when classifying bots.\n", pc))
  ## non-bots predicted as non-bots, out of all non-bots
  pc <- round((x[1, 1]) / sum_(x[, 1]), 4)
  pc <- as.character(pc * 100)
  message(sprintf("The model was %s%% accurate when classifying non-bots.\n",
    pc))
  ## all correct predictions, out of all rows
  pc <- round((x[1, 1] + x[2, 2]) / sum_(c(x[, 1], x[, 2])), 3)
  pc <- as.character(pc * 100)
  message(sprintf("Overall, the model was correct %s%% of the time.", pc))
}
## Predict from a fitted gbm model on the response scale.
##
## @param x Data frame of features passed to the model as `newdata`.
## @param model Fitted gbm model.
## @param n_trees Number of trees used for prediction. Defaults to 700, the
##   value that used to be hard-coded here.
## @return The value of `gbm::predict.gbm()` with `type = "response"`.
classify_data <- function(x, model, n_trees = 700) {
  gbm::predict.gbm(
    model,
    newdata = x,
    n.trees = n_trees,
    type = "response"
  )
}
## Text features for `x` via textfeatures::textfeatures(), called with
## `normalize = FALSE` and `word_dims = 0`.
tf <- function(x) {
  textfeatures::textfeatures(
    x,
    word_dims = 0,
    normalize = FALSE
  )
}
## Replace missing values with 0, one column at a time, so every column keeps
## its own type. (Running apply() over the whole data frame would first turn
## it into a character matrix because of the id column, returning the numeric
## features as formatted strings.)
zero_fill_na_ <- function(df) {
  for (j in seq_along(df)) {
    col <- df[[j]]
    if (anyNA(col)) {
      col[is.na(col)] <- 0
      df[[j]] <- col
    }
  }
  df
}

## Extract user-level features from one row per user.
##
## @param x Data frame of tweets. Only the first row seen for each `user_id`
##   is used. The code reads the columns `user_id`, `text`, `description`,
##   `location`, `name`, `favourites_count`, `verified`,
##   `account_created_at`, `statuses_count`, `followers_count`,
##   `friends_count` and `listed_count`.
## @return A data frame with one row per `user_id`: account summaries joined
##   with text features whose missing values have been replaced by 0.
extract_features_ntweets <- function(x) {
  ## remove user level duplicates
  x <- dplyr::filter(x, !duplicated(user_id))
  x <- dplyr::group_by(x, user_id)

  ## text features of one input: prefix every column except the leading id
  ## column, then zero-fill missing values
  text_features <- function(data, prefix) {
    out <- tf(data)
    names(out)[-1] <- paste0(prefix, names(out)[-1])
    zero_fill_na_(out)
  }

  ## tweet features (rows with missing text are left out)
  with_text <- x[!is.na(x$text), ]
  txt_df <- text_features(
    dplyr::select(with_text, user_id = user_id, text = text),
    "txt_"
  )
  ## NOTE(review): labelled "base64 version" by the original author, but it
  ## is computed from the same input as the "txt_" features -- confirm intent.
  b64_df <- text_features(
    dplyr::select(with_text, user_id = user_id, text = text),
    "b64_"
  )

  ## profile features
  dsc_df <- text_features(
    dplyr::select(x, user_id = user_id, text = description),
    "dsc_"
  )
  loc_df <- text_features(
    dplyr::select(x, user_id = user_id, text = location),
    "loc_"
  )
  nm_df <- text_features(
    dplyr::select(x, user_id = user_id, text = name),
    "nm_"
  )

  dd1 <- cbind(txt_df, b64_df[-1])
  dd2 <- cbind(dsc_df, loc_df[-1])
  dd2 <- cbind(dd2, nm_df[-1])
  dd <- dplyr::left_join(dd1, dd2, by = "user_id")

  ## account summaries; later entries reuse values computed by earlier ones,
  ## so the order below matters
  x <- x %>%
    dplyr::group_by(user_id) %>%
    dplyr::summarise(
      favourites_count = max_(c(0, favourites_count)),
      verified = as.integer(verified[1]),
      years_on_twitter = as.numeric(
        difftime(Sys.time(), account_created_at[1], units = "days")) / 365,
      statuses_count = max_(c(0, statuses_count)),
      followers_count = max_(c(0, followers_count)),
      friends_count = max_(c(0, friends_count)),
      listed_count = max_(c(0, listed_count)),
      ## +1 (or +.001) in the ratios below avoids NaN and division by zero
      tweets_to_followers = (statuses_count + 1) / (followers_count + 1),
      statuses_rate = (statuses_count + 1) / (years_on_twitter + .001),
      ff_ratio = (followers_count + 1) / (friends_count + followers_count + 1)
    )
  dplyr::left_join(x, dd, by = "user_id")
}