#' @rdname parseTweets
#' @export
#'
#' @title
#' Converts tweets in JSON format to a data frame.
#'
#' @description
#' This function parses tweets downloaded using \code{filterStream},
#' \code{sampleStream} or \code{userStream} and returns a data frame. If a tweet
#' contains text longer than 140 characters (up to 280), the complete text is
#' returned, not just the first 140 characters.
#'
#' @author
#' Pablo Barbera \email{pablo.barbera@@nyu.edu}
#'
#' @param tweets A character string naming the file where the tweets are stored, or
#' the name of the object in memory where the tweets were saved as strings.
#'
#' @param simplify If \code{TRUE}, the function returns a data frame with only tweet
#' and user fields (i.e., no geographic information or URL entities).
#'
#' @param verbose logical, default is \code{TRUE}, which will print to the console
#' the number of tweets that have been parsed.
#'
#' @param legacy logical, default is \code{FALSE}. If \code{TRUE}, tweets are read
#' using the old method (reading all lines into memory and parsing them line by
#' line). Try \code{legacy=TRUE} if the default options produce errors. Note that
#' legacy mode will only return up to 140 characters per tweet.
#'
#' @details
#' \code{parseTweets} parses tweets downloaded using the \code{\link{filterStream}},
#' \code{\link{sampleStream}} or \code{\link{userStream}} functions
#' and returns a data frame where each row corresponds to one tweet and each column
#' represents a different field of each tweet (id, text, created_at, etc.).
#'
#' The total number of tweets that are parsed might be lower than the number of lines
#' in the file or object that contains the tweets, because blank lines, deletion notices,
#' and incomplete tweets are ignored.
#'
#' To parse the JSON into a list of tweets instead, see \code{\link{readTweets}}. That
#' function can be significantly faster for large files when only a few fields are
#' required (see the sketch at the end of the examples below).
#'
#' Note also that the \code{retweet_count} field contains the number of times a given
#' tweet had been retweeted at the time it was captured from the API or, for native
#' retweets, the number of times the original tweet was retweeted.
#'
#' @seealso \code{\link{filterStream}}, \code{\link{sampleStream}}, \code{\link{userStream}}
#'
#' @examples
#' ## The dataset example_tweets contains 10 public statuses published
#' ## by @@twitterapi in plain text format. The code below converts the object
#' ## into a data frame that can be manipulated by other functions.
#'
#' data(example_tweets)
#' tweets.df <- parseTweets(example_tweets, simplify=TRUE, legacy=TRUE)
#'
#' \dontrun{
#' ## A more complete example that shows how to capture a user's home timeline
#' ## for one hour using authentication via OAuth, and then parse the tweets
#' ## into a data frame.
#'
#' library(ROAuth)
#' reqURL <- "https://api.twitter.com/oauth/request_token"
#' accessURL <- "https://api.twitter.com/oauth/access_token"
#' authURL <- "https://api.twitter.com/oauth/authorize"
#' consumerKey <- "xxxxxyyyyyzzzzzz"
#' consumerSecret <- "xxxxxxyyyyyzzzzzzz111111222222"
#' my_oauth <- OAuthFactory$new(consumerKey=consumerKey,
#'                              consumerSecret=consumerSecret,
#'                              requestURL=reqURL,
#'                              accessURL=accessURL,
#'                              authURL=authURL)
#' my_oauth$handshake()
#' userStream( file="my_timeline.json", with="followings",
#'             timeout=3600, oauth=my_oauth )
#' tweets.df <- parseTweets("my_timeline.json")
#' }
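#'
#' \dontrun{
#' ## A sketch of the faster alternative mentioned in the details: when only a
#' ## few fields are needed, parse the same file into a list with readTweets.
#' tweets.list <- readTweets("my_timeline.json")
#' length(tweets.list)  # number of tweets in the file
#' }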
#'
#'
parseTweets <- function(tweets, simplify=FALSE, verbose=TRUE, legacy=FALSE){
  # if tweets is neither an existing file nor the name of an object in memory,
  # assume it contains raw JSON strings and fall back to legacy mode
  if (!file.exists(tweets[1]) && !exists(tweets[1])){
    legacy <- TRUE
  }
  if (!legacy){
    results <- stream_in(tweets)
    # adding RT count from embedded RT or, if not available, from top-level tweet
    retweet_count <- rep(NA, nrow(results))
    # adding RT count from top-level tweet
    if (!is.null(results$retweet_count)){
      retweet_count <- results$retweet_count
    }
    # replacing with RT count from embedded RT when available
    if (!is.null(results$retweeted_status.retweet_count)){
      retweet_count <- ifelse(!is.na(results$retweeted_status.retweet_count),
                              results$retweeted_status.retweet_count, results$retweet_count)
    }
    # same for favorite counts
    favorite_count <- rep(NA, nrow(results))
    if (!is.null(results$favorite_count)){
      favorite_count <- results$favorite_count
    }
    # replacing with favorite count from embedded RT when available
    if (!is.null(results$retweeted_status.favorite_count)){
      favorite_count <- ifelse(!is.na(results$retweeted_status.favorite_count),
                               results$retweeted_status.favorite_count, results$favorite_count)
    }
    # extracting the full text when the tweet is >140 characters
    # 1) tweets from REST API with tweet_mode = 'extended'
    # (results$full_text is NULL when the column is absent; the tryCatch keeps
    # any lookup error for the check below)
    text <- NULL
    error <- tryCatch(text <- results$full_text, error=function(e) e)
    # replacing text of RTs with the full text of the embedded retweeted status
    if (length(text)>0 && !is.null(results$retweeted_status.full_text)){
      text <- ifelse(!is.na(results$retweeted_status.full_text),
                     # full text from RT
                     paste0('RT @', results$retweeted_status.user.screen_name, ':',
                            results$retweeted_status.full_text),
                     # 140+ text from original tweet
                     text)
    }
    # 2) tweets from Streaming API that include the 'extended_tweet' field
    # (triggered when the full_text lookup failed or the column was absent)
    if (inherits(error, 'error') || is.null(error)){
      if (any(grepl('extended_tweet', names(results)))){
        text <- results$extended_tweet.full_text
        # falling back to the truncated text when the extended text is missing
        text <- ifelse(!is.na(text), text, results$text)
        # checking if there are any RTs
        if (!is.null(results$retweeted_status.extended_tweet.full_text)){
          text <- ifelse(!is.na(results$retweeted_status.extended_tweet.full_text),
                         # full text from RT
                         paste0('RT @', results$retweeted_status.user.screen_name, ':',
                                results$retweeted_status.extended_tweet.full_text),
                         # 140+ text from original tweet
                         text)
        }
      }
      # 3) tweets from Streaming API that do not include the 'extended_tweet' field
      if (!any(grepl('extended_tweet', names(results)))){
        text <- results$text
      }
    }
    df <- data.frame(
      text = text,
      retweet_count = retweet_count,
      favorite_count = favorite_count,
      favorited = results$favorited,
      truncated = results$truncated,
      id_str = results$id_str,
      in_reply_to_screen_name = results$in_reply_to_screen_name,
      source = results$source,
      # match the top-level 'retweeted' column exactly, not e.g. 'retweeted_status.*'
      retweeted = results[[grep("^retweeted$", names(results))[1]]],
      created_at = results$created_at,
      in_reply_to_status_id_str = results$in_reply_to_status_id_str,
      in_reply_to_user_id_str = results$in_reply_to_user_id_str,
      lang = results$lang,
      listed_count = results$user.listed_count,
      verified = results$user.verified,
      location = results$user.location,
      user_id_str = results$user.id_str,
      description = results$user.description,
      geo_enabled = results$user.geo_enabled,
      user_created_at = results$user.created_at,
      statuses_count = results$user.statuses_count,
      followers_count = results$user.followers_count,
      favourites_count = results$user.favourites_count,
      protected = results$user.protected,
      user_url = results$user.url,
      name = results$user.name,
      time_zone = results$user.time_zone,
      user_lang = results$user.lang,
      utc_offset = results$user.utc_offset,
      friends_count = results$user.friends_count,
      screen_name = results$user.screen_name,
      stringsAsFactors=FALSE)
    # adding geographic variables and url entities
    if (!simplify){
      df$country_code <- NA
      if (!is.null(results$place.country_code)) df$country_code <- results$place.country_code
      df$country <- NA
      if (!is.null(results$place.country)) df$country <- results$place.country
      df$place_type <- NA
      if (!is.null(results$place.type)) df$place_type <- results$place.type
      df$full_name <- NA
      if (!is.null(results$place.full_name)) df$full_name <- results$place.full_name
      df$place_name <- NA
      if (!is.null(results$place.name)) df$place_name <- results$place.name
      df$place_id <- NA
      if (!is.null(results$place.id)) df$place_id <- results$place.id
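      # place.bounding_box.coordinates is a GeoJSON-style polygon of four corner
      # points in (longitude, latitude) order; points 0 and 1 span the south and
      # north edges and points 0 and 2 the west and east edges, so averaging the
      # matching coordinates gives the center of the bounding box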
      place_lat_1 <- rep(NA, nrow(df))
      if (!is.null(results$place.bounding_box.coordinates.0.0.1)){
        place_lat_1 <- results$place.bounding_box.coordinates.0.0.1
      }
      place_lat_2 <- rep(NA, nrow(df))
      if (!is.null(results$place.bounding_box.coordinates.0.1.1)){
        place_lat_2 <- results$place.bounding_box.coordinates.0.1.1
      }
      df$place_lat <- sapply(1:nrow(df), function(x)
        mean(c(place_lat_1[x], place_lat_2[x]), na.rm=TRUE))
      place_lon_1 <- rep(NA, nrow(df))
      if (!is.null(results$place.bounding_box.coordinates.0.0.0)){
        place_lon_1 <- results$place.bounding_box.coordinates.0.0.0
      }
      place_lon_2 <- rep(NA, nrow(df))
      if (!is.null(results$place.bounding_box.coordinates.0.2.0)){
        place_lon_2 <- results$place.bounding_box.coordinates.0.2.0
      }
      df$place_lon <- sapply(1:nrow(df), function(x)
        mean(c(place_lon_1[x], place_lon_2[x]), na.rm=TRUE))
      df$lat <- NA
      if (!is.null(results$geo.coordinates.0)) df$lat <- results$geo.coordinates.0
      df$lon <- NA
      if (!is.null(results$geo.coordinates.1)) df$lon <- results$geo.coordinates.1
      df$expanded_url <- NA
      if (!is.null(results$entities.urls.0.expanded_url)){
        df$expanded_url <- results$entities.urls.0.expanded_url
      }
      df$url <- NA
      if (!is.null(results$entities.urls.0.url)){
        df$url <- results$entities.urls.0.url
      }
    }
    # removing empty rows
    df <- df[!is.na(df$id_str),]
  }
| } | |
| if (legacy){ | |
| ## from json to list | |
| results.list <- readTweets(tweets, verbose=FALSE) | |
| # if no text in list, change it to NULL | |
| if (length(results.list)==0){ | |
| stop(deparse(substitute(tweets)), " did not contain any tweets. ", | |
| "See ?parseTweets for more details.") | |
| } | |
    # constructing data frame with tweet and user variables
    df <- data.frame(
      text = unlistWithNA(results.list, 'text'),
      retweet_count = unlistWithNA(results.list, 'retweet_count'),
      favorite_count = unlistWithNA(results.list, 'favorite_count'),
      favorited = unlistWithNA(results.list, 'favorited'),
      truncated = unlistWithNA(results.list, 'truncated'),
      id_str = unlistWithNA(results.list, 'id_str'),
      in_reply_to_screen_name = unlistWithNA(results.list, 'in_reply_to_screen_name'),
      source = unlistWithNA(results.list, 'source'),
      retweeted = unlistWithNA(results.list, 'retweeted'),
      created_at = unlistWithNA(results.list, 'created_at'),
      in_reply_to_status_id_str = unlistWithNA(results.list, 'in_reply_to_status_id_str'),
      in_reply_to_user_id_str = unlistWithNA(results.list, 'in_reply_to_user_id_str'),
      lang = unlistWithNA(results.list, 'lang'),
      listed_count = unlistWithNA(results.list, c('user', 'listed_count')),
      verified = unlistWithNA(results.list, c('user', 'verified')),
      location = unlistWithNA(results.list, c('user', 'location')),
      user_id_str = unlistWithNA(results.list, c('user', 'id_str')),
      description = unlistWithNA(results.list, c('user', 'description')),
      geo_enabled = unlistWithNA(results.list, c('user', 'geo_enabled')),
      user_created_at = unlistWithNA(results.list, c('user', 'created_at')),
      statuses_count = unlistWithNA(results.list, c('user', 'statuses_count')),
      followers_count = unlistWithNA(results.list, c('user', 'followers_count')),
      favourites_count = unlistWithNA(results.list, c('user', 'favourites_count')),
      protected = unlistWithNA(results.list, c('user', 'protected')),
      user_url = unlistWithNA(results.list, c('user', 'url')),
      name = unlistWithNA(results.list, c('user', 'name')),
      time_zone = unlistWithNA(results.list, c('user', 'time_zone')),
      user_lang = unlistWithNA(results.list, c('user', 'lang')),
      utc_offset = unlistWithNA(results.list, c('user', 'utc_offset')),
      friends_count = unlistWithNA(results.list, c('user', 'friends_count')),
      screen_name = unlistWithNA(results.list, c('user', 'screen_name')),
      stringsAsFactors=FALSE)
    # adding geographic variables and url entities
    if (!simplify){
      df$country_code <- unlistWithNA(results.list, c('place', 'country_code'))
      df$country <- unlistWithNA(results.list, c('place', 'country'))
      df$place_type <- unlistWithNA(results.list, c('place', 'place_type'))
      df$full_name <- unlistWithNA(results.list, c('place', 'full_name'))
      df$place_name <- unlistWithNA(results.list, c('place', 'name'))
      df$place_id <- unlistWithNA(results.list, c('place', 'id'))
      place_lat_1 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 1, 2))
      place_lat_2 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 2, 2))
      df$place_lat <- sapply(1:length(results.list), function(x)
        mean(c(place_lat_1[x], place_lat_2[x]), na.rm=TRUE))
      place_lon_1 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 1, 1))
      place_lon_2 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 3, 1))
      df$place_lon <- sapply(1:length(results.list), function(x)
        mean(c(place_lon_1[x], place_lon_2[x]), na.rm=TRUE))
      df$lat <- unlistWithNA(results.list, c('geo', 'coordinates', 1))
      df$lon <- unlistWithNA(results.list, c('geo', 'coordinates', 2))
      df$expanded_url <- unlistWithNA(results.list, c('entities', 'urls', 1, 'expanded_url'))
      df$url <- unlistWithNA(results.list, c('entities', 'urls', 1, 'url'))
    }
  }
  # information message
  if (verbose) message(nrow(df), " tweets have been parsed.")
  return(df)
}
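
# Internal helper: extracts a (possibly nested) field from each tweet in a list,
# returning NA for the tweets where that field is missing. `field` is a character
# vector giving the path of keys; numeric elements are used as positional indices
# into arrays, e.g. unlistWithNA(tweets.list, c('entities', 'urls', 1, 'url'))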
unlistWithNA <- function(lst, field){
  if (length(field)==1){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], '[[', field))
  }
  if (length(field)==2){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]]))
  }
  if (length(field)==3 && field[1]!="geo"){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]][[field[3]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[field[3]]]))
  }
  if (field[1]=="geo"){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[as.numeric(field[3])]]))
  }
  if (length(field)==4 && field[2]!="urls"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]][[field[3]]][[field[4]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[field[3]]][[field[4]]]))
  }
  if (length(field)==4 && field[2]=="urls"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[as.numeric(field[3])]][[field[4]]]))
  }
  if (length(field)==6 && field[2]=="bounding_box"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x)
      x[[field[1]]][[field[2]]][[field[3]]][[as.numeric(field[4])]][[as.numeric(field[5])]][[as.numeric(field[6])]]))
  }
  return(vect)
}