Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

221 lines (183 sloc) 6.223 kb
% Sweave source (LaTeX with embedded R chunks) for a short report on the
% Twitter coverage of ISMB 2012.
\documentclass[a4paper,10pt]{article}
% NOTE(review): utf8x is generally discouraged in favour of plain utf8 --
% confirm nothing in the generated output relies on utf8x before switching.
\usepackage[utf8x]{inputenc}
% Page margins.
\usepackage[top=1.5cm, bottom=1.5cm,left=2cm,right=1.5cm]{geometry}
% The url package is currently disabled.
%\usepackage{url}
% Provides the lstlisting environment used below to show raw tweet text.
\usepackage{listings}
% Title block.
\title{Statistical Analysis of ISMB 2012 Coverage at Twitter}
\author{Neil Saunders}
\begin{document}
% Let listings break long lines instead of running past the margin.
\lstset{breaklines=true}
\SweaveOpts{concordance=TRUE}
\maketitle
\section{Preamble}
Load required libraries and data.
% Setup chunk: attach the R packages used by the later chunks and load the
% saved workspace. The code is echoed; printed results are hidden.
% Later chunks read a data frame called ismb (columns created, text and
% screenName); presumably it is restored by the load() call below -- confirm
% against the .RData file.
% The path is relative, so it is resolved against R's working directory at
% the time the document is woven.
<<echo=TRUE, results=hide>>=
library(ggplot2)
library(xtable)
library(RColorBrewer)
library(tm)
library(wordcloud)
library(sentiment)
load("../../data/ismb.RData")
@
\section{Tweets per day}
\begin{center}
\setkeys{Gin}{width=0.6\textwidth}
% Bar chart of the number of tweets per calendar day.
% Adds a date column to ismb (reused by the per-hour chunk), tabulates it and
% plots one bar per day.
% byDay already holds one pre-computed count per date, so the bars are drawn
% with stat = "identity" (bar height = the tweets column). The default
% counting statistic of geom_bar is not meant to be combined with a mapped y
% aesthetic, and newer ggplot2 releases raise an error for it.
% NOTE(review): opts() was deprecated in later ggplot2 releases in favour of
% ggtitle()/theme() -- check the installed version before upgrading.
<<fig=TRUE, echo=TRUE>>=
ismb$date <- as.Date(ismb$created)
byDay <- as.data.frame(table(ismb$date))
colnames(byDay) <- c("date", "tweets")
print(ggplot(byDay) + geom_bar(aes(date, tweets), stat = "identity", fill = "salmon") +
theme_bw() + opts(title = "ISMB 2012 tweets per day"))
@
\end{center}
\newpage
\section{Tweets per hour by day}
\begin{center}
% Grid of bars: tweets per hour of day, one row of panels per date and one
% column of panels per hour.
% Uses ismb$date, which is created in the tweets-per-day chunk.
% table() returns the hour as a factor; it is converted back to a number via
% as.character so that the numeric value (not the level index) is used.
% The counts are pre-computed, so the bars use stat = "identity"; binwidth
% only applies to binning statistics and is therefore dropped.
% NOTE(review): opts() and theme_blank() were deprecated in later ggplot2
% releases (theme()/element_blank()/ggtitle()) -- check the installed version
% before upgrading.
<<fig=TRUE, echo=TRUE>>=
ismb$hour <- as.POSIXlt(ismb$created)$hour
byDayHour <- as.data.frame(table(ismb$date, ismb$hour))
colnames(byDayHour) <- c("date", "hour", "tweets")
byDayHour$hour <- as.numeric(as.character(byDayHour$hour))
print(ggplot(byDayHour) + geom_bar(aes(hour, tweets), stat = "identity", fill = "salmon") +
facet_grid(date ~ hour) +
opts(axis.text.x = theme_blank(),
axis.ticks = theme_blank(),
panel.background = theme_blank(),
title = "ISMB 2012 tweets per hour by day"))
@
\end{center}
\newpage
\section{Popular talks}
First, get the hashtags:
% Build the hashtag frequency table ht used by all "Popular talks" plots.
% Each tweet is split on single spaces (the resulting list, words, is reused
% by the text-mining section), tokens starting with # are kept, lower-cased
% and stripped of every non-alphanumeric character (including the leading #).
% ht is sorted by ascending frequency, so tail() in the later chunks selects
% the most frequent tags.
% FALSE is spelled out because F is an ordinary variable that can be
% reassigned.
<<echo=TRUE, results=verbatim>>=
words <- strsplit(ismb$text, " ")
hashtags <- lapply(words, function(x) x[grep("^#", x)])
hashtags <- unlist(hashtags)
hashtags <- tolower(hashtags)
hashtags <- gsub("[^A-Za-z0-9]", "", hashtags)
ht <- as.data.frame(table(hashtags))
ht <- ht[sort.list(ht$Freq, decreasing = FALSE),]
@
\subsection{Keynotes}
\begin{center}
% Horizontal bar chart of the keynote hashtags (tags starting with "kn").
% The factor levels are reset to the current row order so the bars follow the
% ascending frequency order of ht; tail() keeps the last (most frequent) rows.
% Freq is a pre-computed count, so the bars use stat = "identity" rather than
% the default counting statistic of geom_bar, which newer ggplot2 releases
% reject when a y aesthetic is mapped.
% NOTE(review): opts() was deprecated in later ggplot2 releases in favour of
% ggtitle() -- check the installed version before upgrading.
<<echo=TRUE, fig=TRUE>>=
kn <- ht[grep("^kn", ht$hashtags),]
kn$hashtags <- factor(kn$hashtags, levels = as.character(kn$hashtags))
print(ggplot(tail(kn)) + geom_bar(aes(hashtags, Freq), stat = "identity", fill = "salmon") +
coord_flip() + theme_bw() + opts(title = "ISMB 2012 tweets - keynotes"))
@
\end{center}
KN 3: Analysis of transcriptome structure and chromatin landscapes (Barbara Wold).
\newpage
\subsection{Published Papers}
\begin{center}
% Horizontal bar chart of the 20 most frequent published-paper hashtags
% (tags starting with "pp").
% NOTE(review): the two positional edits below overwrite the count in row 47
% with 39 and then drop row 5. They look like a manual correction of the
% data (presumably merging a duplicate or mistyped tag) and depend on the
% exact row order of this data set -- confirm before reusing with other data.
% The factor levels are reset to the current row order so the bars follow the
% row order of pp.
% Freq is a pre-computed count, so the bars use stat = "identity" rather than
% the default counting statistic of geom_bar, which newer ggplot2 releases
% reject when a y aesthetic is mapped.
% NOTE(review): opts() was deprecated in later ggplot2 releases in favour of
% ggtitle() -- check the installed version before upgrading.
<<echo=TRUE, fig=TRUE>>=
pp <- ht[grep("^pp", ht$hashtags),]
pp[47,2] <- 39
pp <- pp[-5,]
pp$hashtags <- factor(pp$hashtags, levels = as.character(pp$hashtags))
print(ggplot(tail(pp, 20)) + geom_bar(aes(hashtags, Freq), stat = "identity", fill = "salmon") +
coord_flip() + theme_bw() + opts(title = "ISMB 2012 tweets - published papers"))
@
\end{center}
PP 44: Toward interoperable bioscience data (Susanna-Assunta Sansone).
\newpage
\subsection{Special Sessions}
\begin{center}
% Horizontal bar chart of all special-session hashtags (tags starting with
% "ss").
% The factor levels are reset to the current row order so the bars follow the
% ascending frequency order of ht.
% Freq is a pre-computed count, so the bars use stat = "identity" rather than
% the default counting statistic of geom_bar, which newer ggplot2 releases
% reject when a y aesthetic is mapped.
% NOTE(review): opts() was deprecated in later ggplot2 releases in favour of
% ggtitle() -- check the installed version before upgrading.
<<echo=TRUE, fig=TRUE>>=
ss <- ht[grep("^ss", ht$hashtags),]
ss$hashtags <- factor(ss$hashtags, levels = as.character(ss$hashtags))
print(ggplot(ss) + geom_bar(aes(hashtags, Freq), stat = "identity", fill = "salmon") +
coord_flip() + theme_bw() + opts(title = "ISMB 2012 tweets - special sessions"))
@
\end{center}
SS 4: Bioinformatic Integration of Diverse Experimental Data Sources (Kyle Ellrott, David Haussler, Artem Sokolov, Josh Stuart).
\newpage
\subsection{Technology Track}
\begin{center}
% Horizontal bar chart of the 20 most frequent technology-track hashtags
% (tags starting with "tt").
% The factor levels are reset to the current row order so the bars follow the
% ascending frequency order of ht.
% Freq is a pre-computed count, so the bars use stat = "identity" rather than
% the default counting statistic of geom_bar, which newer ggplot2 releases
% reject when a y aesthetic is mapped.
% NOTE(review): opts() was deprecated in later ggplot2 releases in favour of
% ggtitle() -- check the installed version before upgrading.
<<echo=TRUE, fig=TRUE>>=
tt <- ht[grep("^tt", ht$hashtags),]
tt$hashtags <- factor(tt$hashtags, levels = as.character(tt$hashtags))
print(ggplot(tail(tt, 20)) + geom_bar(aes(hashtags, Freq), stat = "identity", fill = "salmon") +
coord_flip() + theme_bw() + opts(title = "ISMB 2012 tweets - technology track"))
@
\end{center}
TT06: The Taverna Server - Executing Scientific Workflows Remotely (Katy Wolstencroft).
\newpage
\subsection{Workshops}
\begin{center}
% Horizontal bar chart of all workshop hashtags (tags starting with "wk").
% The factor levels are reset to the current row order so the bars follow the
% ascending frequency order of ht.
% Freq is a pre-computed count, so the bars use stat = "identity" rather than
% the default counting statistic of geom_bar, which newer ggplot2 releases
% reject when a y aesthetic is mapped.
% NOTE(review): opts() was deprecated in later ggplot2 releases in favour of
% ggtitle() -- check the installed version before upgrading.
<<echo=TRUE, fig=TRUE>>=
wk <- ht[grep("^wk", ht$hashtags),]
wk$hashtags <- factor(wk$hashtags, levels = as.character(wk$hashtags))
print(ggplot(wk) + geom_bar(aes(hashtags, Freq), stat = "identity", fill = "salmon") +
coord_flip() + theme_bw() + opts(title = "ISMB 2012 tweets - workshops"))
@
\end{center}
WK 3: Bioinformatics Core Facilities (Simon Andrews, Fran Lewitter, Brent Richter, David Sexton).
\newpage
\section{Users}
\subsection{The long tail}
\begin{center}
% Scatter plot of tweets per user, users ranked from most to least active.
% users (sorted by descending tweet count) is reused by the top-10 table.
% The x position is the rank of the user; seq_len() is used instead of
% 1:nrow() so that an empty table gives an empty sequence rather than c(1, 0).
% TRUE is spelled out because T is an ordinary variable that can be
% reassigned.
% NOTE(review): opts() was deprecated in later ggplot2 releases in favour of
% ggtitle() -- check the installed version before upgrading.
<<fig=TRUE, echo=TRUE>>=
users <- as.data.frame(table(ismb$screenName))
colnames(users) <- c("user", "tweets")
users <- users[sort.list(users$tweets, decreasing = TRUE),]
print(ggplot(users) + geom_point(aes(seq_len(nrow(users)), tweets), color = "salmon") + theme_bw() +
opts(title = "ISMB 2012 tweets per user") + xlab("User"))
@
\end{center}
\subsection{The top 10}
% LaTeX table of the first ten rows of users, i.e. the ten most active
% accounts given the descending sort applied in the long-tail chunk.
% The code is not echoed; only the generated table appears in the document.
<<echo=FALSE,results=tex>>=
top.users <- head(users, 10)
top.users.table <- xtable(top.users, caption = "Most tweets - top 10 users")
print(top.users.table, include.rownames = FALSE)
@
\newpage
\section{Text mining}
\subsection{Word frequency}
First, get all words composed only of alphanumeric characters.
% Word frequency table (words.t, reused by the word cloud) and a LaTeX table
% of the ten most frequent words.
% Starts from words, the per-tweet token list built in the hashtag chunk,
% keeps purely alphanumeric tokens, lower-cases them, removes the retweet
% markers "rt"/"mt" and the English stop words, then counts and sorts by
% descending frequency.
% The markers are removed with a logical mask (!grepl) rather than a negative
% index: words[-grep(...)] returns an EMPTY vector when nothing matches,
% because indexing with -integer(0) selects no elements.
% TRUE is spelled out because T is an ordinary variable that can be
% reassigned.
<<echo=TRUE, results=tex>>=
sw <- stopwords("en")
words <- lapply(words, function(x) x[grep("^[A-Za-z0-9]+$", x)])
words <- unlist(words)
words <- tolower(words)
words <- words[!grepl("^[rm]t$", words)]
words <- words[!words %in% sw]
words.t <- as.data.frame(table(words))
words.t <- words.t[sort.list(words.t$Freq, decreasing = TRUE),]
print(xtable(head(words.t, 10), caption = "Top 10 words in tweets"), include.rownames = FALSE)
@
\setkeys{Gin}{width=0.6\textwidth}
\begin{center}
% Word cloud drawn from words.t (built in the word-frequency chunk), coloured
% with the 8-colour "Dark2" palette from RColorBrewer.
% min.freq = 3 and max.words = 200 are passed to wordcloud(), presumably
% limiting the cloud to words seen at least 3 times and to at most 200 words
% -- confirm against the wordcloud documentation.
% NOTE(review): no random seed is set in this chunk, so the layout may differ
% between runs if wordcloud() places words randomly -- confirm.
<<fig=TRUE, echo=TRUE>>=
pal2 <- brewer.pal(8, "Dark2")
wordcloud(words.t$words, words.t$Freq, scale = c(8, .2), min.freq = 3,
max.words = 200, random.order = FALSE, rot.per = .15, colors = pal2)
@
\end{center}
\newpage
\subsection{Sentiment analysis}
\subsubsection{Emotions}
% Classify the text of every tweet with classify_emotion (algorithm "bayes")
% and print the counts of the BEST_FIT column of the result as a LaTeX table.
% NOTE(review): table() omits NA values by default, so tweets without a
% best-fit label would presumably not be counted -- confirm.
<<echo=TRUE, results=tex>>=
em <- classify_emotion(ismb$text, algorithm = "bayes")
print(xtable(table(em[, "BEST_FIT"]), caption = "Tweet emotion"))
@
Shall we look at a ``joyous'' tweet?
\begin{lstlisting}
\Sexpr{ismb$text[3]}
\end{lstlisting}
\subsubsection{Polarity}
% Classify the text of every tweet with classify_polarity (algorithm "bayes")
% and print the counts of the BEST_FIT column of the result as a LaTeX table.
<<echo=TRUE, results=tex>>=
po <- classify_polarity(ismb$text, algorithm = "bayes")
print(xtable(table(po[, "BEST_FIT"]), caption = "Tweet polarity"))
@
A ``positive'' tweet:
\begin{lstlisting}
\Sexpr{ismb$text[7]}
\end{lstlisting}
A ``negative'' tweet:
\begin{lstlisting}
\Sexpr{ismb$text[19]}
\end{lstlisting}
\end{document}
Jump to Line
Something went wrong with that request. Please try again.