# NOTE(review): the lines below are GitHub web-page chrome captured when this
# script was scraped from the web — they are not part of the R source.
# Commented out so the file parses.
# Skip to content
# Subversion checkout URL
# You can clone with
# Download ZIP
# branch: master
# Fetching contributors…
# Cannot retrieve contributors at this time
# 183 lines (154 sloc) 7.807 kB
# CC-BY mhawksey
# This script processes data collected using this google spreadsheet template
# and processes the data, putting results back into the spreadsheet and generating data files for this template
# Read more about this at
# variables you'll need to edit
# project name (prefixed to output files)
project = "SCORE-UKOER"
# google spreadsheet key
key = "google-spreadsheet-key"
# Publish as service url
# NOTE(review): the original URL was stripped when this file was scraped —
# fill in the "publish as service" URL of the Apps Script attached to the
# spreadsheet (used by postForm() at the end of the script)
serviceUrl = "https://script.google.com/macros/s/your-service-id/exec"
# A secret set in the Script Editor of the spreadsheet to prevent unauthorised data entry
# NOTE(review): assignment reconstructed — the value was stripped in the scrape
secret = "your-secret"
# define a list of accounts you want to compare
subgroup = c('ukoer', 'SCOREProject')
# sheet number (gid) of the vertices sheet
gid = 46
# query returning columns id, screen_name, friend_ids and description
query = 'select A, B, X, P'
# query returning screen_name betweenness_centrality degree_in followers_count group belongs_to profile_image_url
queryB = "select B, F, D, M, I, J, S where J <>''"
# function to get data from spreadsheet
# Fetches a sheet through the Google Visualization API and returns the result
# as a data.frame parsed from the CSV the API emits.
#   key   - spreadsheet key
#   query - Google Visualization query string (SQL-like, column letters)
#   gid   - sheet number, defaults to the first sheet
# NOTE(review): the API base URL and the function's closing brace were lost
# when this file was scraped; restored to the standard Visualization API
# endpoint of the era — confirm against the original script.
gsqAPI = function(key,query,gid=0){
  url = paste( sep="", 'http://spreadsheets.google.com/tq?', 'tqx=out:csv', '&tq=', curlEscape(query), '&key=', key, '&gid=', gid)
  return( read.csv( url, header = TRUE ) )
}
# get data collected in google spreadsheet
tab = gsqAPI(key,query,gid)
# if the line above throws a warning copy the url included in the message to download csv data and put
# in your working directory, uncomment the next line and read the data locally
# NOTE(review): this line was active in the scraped copy, unconditionally
# overwriting the fetched data (and erroring if data.csv is absent); per the
# comment above it is meant to be commented out by default.
#tab = read.csv( 'data.csv', header = TRUE )
# normalise the column names used throughout the rest of the script
colnames(tab) <- c("id", "name", "links", "description")
# Next 4 lines come from the awesome jbaums on stackoverflow
# For one vertex, map its friend_ids list (a bracketed, quoted id list) onto
# the screen names present in the sheet, returning "name,followed_name"
# strings. Reads the global `tab` data frame.
# NOTE(review): the function's closing brace was lost in the scraped copy and
# is restored below.
conns <- function(name, links) {
  paste(name, tab$name[tab$id %in% as.numeric(unlist(strsplit(gsub('\\[|\\]',",",gsub('\'|\"','', links)),',')))], sep=',')
}
# apply conns over every vertex, then split the "source,target" strings into
# a two-column character matrix (str_split_fixed is from stringr)
connections <- data.frame(unlist(mapply(conns, tab$name, tab$links,SIMPLIFY=FALSE)))
connections <- str_split_fixed(connections[,1], fixed(","), 2)
# Build the edge list data frame from the split connection strings.
# NOTE(review): the right-hand side of this assignment was truncated in the
# scraped copy; data.frame(connections) is the most likely original — verify.
data <- data.frame(connections)
# optional: drop rows with an empty/missing target
#data <- data[![,2]),]
edcolNames <- c("target", "source")
colnames(data) <- edcolNames
# serialise the edge list for the write-back to the spreadsheet
edges = toJSON(data)
#data <- read.csv( "Myfile [Edges].csv",header =T )
# build a directed igraph graph from the edge data frame
# NOTE(review): the function name was stripped during scraping; restored as
# igraph's — confirm against the original.
g <-, directed = T)
# calculate some stats (igraph, era-appropriate dotted API)
betweenness_centrality <- betweenness(g,v=V(g),directed = TRUE)
degree<-degree(g, v=V(g), mode = "total")
degree_in<-degree(g, v=V(g), mode = "in")
degree_out<-degree(g, v=V(g), mode = "out")
# community detection: spinglass and walktrap groupings
# NOTE(review): the function-call heads on the next three assignments were
# truncated in the scraped copy; restored to the old igraph API
# (spinglass.community / walktrap.community / community.to.membership) —
# confirm against the original script.
memberships <- list()
sc <- spinglass.community(g, spins=10)
memberships$Spinglass <- sc$membership
wt <- walktrap.community(g, modularity=TRUE)
wmemb <- community.to.membership(g, wt$merges, steps=which.max(wt$modularity)-1)
# choose which community grouping you want to push back to the spreadsheet
group <- sc$membership
#group <- wmemb$membership
# bind the per-vertex stats into one data frame (one row per vertex)
# NOTE(review): the lines computing screen_name, eigenvector_centrality,
# pagerank and constructing datagrid were lost in the scraped copy; they are
# reconstructed here from the column list below — verify against the original.
screen_name <- V(g)$name
eigenvector_centrality <- evcent(g)$vector
pagerank <- page.rank(g)$vector
datagrid <- data.frame(screen_name, degree, degree_in, degree_out, betweenness_centrality, eigenvector_centrality, pagerank, group)
cc <- c("screen_name", "degree","degree_in","degree_out","betweenness_centrality","eigenvector_centrality","pagerank","group")
colnames(datagrid) <- cc
rownames(datagrid) <- screen_name
# Get a comma separated list if a user follows the sub group accounts.
# Take a copy of the entire network edge list and add column names.
edge <- data.frame(data)
colnames(edge) <- c('vert1','vert2')
# keep only edges whose target is one of the subgroup accounts, then sort
# the remaining rows by that target column
edge <- edge[edge$vert2 %in% subgroup, ]
edge <- edge[order(edge$vert2), ]
# Work down the grid of screen names (lapply): for each screen name find the
# subset of the filtered edge list where vert1 is that name — e.g. if
# mhawksey follows both accounts a two-row data frame comes back — and
# collapse the vert2 values into one comma-separated string, stored in a new
# belongs_to column of datagrid.
followed_subgroups <- function(dfx) {
  followed <- subset(edge, vert1 == dfx, select = 'vert2')$vert2
  paste(as.character(followed), collapse = ",")
}
datagrid$belongs_to <- lapply(datagrid$screen_name, followed_subgroups)
# convert to JSON
# transpose so each element of the JSON object is one whole vertex record
dg = toJSON(data.frame(t(datagrid)))
#get top results from data.frame
# drop the list-valued belongs_to column, then melt to long form (reshape
# package) keyed on the first column (screen_name)
datagrid.m <- subset( datagrid, select = -belongs_to )
datagrid.m <- melt(datagrid.m, id = 1)
# cast into one table per statistic; `(all)` is the value column name that
# cast() produces for the unaggregated measure
a.screen_name <- cast(datagrid.m, screen_name ~ . | variable)
# for each statistic keep the 10 highest-scoring rows (plyr)
a.screen_name.max <- aaply(a.screen_name, 1, function(x) arrange(x, desc(`(all)`))[1:10,])
datagrid.screen_name.max <- adply(a.screen_name.max, 1)
# reorder the columns before serialising
# NOTE(review): assumes columns 2,3,1 are screen_name, value, statistic —
# confirm against the adply() output
toptens <- cbind(datagrid.screen_name.max[,2],datagrid.screen_name.max[,3],datagrid.screen_name.max[,1])
toptens = toJSON(toptens)
toptenslabels = toJSON(colnames(datagrid))
# export the graph for Gephi/NodeXL
# (sep="" added: the scraped copy used paste()'s default sep=" ", which
# produced a filename with an embedded space, e.g. "SCORE-UKOER .graphml")
write.graph(g, paste(project, ".graphml", sep=""), "graphml")
# write back to spreadsheet via the published Apps Script service
# SSL fix
options(RCurlOptions = list(capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE))
# post form
postForm(serviceUrl, "datagrid" = dg, "toptens" = toptens, "toptenslabels" = toptenslabels, "edges" = edges, "secret" = secret)
# generate wordclouds based on groups: join the stats grid back onto the
# vertex descriptions, then render one wordcloud per community group
dataset = merge(datagrid,tab, by.x="screen_name",by.y="name")
keeps <- c("screen_name","group","description")
dataset = dataset[keeps]
groups =$group))
# NOTE(review): the loop's closing brace — and, apparently, the code writing
# the per-group top-100 word frequency tables mentioned below — was lost when
# this file was scraped; the brace is restored, the table-writing code is not.
for(i in seq_len(nrow(groups))) {
  descSub = subset(dataset, group==groups[i,1])
  # this bit adapted from a tm/wordcloud example (source URL stripped in scrape)
  # note if you are pulling in multiple columns you may need to change which one
  # in the dataset is selected e.g. dataset[,2] etc
  ap.corpus <- Corpus(DataframeSource(data.frame(as.character(descSub$description))))
  ap.corpus <- tm_map(ap.corpus, removePunctuation)
  ap.corpus <- tm_map(ap.corpus, tolower)
  ap.corpus <- tm_map(ap.corpus, function(x) removeWords(x, stopwords("english")))
  # additional stopwords can be used as shown below
  #ap.corpus <- tm_map(ap.corpus, function(x) removeWords(x, c("ukoer","oer")))
  # term-frequency table, sorted most-frequent first
  ap.tdm <- TermDocumentMatrix(ap.corpus)
  ap.m <- as.matrix(ap.tdm)
  ap.v <- sort(rowSums(ap.m),decreasing=TRUE)
  ap.d <- data.frame(word = names(ap.v),freq=ap.v)
  pal2 <- brewer.pal(8,"Dark2")
  svg(paste(project,"-wordcloud-group",groups[i,1],".svg", sep=""))
  # or png if you prefer
  #png(paste(project,"-wordcloud-group",groups[i,1],".png", sep=""), width=1280,height=800)
  wordcloud(ap.d$word,ap.d$freq, scale=c(8,.2),min.freq=3,
            max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal2)
  # close the graphics device so the svg/png file is actually flushed to disk
  # (the scraped copy never called, leaving the device open)
  cc <- c("text", "size")
  colnames(ap.d) <- cc
  # write tables for each group with top 100 word freq
  # do stuff with row
}
# get data csv for template (republish the spreadsheet to make sure the data is fresh)
tab2 = gsqAPI(key,queryB,gid)
# NOTE(review): trailing GitHub page chrome from the scrape — not R code.
# Jump to Line
# Something went wrong with that request. Please try again.