In [None]:
install.packages(c("stringdist","feedeR","foreach","doParallel","rvest"))

library(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl)
library(foreach)


#feed titles
library(feedeR)

In [1]:

## GATHER RAW DATA

feeds <- c("http://feeds.bbci.co.uk/news/world/rss.xml",
            "http://feeds.bbci.co.uk/news/rss.xml",
            "http://feeds.skynews.com/feeds/rss/uk.xml",
            "http://feeds.skynews.com/feeds/rss/world.xml",
            "http://feeds.skynews.com/feeds/rss/us.xml",
            "http://feeds.reuters.com/Reuters/domesticNews",
            "http://feeds.reuters.com/Reuters/worldNews",
            "http://feeds.foxnews.com/foxnews/national",
            "http://feeds.foxnews.com/foxnews/world",
            "http://rssfeeds.usatoday.com/UsatodaycomWorld-TopStories",
            "http://rssfeeds.usatoday.com/UsatodaycomNation-TopStories",
            "http://rss.nytimes.com/services/xml/rss/nyt/World.xml",
            "http://www.nytimes.com/services/xml/rss/nyt/Africa.xml",
            "http://www.nytimes.com/services/xml/rss/nyt/Americas.xml",
            "http://www.nytimes.com/services/xml/rss/nyt/AsiaPacific.xml",
            "http://www.nytimes.com/services/xml/rss/nyt/Europe.xml",
            "http://www.nytimes.com/services/xml/rss/nyt/MiddleEast.xml",
            "http://www.nytimes.com/services/xml/rss/nyt/US.xml",
            "http://www.telegraph.co.uk/news/rss.xml"
            )



dataset <- list()
for (i in 1:length(feeds)) {
    dataset <- tryCatch({
    extract <- feed.extract(feeds[i])
    extract <- cbind.data.frame(feed = extract$title, extract$items)
    rbind.data.frame(dataset, extract)
    }, error = function(e) { dataset })
}
dataset <- dataset[match(unique(dataset$link), dataset$link),]
titles <- sapply(dataset$title, function(s) {(strsplit(gsub("[[:punct:]]", " ", tolower(s)), " ")) })


Loading required package: foreach
Loading required package: iterators
Loading required package: parallel


Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing
Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing
Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing
Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing
Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing
Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing


In [6]:
nrow(dataset)

In [7]:

#feed contents
#library(rvest)

contents <- foreach(i = 1:nrow(dataset), .combine = rbind, .packages = "rvest") %dopar% {
    #i <- 10
    tryCatch({
        page <- read_html(dataset$link[i])
        paragraphs <- html_nodes(page, "p")
        p_classes <- html_attr(paragraphs, "class")
        p_text <- html_text(paragraphs)
        words <- as.character(unlist(sapply(p_text[is.na(p_classes) | grepl("intro", tolower(p_classes))],
                                            function(s) { strsplit(s, " ")})))
        out <- sapply(words[1:100], function(s) { tolower(gsub("[^[:alnum:][:space:]]", "", s)) })
        names(out) <- NULL
        #c(hash = dataset$hash[i], out)
        c(id=i, out)
    }, error = function(e) { "" })
}
#contents[1,]


In [8]:
nrow(contents)

In [9]:

## VECTORISE DATA

#create wordvector
contentWords <- table(c(contents))
contentWords <- cbind.data.frame(word = tolower(names(contentWords)[1:length(names(contentWords))]),
                                 count = contentWords)

wordVector <- unique(c(tolower(unlist(titles)),
                       tolower(unlist(contents))))
wordVector <- wordVector[wordVector != ""]


vectorisedData <- foreach(i = 1:nrow(dataset), .combine=rbind) %dopar% {
    #i <- 1
    tVector <- integer(length(wordVector))
    row <- contents[contents[, "id"] == i,]
    countVector <- table(c(titles[[i]], row[2:length(row)]))
    for (j in 1:length(countVector)) {
        index <- match(names(countVector)[j], wordVector)
        tVector[index] <- countVector[j]
    }
    t(tVector)
    #vectorisedData <- rbind.data.frame(vectorisedData, t(as.data.frame(tVector))) #tVector
}
names(vectorisedData) <- wordVector

vectorisedData <- as.matrix(vectorisedData)
colnames(vectorisedData) <- wordVector


In [10]:

## Word Bundles
correlationmatrix <- cor(vectorisedData)
bundles <- as.data.frame(foreach(i = 1:ncol(vectorisedData), .combine = rbind) %do% {
    #i<-1
    vec <- correlationmatrix[, i]
    bundle <- vec[vec > 0.9]
    c(id = i,
      bundle = paste(names(bundle[!is.na(bundle)]), collapse = "-"),
      value = sum(bundle[!is.na(bundle)]),
      first = colnames(correlationmatrix)[i],
      length = length(bundle[!is.na(bundle)]))
})
bundles <- bundles[match(unique(bundles$bundle[as.numeric(bundles$value) > 1]), bundles$bundle),]



"the standard deviation is zero"

In [11]:

## Synonyms
library(stringdist)
similaritymatrix <- matrix(foreach(i = 1:length(wordVector), .combine = rbind, .packages = "stringdist") %dopar% { stringdist(wordVector[i], wordVector) },
                           ncol = length(wordVector),
                           nrow = length(wordVector),
                           dimnames = list(wordVector,wordVector))
synonyms <- as.data.frame(foreach(i = 1:ncol(vectorisedData), .combine = rbind) %do% {
    #i<-1
    vec <- similaritymatrix[, i]
    synonym <- vec[vec < 0.25 * length(colnames(similaritymatrix)[i])]
    c(id = i,
      synonym = paste(names(synonym[!is.na(synonym)]), collapse = "-"),
      value = sum(synonym[!is.na(synonym)]))
})
synonyms <- synonyms[match(unique(synonyms$synonym[as.numeric(synonyms$value) > 1]), synonyms$synonym),]


In [12]:

## SELECT FEATURES
analytics <- cbind.data.frame(wordVector,
                              count = sapply(1:length(wordVector), function(w) { sum(vectorisedData[, w]) }),
                              mean = sapply(1:length(wordVector), function(w) { mean(vectorisedData[, w]) }),
                              stdev = sapply(1:length(wordVector), function(w) { sd(vectorisedData[, w]) }),
                              max = sapply(1:length(wordVector), function(w) { max(vectorisedData[, w]) }),
                              min = sapply(1:length(wordVector), function(w) { min(vectorisedData[, w]) }))

analytics$varration <- sapply(1:nrow(analytics), function(a) { analytics$stdev[a] / analytics$mean[a] })

test <- top_n(analytics, 50, analytics$count)
test <- analytics[rank(analytics$count, ties.method = "random"),]

lengths <- sapply(1:nrow(vectorisedData), function(i) { sum(vectorisedData[i,]) })
lengths <- cbind.data.frame(length = lengths)


library(dplyr)
#analytics[sort(analytics$count, decreasing = T),]
#selectedFeatures <- as.character(analytics[!is.na(analytics$varration) & analytics$varration > 2,]$wordVector)
#selectedFeatures <- as.character(top_n(analytics, 200, analytics$varration)$wordVector)
#selectedFeatures <- names(vectorisedData)
selectedFeatures <- as.character(bundles$first[as.numeric(bundles$length)>2])

featureSet <- matrix(as.numeric(vectorisedData[, selectedFeatures]),nrow = nrow(vectorisedData))#,
                               #date = with(dataset, (as.numeric(date) - quantile(as.numeric(date), 0.05)) / (mean(as.numeric(date)) - quantile(as.numeric(date), 0.05))),
                               #hash_ = dataset$hash)



ERROR: Error in eval(expr, envir, enclos): could not find function "top_n"


In [None]:

## K MEANS CLUSTERING
cartesianDistance <- function(v1, v2) {
    #sum(mapply(function(c1, c2) {(c1 - c2) ^ 2 }, v1, v2))^0.5
    ((v1 - v2) %*% t(v1 - v2))[[1]]^0.5
}

generateCentroid <- function() {
    centroid <- double(length(selectedFeatures))
    #centroid[sample(1:length(selectedFeatures), round(mean(lengths$length) + 1))] <- 1
    centroid <- as.numeric(featureSet[sample(1:nrow(dataset),1),])
    centroid
}

k <- 10
centroids <- list()
for (i in 1:k) {
    centroids <- rbind.data.frame(centroids, generateCentroid())
}
centroids <- as.matrix(centroids)
colnames(centroids)<- NULL  

G <- 100
history <- list()
g <- 1
while (g <= G) {
    dataset$cluster <- foreach(i = 1:nrow(featureSet), .combine = rbind) %dopar% {
        #i <- 1
        distances <- sapply(1:k, function(c) {
            #c <- 2
            V1 <- as.matrix(centroids[c,])
            V2 <- as.matrix(featureSet[i, ])
            (t(V1-V2) %*% (V1-V2))[[1]]^0.5
        })
        match(min(distances), distances)
    }

    newcentroids <- centroids
    for (i in 1:k) {
        #i <- 2
        if (nrow(dataset[dataset$cluster == i, ]) > 1) {
            #i <- 1
            members <- as.data.frame(featureSet[dataset$cluster == i,])
            for (j in 1:length(selectedFeatures)) {
                newcentroids[i, j] = mean(members[, j])
            }
        } else {
            newcentroid <- generateCentroid()
            for (j in 1:length(selectedFeatures)) {
                newcentroids[i, j] = newcentroid[j]
            }
        }
    }
    lastCentroids <- as.matrix(centroids)
    centroids <- as.matrix(newcentroids)

    dataset$distance <- foreach(i = 1:nrow(featureSet), .combine = rbind) %dopar% {
        # i <- 1
        V1 <- as.matrix(centroids[dataset$cluster[i],]) #, nrow = 1)
        V2 <- as.matrix(featureSet[i,])
        (t(V1 - V2) %*% (V1 - V2))[[1]] ^ 0.5
    }
                                     
    distances <- double(k)
    for (i in 1:k) {
        distances[i] <- sum(dataset$distance[dataset$cluster == i]) / length(dataset$distance[dataset$cluster == i])
    }
    history <- rbind.data.frame(history, cbind(g, t(distances), sum(distances)))

    g <- g + 1
}
names(history) <- c("generation", 1:k, "total")
View(history)


In [None]:

## ANALYSIS

test<-dataset[dataset$cluster==1,]
View(test)

test <- dataset[order(dataset$cluster, dataset$distance),]
View(test)

test2 <- merge(x = featureSet[, names(featureSet)[!names(featureSet) %in% selectedFeatures]],
               y = dataset,
               by.x = "hash_",
               by.y = "hash")

history

cl <- 9
result <- top_n(test2[test2$clusters_ == cl,], 10, test2$distance_[test2$clusters_ == cl])
View(result)
