# Data

From the Angoulême in 1764 project:
https://histecon.fas.harvard.edu/visualizing/angouleme/overview.html

In [None]:
df = read.csv("EDGESpublic6.11.20.csv")

In [None]:
head(df)

## Step 1: Turn this into a list of unweighted, directed edges.

In [None]:
# Every row of df contributes a Source -> Target edge. Rows whose Type is
# 'Undirected' also contribute the reversed Target -> Source edge, appended
# after all of the forward edges.
is_undirected <- df$Type == 'Undirected'
edgelist <- rbind(
    data.frame(source = df$Source, target = df$Target),
    data.frame(source = df$Target[is_undirected],
               target = df$Source[is_undirected])
)

1. Number of edges?

2. Number of nodes?

Let's also grab the node data.

In [None]:
nodes = read.csv("NODESpublic6.11.20.csv")

In [None]:
head(nodes)

In [None]:
nrow(nodes)

In [None]:
max(nodes$ID)

## 1. Degree distribution

In [None]:
# Out-degree of every node: the number of rows of `edgelist` whose source is
# that node's ID. Entry i corresponds to row i of `nodes`.
# vapply() sizes the result up front instead of growing it with c(), and it
# keeps the loop variable from being left behind as a global called `source`.
degrees <- vapply(
    nodes$ID,
    function(id) sum(edgelist$source == id),
    integer(1)
)

In [None]:
library(ggplot2)

In [None]:
ggplot(data.frame(degrees), aes(degrees)) + geom_histogram()

In [None]:
max(degrees)

In [None]:
nodes[which.max(degrees),]

In [None]:
df[df$Source == 3476,]

## 2. Shortest paths

In [None]:
# Breadth-first search giving the hop distance from `start` to every node ID.
#
# Arguments:
#   start    ID of the node to start from; it is used directly as an index
#            into the result vector.
#   iters    Maximum number of nodes to dequeue and expand. The default, Inf,
#            runs until the queue is empty.
#   edges    Data frame of directed edges with columns `source` and `target`.
#            Defaults to the global `edgelist`.
#   n_nodes  Length of the result vector. Defaults to max(nodes$ID), so the
#            vector can be indexed by node ID.
#
# Returns a numeric vector in which element i is the number of edges on the
# shortest directed path from `start` to node i: 0 for `start` itself and Inf
# for every ID that was not reached.
all_shortest_paths <- function(start, iters = Inf, edges = edgelist,
                               n_nodes = max(nodes$ID)) {
    dists <- rep(Inf, n_nodes)
    dists[start] <- 0

    # Group the targets by source once, so expanding a node is a lookup by
    # name rather than a scan over every edge. split() keeps the targets of
    # each source in their original order.
    successors <- split(edges$target, edges$source)

    # `front` marks the next queue entry to expand; advancing it avoids
    # copying the whole queue on every dequeue.
    queue <- start
    front <- 1

    while (front <= length(queue) && iters > 0) {
        node <- queue[front]
        front <- front + 1
        next_dist <- dists[node] + 1

        # A node with no outgoing edges has no entry, which yields NULL and
        # an empty loop.
        for (target in successors[[as.character(node)]]) {
            if (dists[target] > next_dist) {
                dists[target] <- next_dist
                queue <- c(queue, target)
            }
        }

        iters <- iters - 1
    }

    dists
}

In [None]:
dists = all_shortest_paths(edgelist$source[1])

In [None]:
dists[dists != Inf]

In [None]:
# Mean shortest-path length from each node that has at least one outgoing
# edge to the nodes it can reach (the node itself and unreachable IDs are
# excluded). Each distinct source is searched once: looping over
# edgelist$source directly reruns the search once per edge and counts a node
# once per outgoing edge when the values are averaged afterwards.
meandists <- vapply(
    unique(edgelist$source),
    function(start) {
        reach <- all_shortest_paths(start)
        mean(reach[reach != 0 & reach != Inf])
    },
    numeric(1)
)

In [None]:
meandists

In [None]:
mean(meandists)

## 3. Clustering coefficient

In [None]:
# Clustering score of `node`, read from the global `edgelist`.
#
# Counts the two-step walks node -> j -> k for which k has an edge back to
# `node`, then divides by d * (d - 1), where d is the number of edges leaving
# `node`. The result is NaN when d is 0 or 1, because the division is 0 / 0.
calc_clustering <- function(node) {
    # Targets of every edge that leaves `id`.
    successors <- function(id) edgelist$target[edgelist$source == id]

    first_hop <- successors(node)
    out_degree <- length(first_hop)

    closed_walks <- 0
    for (via in first_hop) {
        # For each second-hop node, does one of its edges lead back to `node`?
        returns_home <- vapply(
            successors(via),
            function(far) node %in% successors(far),
            logical(1)
        )
        closed_walks <- closed_walks + sum(returns_home)
    }

    closed_walks / (out_degree * (out_degree - 1))
}

In [None]:
calc_clustering(edgelist$source[1])

In [None]:
# Clustering score of each node that has at least one outgoing edge,
# computed once per distinct node. Looping over edgelist$source directly
# recomputes the score once per edge and counts a node once per outgoing
# edge when the scores are averaged afterwards.
clustering <- vapply(unique(edgelist$source), calc_clustering, numeric(1))

In [None]:
clustering

In [None]:
mean(clustering[is.finite(clustering)])

## 3. Doing the work with igraph

In [None]:
# Install igraph only when it is missing, so that re-running the notebook
# does not download and reinstall the package every time.
if (!requireNamespace("igraph", quietly = TRUE)) {
    install.packages("igraph")
}

In [None]:
library(igraph)

In [None]:
# Build a directed igraph graph from the (source, target) edge list.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
net <- graph_from_edgelist(as.matrix(edgelist), directed = TRUE)

In [None]:
net

In [None]:
mean_distance(net)

In [None]:
distances(net, mode="out")

In [None]:
alldists = distances(net, mode="out")

In [None]:
dists = alldists[edgelist$source[1],]

In [None]:
dists[dists != Inf]

In [None]:
mean(dists[dists != 0 & dists != Inf])

In [None]:
meandists = apply(alldists, 1, function(dists) mean(dists[dists != 0 & dists != Inf]))

In [None]:
meandists[edgelist$source[1]]

In [None]:
# Average the per-node means, dropping entries that are NA/NaN.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
mean(meandists, na.rm = TRUE)

In [None]:
transitivity(net)

In [None]:
eigen_centrality(net)

In [None]:
length(eigen_centrality(net)$vector)

In [None]:
which.max(eigen_centrality(net)$vector)

In [None]:
nodes[nodes$ID == 3476,]

In [None]:
betweenness(net)

In [None]:
which.max(betweenness(net))

In [None]:
nodes[nodes$ID == 941,]

In [None]:
df[df$Source == 941 | df$Target == 941,]

## 4. Infect someone

In [None]:
# One flag per possible node ID (1 up to the largest ID), all initially
# uninfected. FALSE is spelled out because `F` can be reassigned.
infected <- rep(FALSE, max(nodes$ID))

In [None]:
# Seed the outbreak at node 941.
# NOTE(review): 941 is hard-coded; presumably it is the node reported by
# which.max(betweenness(net)) earlier in the notebook -- confirm.
infected[941] <- TRUE

In [None]:
# Spread the infection one hop per time step until a step adds no new cases.
# Uses the `infected` flags prepared in the cells above and updates them in
# place.
# The running totals start at 0 (time 0) and 1 (time 1). They are collected
# in a plain vector and the data frame is built once at the end, instead of
# calling rbind() on a data frame at every step.
case_counts <- c(0L, 1L)
repeat {
    n_steps <- length(case_counts)
    if (case_counts[n_steps] == case_counts[n_steps - 1]) {
        break
    }
    # Every target of an edge leaving an infected node becomes infected.
    infected[edgelist$target[edgelist$source %in% which(infected)]] <- TRUE
    case_counts <- c(case_counts, sum(infected))
}
epidemic <- data.frame(
    time = seq_along(case_counts) - 1,
    infections = case_counts
)

In [None]:
ggplot(epidemic, aes(time, infections)) + geom_line()

In [None]:
# Final outbreak size when the infection is seeded at each possible node ID
# in turn. Sizes go into a preallocated vector and the data frame is built
# once; calling rbind() on every iteration copies the whole table each time.
n_ids <- max(nodes$ID)
outbreak_size <- integer(n_ids)
for (start in seq_len(n_ids)) {
    infected <- rep(FALSE, n_ids)
    infected[start] <- TRUE
    infections <- 0
    # Spread one hop per pass until a pass adds no new infections.
    while (sum(infected) > infections) {
        infections <- sum(infected)
        infected[edgelist$target[edgelist$source %in% which(infected)]] <- TRUE
    }
    outbreak_size[start] <- infections
}
results <- data.frame(start = seq_len(n_ids), infections = outbreak_size)

In [None]:
head(results)

In [None]:
max(results$infections)

In [None]:
which.max(results$infections)