# GIAN 9b: Building Networks

## 1. Setting up the data

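This step takes a long time. You can skip the processing and just load the already processed data in *step 2*, but you still need to run the import cell below, because later cells rely on those imports.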

We will use spaCy to tokenize the text, remove all the punctuation, and then save the data again in JSON format. In our new format, each piece of raw text will be replaced by a list of lemmas.

In [None]:
import json
import spacy
from collections import *
from itertools import *
from math import log
import en_core_web_sm

In [None]:
# Load the small English spaCy pipeline with the "parser" and "ner"
# components disabled.
nlp=en_core_web_sm.load(disable=["parser","ner"])
def extract_lemmas(pipeline, s):
    """Return the lemmas of all non-punctuation tokens in a text.

    Args:
        pipeline: Callable that turns a string into an iterable of tokens
            exposing ``lemma_`` and ``pos_`` attributes.
        s: The raw text to process.

    Returns:
        A list with one lemma per token whose part-of-speech tag is not
        ``'PUNCT'``, in the order the pipeline yields the tokens.
    """
    # Compare by value: ``is not`` tests object identity, which is not
    # guaranteed to hold for equal strings and may let punctuation through.
    return [word.lemma_ for word in pipeline(s) if word.pos_ != 'PUNCT']

In [None]:
# Read the raw posts from language_log.json into memory.
with open("language_log.json", "r", encoding="utf-8") as raw_file:
    posts = json.load(raw_file)

In [None]:
# tqdm.tqdm_notebook is a deprecated alias; tqdm.notebook.tqdm is the
# supported notebook progress bar.
from tqdm.notebook import tqdm as progress

# Replace every raw text field in place by its list of lemmas.
for post in progress(posts, desc="Processed"):
    post['title'] = extract_lemmas(nlp, post['title'])
    post['entry'] = extract_lemmas(nlp, post['entry'])
    # Posts without a 'comments' key are treated as having no comments.
    for comment in post.get('comments', []):
        comment['body'] = extract_lemmas(nlp, comment['body'])

In [None]:
# Persist the lemmatised posts; ensure_ascii=False writes non-ASCII
# characters as-is instead of \u-escaping them.
with open("language_log_processed.json", "w", encoding="utf-8") as processed_file:
    json.dump(posts, processed_file, ensure_ascii=False)

## 2. Setting up simple networks

In [None]:
# install networkx if you haven't done so yet
!pip install networkx

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Load the processed posts (lists of lemmas instead of raw text) back in.
with open("language_log_processed.json", "r", encoding="utf-8") as processed_file:
    posts = json.load(processed_file)

As our first exercise, let's make a simple Graph where we connect commenters together when they comment on the same post.

We will only use commenters who commented at least 100 times in total.

In [None]:
# Number of comments written by each author, over all posts.
commenters = Counter(
    comment['author']
    for post in posts
    for comment in post.get('comments', [])
)

In [None]:
# Authors with at least 100 comments in total.
frequent_commenters = {
    author for author, n_comments in commenters.items() if n_comments >= 100
}

Let's also build a set of pairs of commenters (commenters who commented on the same post at least once). The order within a pair carries no meaning.

In [None]:
# Every pair of frequent commenters that commented on at least one common post.
common_commenters = set()
for post in posts:
    post_authors = {comment['author'] for comment in post.get('comments', [])}
    # Only frequent commenters take part in the network.
    shared = post_authors & frequent_commenters
    common_commenters.update(combinations(shared, 2))

Now we can build the graph of commenters

In [None]:
# Undirected graph with the frequent commenters as nodes and the pairs in
# common_commenters as edges.
G=nx.Graph()
# Nodes are added explicitly so that a commenter without any edge is still
# part of the graph.
G.add_nodes_from(frequent_commenters)
G.add_edges_from(common_commenters)

And we can get some typical information from the Graph

In [None]:
# number of commenters (nodes) in the network
G.number_of_nodes()

In [None]:
# number of links between commenters (edges) in the network
G.number_of_edges()

In [None]:
# how many other commenters is each commenter connected to
# (the degree of every node)
G.degree()

In [None]:
# Commenters ordered from most to least connected; show the top ten.
# The sort is stable, so commenters with equal degree keep the graph's order.
ranked_commenters = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)
ranked_commenters[:10]

In [None]:
# who are the least connected commenters?
# (the last ten entries of the ranking, which is sorted by descending degree)
ranked_commenters[-10:]

In [None]:
# how many components are there in the network, i.e. subnetworks that are not connected to each other
components=list(nx.connected_components(G))
len(components)

In [None]:
# Shortest path between the two least connected commenters, i.e. the last
# two entries of the degree ranking.
least_connected = [name for name, _degree in ranked_commenters[-2:]]
# NOTE: networkx raises NetworkXNoPath when the two are in different components.
nx.shortest_path(G, *least_connected)

In [None]:
# how many different shortest paths are there between these commenters?
# (all_shortest_paths returns a generator, so it is materialised before counting)
len(list(nx.all_shortest_paths(G, *least_connected)))

In [None]:
# how close are commenters to other commenters on average?
# (closeness centrality of every node)
nx.closeness_centrality(G)

A more complex example. Graphs based on commenters' language use.

In [None]:
import unicodedata
def has_control_characters(word):
    """Tell whether a string contains a character from a Unicode "C" category.

    Args:
        word: The string to inspect.

    Returns:
        True if at least one character has a Unicode general category whose
        name starts with "C", otherwise False. An empty string yields False.
    """
    # any() also covers the empty string, for which the earlier loop never
    # assigned a result and raised UnboundLocalError.
    return any(unicodedata.category(character)[0] == "C" for character in word)

In [None]:
# Collect, per frequent commenter, every word from their comment bodies,
# skipping words that contain control characters.
commenter_words = defaultdict(list)
for post in posts:
    for comment in post.get("comments", []):
        author = comment['author']
        if author not in frequent_commenters:
            continue
        for token in comment['body']:
            if has_control_characters(token):
                continue
            commenter_words[author].append(token)

In [None]:
# Document frequency: for every word, the number of commenters who use it
# (each commenter counts as one "document").
df = {}
for vocabulary in map(set, commenter_words.values()):
    for word in vocabulary:
        df[word] = df.get(word, 0) + 1

# Inverse document frequency, computed as log(1 + n / df) with n the number
# of frequent commenters.
n = len(frequent_commenters)
idf = {word: log(1 + n / count) for word, count in df.items()}

For each commenter, find the 50 words with the highest $\mathrm{TF} \times \mathrm{IDF}$ score (the term frequency is log-scaled).

In [None]:
# number of informative words to keep per commenter
niw=50

In [None]:
ciw={} # commenter informative words: commenter -> set of their informative words

def highest(d, n):
    """Return the keys of the ``n`` best-scoring entries of a dictionary.

    Args:
        d: Mapping from item to a numeric score.
        n: Maximum number of items to return.

    Returns:
        A list of keys ordered from highest to lowest score. Items with equal
        scores keep the order in which they appear in ``d``.
    """
    ranked = sorted(d.items(), key=lambda pair: -pair[1])
    top = ranked[:n]
    return [key for key, _score in top]

# Score each of a commenter's words by log-scaled term frequency times IDF
# and keep the `niw` best ones as a set.
for commenter, words in commenter_words.items():
    tf_idf = {
        word: log(count + 1) * idf[word]
        for word, count in Counter(words).items()
    }
    ciw[commenter] = set(highest(tf_idf, niw))

In [None]:
# inspect the informative words selected for each commenter
ciw

In [None]:
# Graph linking two commenters whenever their informative-word sets overlap.
G = nx.Graph()
G.add_nodes_from(frequent_commenters)
G.add_edges_from(
    (first, second)
    for first, second in combinations(frequent_commenters, 2)
    if not ciw[first].isdisjoint(ciw[second])
)

In [None]:
# histogram of the node degrees (number of connections per commenter)
plt.hist([degree for commenter, degree in nx.degree(G)])

In [None]:
# draw the graph with networkx's default layout
nx.draw(G)

In [None]:
# draw the same graph with the Kamada-Kawai layout
nx.draw_kamada_kawai(G)

In [None]:
# write the graph to a file in GraphML format
# (networkx is imported as `nx`; given a path, write_graphml opens and
# closes the file itself, so no file handle is left open)
nx.write_graphml(G, "commenters_informative_words.graphml")

In [None]:
# Construct the same graph, but now with weights on the edges.
# The weight is the number of shared informative words divided by `niw`.
G = nx.Graph()
G.add_nodes_from(frequent_commenters)
for commenter_a, commenter_b in combinations(frequent_commenters, 2):
    n_shared = len(ciw[commenter_a].intersection(ciw[commenter_b]))
    # Pairs without any shared word get no edge; adding them with weight 0
    # would connect every pair and turn this into a complete graph.
    if n_shared > 0:
        G.add_edge(commenter_a, commenter_b, weight=n_shared / niw)

In [None]:
# write the graph to a file in GraphML format
# (networkx is imported as `nx`; given a path, write_graphml opens and
# closes the file itself, so no file handle is left open)
nx.write_graphml(G, "commenters_informative_words_weighted.graphml")

In [None]:
# draw the weighted graph with networkx's default layout
nx.draw(G)

In [None]:
# make a bipartite graph
from networkx.algorithms import bipartite
# start from an empty undirected graph
BG=nx.Graph()

In [None]:
# Commenter nodes carry a "C: " prefix (presumably to keep them apart from
# word nodes with the same text) and are tagged with bipartite=0.
commenter_nodes = list(map("C: {:s}".format, frequent_commenters))
BG.add_nodes_from(commenter_nodes, bipartite=0)

In [None]:
# All informative words of all commenters, merged into one set.
words = set().union(*ciw.values())

In [None]:
# add the informative words as nodes, tagged with bipartite=1
BG.add_nodes_from(words, bipartite=1)

In [None]:
# Connect every commenter node to each of that commenter's informative words.
BG.add_edges_from(
    ("C: {:s}".format(commenter), word)
    for commenter, informative_words in ciw.items()
    for word in informative_words
)

In [None]:
# sanity check: the graph should be bipartite
nx.is_bipartite(BG)

In [None]:
# Project the bipartite graph onto the word nodes, with collaboration weights.
PGwords = bipartite.collaboration_weighted_projected_graph(BG, words)

In [None]:
# Write the word projection in GraphML format. networkx is imported as `nx`;
# given a path, write_graphml opens and closes the file itself.
nx.write_graphml(PGwords, "pg_words.graphml")

In [None]:
# Project the bipartite graph onto the commenter nodes, with collaboration weights.
PGcommenters = bipartite.collaboration_weighted_projected_graph(BG, commenter_nodes)

In [None]:
# Write the commenter projection in GraphML format. networkx is imported as
# `nx`; given a path, write_graphml opens and closes the file itself.
nx.write_graphml(PGcommenters, "pg_commenters.graphml")