In [138]:
# imports

import os, codecs
from collections import defaultdict

In [272]:
# configs

# Directory tree that the parsing cell walks with os.walk to find input files.
root_folder = "ToS3"
# Substring a file name must contain to be parsed; also embedded in the output CSV file names.
level = "l2"
# Substring a file name must contain to be parsed; also embedded in the output CSV file names.
year = "2009"

In [273]:
# parse and cleanup
#
# Walk `root_folder`, read every file whose name contains ".txt", `level` and
# `year`, and turn each "source|target|level" line into a host-to-host edge.
# Results:
#   edge_list   -- "source_host$target_host" -> number of lines seen for that pair
#   all_gov     -- every source host, one entry per accepted line (duplicates kept)
#   all_targets -- every target host, one entry per accepted line (duplicates kept)


def _extract_host(url):
    """Return the cleaned host of the first http(s) URL found in `url`.

    Returns "" when `url` contains no "http" or no "://" after it, so callers
    can treat an empty string as "no usable host".
    """
    start = url.find("http")
    # str.find returns -1 when absent and 0 for a match at the very start, so
    # the result must be compared explicitly instead of tested for truthiness.
    if start < 0:
        return ""
    scheme_end = url.find("://", start)
    if scheme_end < 0:
        # "http" without a scheme separator is not a URL; do not slice blindly.
        return ""
    host = url[scheme_end + 3:]
    # keep only what precedes the first path separator
    path_start = host.find("/")
    if path_start > -1:
        host = host[:path_start]
    # remove www. / www2.
    # NOTE(review): replace() drops these substrings anywhere in the host, not
    # only as a leading prefix -- confirm that this is intended.
    host = host.replace("www.", "")
    host = host.replace("www2.", "")
    return host


edge_list = defaultdict(int)
all_gov = list()
all_targets = list()

for root, dirs, files in os.walk(root_folder):
    for f in files:
        if ".txt" in f and level in f and year in f:
            print(f)
            with codecs.open(os.path.join(root, f), encoding="utf8") as rf:
                for line in rf.readlines():
                    # remove webarchive info
                    line = line.replace("https://webarchive.nationalarchives.gov.uk/", "")
                    # split the line into source, target and level
                    unpack = line.split("|")
                    if len(unpack) != 3:
                        continue
                    source, target, _level = unpack

                    source_host = _extract_host(source)
                    target_host = _extract_host(target)
                    if not source_host or not target_host:
                        continue
                    # remove IP addresses (detected by an all-digit first label)
                    if source_host.split(".")[0].isdigit() or target_host.split(".")[0].isdigit():
                        continue
                    all_gov.append(source_host)
                    all_targets.append(target_host)
                    edge_list[source_host + "$" + target_host] += 1

l2_summary_2009_10.txt
l2_pages_2009_7.txt
l2_pages_2009_10.txt
l2_pages_2009_11.txt
l2_pages_2009_6.txt
l2_summary_2009_11.txt
l2_summary_2009_13.txt
l2_pages_2009_4.txt
l2_pages_2009_13.txt
l2_pages_2009_12.txt
l2_pages_2009_5.txt
l2_summary_2009_12.txt
l2_pages_2009_1.txt
l2_pages_2009_16.txt
l2_pages_2009_17.txt
l2_pages_2009_0.txt
l2_pages_2009_2.txt
l2_pages_2009_15.txt
l2_pages_2009_14.txt
l2_pages_2009_3.txt
l2_summary_2009_8.txt
l2_summary_2009_9.txt
l2_summary_2009_4.txt
l2_summary_2009_5.txt
l2_summary_2009_7.txt
l2_summary_2009_6.txt
l2_summary_2009_2.txt
l2_summary_2009_3.txt
l2_summary_2009_1.txt
l2_summary_2009_0.txt
l2_pages_2009_19.txt
l2_pages_2009_18.txt
l2_pages_2009_8.txt
l2_pages_2009_9.txt


In [274]:
# De-duplicate the collected source hosts; the directed-edgelist cell flags a
# target as gov (Target_is_gov) when it appears in this list.
all_gov = [*set(all_gov)]

In [275]:
# number of distinct source hosts left after de-duplication
len(all_gov)

1693

In [276]:
# peek at the first ten edge keys (formatted as "source_host$target_host")
list(edge_list)[:10]

['agender.org.uk$agender.org.uk',
 'agender.org.uk$pfc.org.uk',
 'agender.org.uk$gendertrust.org.uk',
 'agender.org.uk$gires.org.uk',
 'agender.org.uk$womenandequalityunit.gov.uk',
 'agender.org.uk$civilservice.gov.uk',
 'agender.org.uk$eoc.org.uk',
 'agender.org.uk$uk.geocities.com',
 'agender.org.uk$csbf.org.uk',
 'agender.org.uk$opsi.gov.uk']

In [277]:
# create the directed network in a csv file
#
# Writes one row per edge_list entry (Source, Target, Weight, Target_is_gov)
# and, as a by-product, builds the two adjacency maps used by later cells:
#   list_of_cited[source]  -> targets that source links to
#   list_of_citing[target] -> sources that link to target

list_of_cited = defaultdict(list)
list_of_citing = defaultdict(list)
# Set copy of all_gov so the per-edge membership test is a hash lookup instead
# of a linear scan over the list.
gov_hosts = set(all_gov)
with codecs.open("directed_edgelist_%s_%s.csv"%(year,level), "w", "utf8") as wf:
    wf.write("Source,Target,Weight,Target_is_gov\n")
    for k,v in edge_list.items():
        try:
            source,target = k.split("$")
        except ValueError:
            # The key does not split into exactly two parts (a host contains
            # "$"); report it and skip. Only ValueError is caught so that a
            # kernel interrupt still stops the loop.
            print(k)
            continue
        is_gov = target in gov_hosts
        list_of_cited[source].append(target)
        list_of_citing[target].append(source)
        wf.write(source+","+target+","+str(v)+","+str(is_gov)+"\n")

In [278]:
# Link coupling (naive)
#
# For every unordered pair of keys in list_of_cited, the weight is the number
# of distinct targets both keys link to. Pairs with no shared target are not
# stored. Keys are "s1$s2" with s1 preceding s2 in `loc`.

edge_list_coupling = defaultdict(int)
loc = [x for x in list_of_cited.keys()]
# Build each target set once; rebuilding them for every pair repeats the same
# work a quadratic number of times.
cited_sets = {s: set(list_of_cited[s]) for s in loc}
for n,s1 in enumerate(loc):
    cited_by_s1 = cited_sets[s1]
    for s2 in loc[n+1:]:
        weight = len(cited_by_s1.intersection(cited_sets[s2]))
        if weight > 0:
            edge_list_coupling[s1+"$"+s2] = weight

In [279]:
# number of keys of list_of_cited that were paired in the coupling cell
len(loc)

1693

In [280]:
# Write the coupling edges as an undirected, weighted edge list.
with codecs.open("coupling_edgelist_%s_%s.csv"%(year,level), "w", "utf8") as wf:
    wf.write("Source,Target,Type,Weight\n")
    for k,v in edge_list_coupling.items():
        try:
            source,target = k.split("$")
        except ValueError:
            # The key does not split into exactly two parts (a host contains
            # "$"); report it and skip. Only ValueError is caught so that a
            # kernel interrupt still stops the loop.
            print(k)
            continue
        wf.write(source+","+target+",Undirected,"+str(v)+"\n")

In [281]:
# Co-linking (naive)
#
# For every unordered pair of keys in list_of_citing, the weight is the number
# of distinct sources that link to both keys. Pairs with no shared source are
# not stored. Keys are "s1$s2" with s1 preceding s2 in `loc`.

edge_list_col = defaultdict(int)
loc = [x for x in list_of_citing.keys()]
# Build each source set once; rebuilding them for every pair repeats the same
# work a quadratic number of times.
citing_sets = {s: set(list_of_citing[s]) for s in loc}
for n,s1 in enumerate(loc):
    citing_s1 = citing_sets[s1]
    for s2 in loc[n+1:]:
        weight = len(citing_s1.intersection(citing_sets[s2]))
        if weight > 0:
            edge_list_col[s1+"$"+s2] = weight

In [282]:
# number of keys of list_of_citing that were paired in the co-linking cell
len(loc)

16520

In [283]:
# Write the co-link edges as an undirected, weighted edge list.
with codecs.open("colink_edgelist_%s_%s.csv"%(year,level), "w", "utf8") as wf:
    wf.write("Source,Target,Type,Weight\n")
    for k,v in edge_list_col.items():
        try:
            source,target = k.split("$")
        except ValueError:
            # The key does not split into exactly two parts (a host contains
            # "$"); report it and skip. Only ValueError is caught so that a
            # kernel interrupt still stops the loop.
            print(k)
            continue
        wf.write(source+","+target+",Undirected,"+str(v)+"\n")