# Veidemann Doc2Vec
Creates Doc2Vec representations from websites harvested by the Veidemann harvester.


In [6]:
# Imports
import json
import re

import numpy as np
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.decomposition import PCA

Generate Doc2Vec for Veidemann harvested websites.

In [9]:
data_path = "res/extracted_texts/veidemann/texts.ldjson"

def iter_urls(file):
    for line in f:
        j = json.loads(line)  # Each line contains a separate json entry
        url, text = j["requestedUri"], j["text"].lower()
        yield url, text


class TaggedWebpageDocument(object):
    def __iter__(self):
        file = open(data_path)
        for url, text in iter_urls(file):
            words = [c for c in re.split(r"\s+", re.sub(r"[^\w\s]+", "", text)) if len(c) > 0]
            yield TaggedDocument(words, [url])


documents = TaggedWebpageDocument()
model = Doc2Vec(documents, window=5, vector_size=128)

model.save("veidemann-doc2vec")

NameError: name 'f' is not defined

In [None]:
model = Doc2Vec.load("veidemann-doc2vec")

In [4]:
print(model.most_similar(positive=["nrk"]))
print(model.docvecs.most_similar(positive=["https://www.vg.no/"], topn=100))

  """Entry point for launching an IPython kernel.


[('tv2', 0.7558218836784363), ('tv', 0.7535334825515747), ('radio', 0.7260652184486389), ('vg', 0.7234106063842773), ('dagbladet', 0.7229463458061218), ('aftenposten', 0.7105423808097839), ('21', 0.6536192893981934), ('telenor', 0.6498087644577026), ('norsk', 0.6465280055999756), ('mars', 0.6435233354568481)]
[('https://www.dagbladet.no/', 0.824970543384552), ('http://topp.no/', 0.8183221817016602), ('https://www.nettavisen.no/', 0.8131425976753235), ('https://sol.no/', 0.7842087745666504), ('https://www.loggpaa.no/', 0.7810474634170532), ('https://www.vg.no/sport/', 0.7735840082168579), ('https://www.dagbladet.no/video', 0.7711979150772095), ('https://www.seher.no/', 0.7639572620391846), ('http://fxt.no/', 0.762884259223938), ('https://www.adressa.no/', 0.7524651885032654), ('https://www.tv2.no/', 0.741334080696106), ('https://www.dagsavisen.no/rogalandsavis', 0.7364118099212646), ('https://www.nrk.no/', 0.7339894771575928), ('https://www.dagsavisen.no/', 0.7241553068161011), ('https:

Run dimensionality reduction.

In [7]:
reduced = PCA(n_components=2).fit_transform(model.docvecs.vectors_docs)

np.save("veidemann-reduced.npy", reduced)

In [None]:
reduced = np.load("veidemann-reduced.npy")

Write to CSV for later analysis.

In [None]:
n_vec = model.docvecs.vectors_docs.shape[-1]

fw = open("all-pages.csv", "w")
labels = ["url"] + \
         [f"r{i}" for i in range(reduced.shape[-1])] + \
         [f"v{i}" for i in range(n_vec)]
fw.write(",".join(labels) + "\n")
for tag, red, vec in zip(model.docvecs.doctags, reduced, model.docvecs.vectors_docs):
    tag = '"' + re.sub(r'\\', "/", tag) + '"'  # makes it weka compatible
    red = [str(r) for r in red]
    vec = [str(v) for v in vec]
    fw.write(",".join([tag] + red + vec) + "\n")