# Out-of-scope Doc2Vec
Creates Doc2Vec representations of the websites in the out-of-scope (OOS) list.

In [1]:
# Imports
import csv
import re
from pprint import pprint

import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

Generate Doc2Vec for websites.

In [23]:
# Input CSV; the code below reads its "orig_url" and "text" columns.
data_path = "uri_scores.csv"
# Module-level frame: the corpus iterator and the CSV export cell both read
# this global directly, so the name `file` must stay as it is.
file = pd.read_csv(data_path)

def iter_urls(file):
    """Yield ``(url, text)`` pairs, one per row of ``file``, in row order.

    Args:
        file: DataFrame with an ``orig_url`` column and a ``text`` column.

    Yields:
        tuple: ``(orig_url value, str(text value))``. The text is always
        passed through ``str``, so a non-string cell is stringified
        (a missing value, i.e. NaN, comes out as the literal ``"nan"``).
    """
    # Walk only the two columns that are needed. `iterrows()` would build a
    # full Series spanning every column for each row, and this generator is
    # consumed again on every pass the model makes over the corpus.
    for url, text in zip(file["orig_url"], file["text"]):
        yield url, str(text)


class TaggedWebpageDocument(object):
    """Re-iterable corpus of ``TaggedDocument`` objects, one per webpage.

    Every call to ``__iter__`` walks ``iter_urls(file)`` from the start, so
    the same instance can be iterated more than once.
    """

    def __iter__(self):
        """Yield one ``TaggedDocument`` per ``(url, text)`` pair.

        The text is tokenised by replacing each run of characters that are
        neither word characters nor whitespace with a single space, splitting
        on whitespace and discarding empty pieces. The URL is the sole tag.
        """
        for url, text in iter_urls(file):
            depunctuated = re.sub(r"[^\w\s]+", " ", text)
            pieces = re.split(r"\s+", depunctuated)
            # Leading/trailing whitespace leaves empty strings after the split.
            words = list(filter(None, pieces))
            yield TaggedDocument(words, [url])


# A re-iterable corpus object (not a one-shot generator) is handed to Doc2Vec.
documents = TaggedWebpageDocument()

# Passing the corpus to the constructor trains the model immediately.
model = Doc2Vec(
    documents,
    vector_size=256,
    window=7,
    workers=4,
)

# Persist the trained model under the name the reload cell expects.
model.save("oos-doc2vec")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Reload the model that the training cell saved as "oos-doc2vec"
# (presumably so the cells below can run without retraining).
model = Doc2Vec.load("oos-doc2vec")

Small sanity checks: list the ten most similar documents for a few known URLs.

In [19]:
# Print the 10 nearest document vectors for each probe URL, in this order.
for probe_url in (
    "https://itunes.apple.com",
    "https://begrep.difi.no",
    "https://www.sharp.fi",
):
    neighbours = model.docvecs.most_similar(positive=[probe_url], topn=10)
    pprint(neighbours)

[('http://ax.itunes.apple.com', 0.9485054016113281),
 ('http://itunes.apple.com', 0.9328069090843201),
 ('https://geo.itunes.apple.com', 0.9216475486755371),
 ('https://www.apple.com', 0.8240509033203125),
 ('https://www.amazon.com', 0.8226394653320312),
 ('http://www.aidostage.fr', 0.8219770193099976),
 ('http://www.correcomalma.com', 0.8207328915596008),
 ('http://amzn.com', 0.8118586540222168),
 ('http://www.dokteronline.it', 0.811695396900177),
 ('http://amazon.com', 0.8116165399551392)]
[('http://www.mote.no', 0.8155286312103271),
 ('http://planthunters-gartentagebuch.blogspot.com', 0.8011577129364014),
 ('http://vicoeiendom.no', 0.7955466508865356),
 ('http://www.vulkanmatsal.no', 0.7936769723892212),
 ('http://blinktunet.no', 0.7916043996810913),
 ('http://www.bagszip.com', 0.7908911108970642),
 ('http://www.naob.no', 0.7899017333984375),
 ('http://www.fujitsu-siemens.no', 0.7897645235061646),
 ('http://www.autotransport.no', 0.7897256016731262),
 ('http://www.ankerplassen.com',

Write to CSV for later analysis.

In [24]:
# Sanity check: number of metadata rows vs. number of document tags in the model.
print(file.shape, len(model.docvecs.doctags))

# Index the metadata once instead of scanning the whole frame for every tag
# (the per-tag boolean mask made the export quadratic in the number of rows).
# `setdefault` keeps the FIRST row seen for a URL, which is the row the old
# `file[file["orig_url"] == tag].values[0]` lookup selected.
row_values = file.values
first_row_of = {}
for position, url in enumerate(file["orig_url"]):
    first_row_of.setdefault(url, position)

# Take the number of vector columns from the model rather than repeating the
# training-time constant, so the header cannot drift from the actual vectors.
header = list(file.columns) + [f"v{i}" for i in range(model.vector_size)]

# `with` guarantees the file is flushed and closed even if a row fails.
# newline="" is what the csv module expects; lineterminator="\n" keeps the
# one-row-per-line layout. UTF-8 matches the default pd.read_csv decodes with.
with open("oos-doc2vec.csv", "w", newline="", encoding="utf-8") as fw:
    # csv.writer quotes fields containing quotes or line breaks, which the
    # previous hand-rolled ",".join() wrote raw and thereby broke the row.
    writer = csv.writer(fw, lineterminator="\n")
    writer.writerow(header)
    for tag in model.docvecs.doctags:
        vec = model.docvecs[tag]
        rest = row_values[first_row_of[tag]]
        # The comma -> "|" substitution is kept so that existing consumers of
        # this file see the same field contents as before.
        writer.writerow(
            [str(v).replace(",", "|") for v in rest] + [str(v) for v in vec]
        )


(72952, 40) 72952
