# Out-of-scope Doc2Vec
Creates Doc2Vec representations from the OOS list.

In [2]:
# Imports
import re
from pprint import pprint

import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

Generate Doc2Vec for websites.

In [4]:
data_path = "uri_scores.csv"
file = pd.read_csv(data_path)

def iter_urls(file):
    for i, row in file.iterrows():
        url, text = row["original_url"], str(row["text"])
        yield url, text


class TaggedWebpageDocument(object):
    def __iter__(self):
        for url, text in iter_urls(file):
            words = [c for c in re.split(r"\s+", re.sub(r"[^\w\s]+", " ", text)) if len(c) > 0]
            yield TaggedDocument(words, [url])


documents = TaggedWebpageDocument()
model = Doc2Vec(documents, window=7, vector_size=256, workers=4)
model.save("oos-doc2vec")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
model = Doc2Vec.load("oos-doc2vec")

Small tests.

In [5]:
pprint(model.docvecs.most_similar(positive=["https://itunes.apple.com"], topn=10))
pprint(model.docvecs.most_similar(positive=["https://begrep.difi.no"], topn=10))
pprint(model.docvecs.most_similar(positive=["https://www.sharp.fi"], topn=10))

TypeError: '<' not supported between instances of 'str' and 'int'

Write to CSV for later analysis.

In [6]:
print(file.shape, len(model.docvecs.doctags))
fw = open("oos-doc2vec.csv", "w")
fw.write(",".join(file.columns) + "," + ",".join([f"v{i}" for i in range(256)]) + "\n")
for tag in model.docvecs.doctags:
    vec = model.docvecs[tag]
    rest = file[file["original_url"] == tag].values[0]
    fw.write(",".join([str(v).replace(",", "|") for v in rest]) + "," + ",".join([str(v) for v in vec]) + "\n")

(70631, 41) 70631


For converting to [TensorFlow embedding projector](https://projector.tensorflow.org/) compatible format

In [7]:
import pandas as pd

fn = "oos-doc2vec.csv"
csv = pd.read_csv(fn, index_col=False)

csv = csv.dropna(subset=[f"v{i}" for i in range(256)])
csv = csv.fillna("missing")

filt = "v\d+"
tsv1 = csv.filter(regex=filt)
tsv2 = csv.drop(tsv1.columns, axis=1)
tsv2 = tsv2.drop("text", axis=1)

tsv2 = tsv2.set_index("original_url")

tsv1.to_csv(fn.replace(".csv", "-vec.tsv"), header=False, sep="\t", index=False)
tsv2.to_csv(fn.replace(".csv", "-meta.tsv"), sep="\t", index=True)

  interactivity=interactivity, compiler=compiler, result=result)


For Weka compatibility

In [9]:
tsv2.reset_index().to_csv("oos.csv", index=False, quotechar="`")