# Out-of-scope Doc2Vec
Creates Doc2Vec representations of the webpages in the out-of-scope (OOS) URL list.

In [10]:
# Imports
import re
from pprint import pprint

import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

Train a Doc2Vec model on the webpage texts, tagging each document with its URL.

In [11]:
# Load the input table; the cells below read its "original_url" and "text" columns.
data_path = "uri_scores.csv"
file = pd.read_csv(filepath_or_buffer=data_path)

def iter_urls(file):
    """Yield a ``(url, text)`` pair for every row of ``file``.

    Args:
        file: DataFrame providing ``original_url`` and ``text`` columns.

    Yields:
        tuple: the row's ``original_url`` value and its ``text`` value
        passed through ``str`` (so non-string values are stringified).
    """
    yield from (
        (record["original_url"], str(record["text"]))
        for _, record in file.iterrows()
    )


class TaggedWebpageDocument:
    """Iterable of ``TaggedDocument`` objects, one per row of the global ``file``.

    Each call to ``__iter__`` starts a fresh pass over ``iter_urls(file)``,
    so the same instance can be iterated more than once.
    """

    def __iter__(self):
        for url, text in iter_urls(file):
            # Turn every run of characters that are neither word characters
            # nor whitespace into a single space.
            without_punctuation = re.sub(r"[^\w\s]+", " ", text)
            # Split on whitespace runs and discard the empty strings that
            # appear at the edges of the text.
            words = list(filter(None, re.split(r"\s+", without_punctuation)))
            # The URL is the document's only tag.
            yield TaggedDocument(words, [url])


# Build the corpus iterable, train a 256-dimensional Doc2Vec model on it
# (context window of 7, four worker threads) and persist the result to disk.
documents = TaggedWebpageDocument()
model = Doc2Vec(
    documents=documents,
    vector_size=256,
    window=7,
    workers=4,
)
model.save("oos-doc2vec")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Reload the model saved as "oos-doc2vec" by the training cell, so the cells
# below can run without retraining.
model = Doc2Vec.load("oos-doc2vec")

Small sanity checks: list the ten most similar documents for a few known URLs.

In [12]:
# Pretty-print the ten most similar document tags for a few hand-picked URLs.
for probe_url in (
    "https://itunes.apple.com",
    "https://begrep.difi.no",
    "https://www.sharp.fi",
):
    pprint(model.docvecs.most_similar(positive=[probe_url], topn=10))

[('http://itunes.apple.com', 0.9488991498947144),
 ('http://ax.itunes.apple.com', 0.8805733919143677),
 ('http://www.apple.com', 0.810758113861084),
 ('https://www.haugnett.no', 0.7921438217163086),
 ('http://www.lifesalt.it', 0.7871398329734802),
 ('http://www.midikaraoke.fr', 0.7832567095756531),
 ('http://maps.apple.com', 0.7820781469345093),
 ('http://www.sandrarossi.it', 0.781813383102417),
 ('https://www.apple.com', 0.7807577252388),
 ('https://www.neas.mr.no', 0.7788445353507996)]
[('http://vestorcollect.no', 0.7750795483589172),
 ('http://algarheim.backe.no', 0.7714511752128601),
 ('http://www.explorauto.co.cr', 0.7688676714897156),
 ('http://www.annelisenorheim.no', 0.7681679725646973),
 ('https://www.naob.no', 0.7664501667022705),
 ('https://grorud.osloskolen.no', 0.7660972476005554),
 ('http://facebook.no', 0.7657254338264465),
 ('http://monicaslykke.blogspot.com', 0.7653048634529114),
 ('http://t4h.chromecrm.com', 0.7646962404251099),
 ('http://jaktlia.backe.no', 0.76436507

Write to CSV for later analysis.

In [13]:
# Sanity check: number of input rows vs. number of document tags in the model.
print(file.shape, len(model.docvecs.doctags))

# Map each URL to the position of its first row once, instead of scanning the
# whole frame with a boolean mask for every tag (which is quadratic overall).
first_row_of_url = {}
for position, url in enumerate(file["original_url"]):
    first_row_of_url.setdefault(url, position)
row_values = file.values  # same per-row values that file[mask].values[0] produced

# The context manager guarantees the file is flushed and closed before the
# next cell reads it back; the explicit encoding matches pd.read_csv's UTF-8
# default instead of relying on the platform default.
# NOTE(review): this is a hand-rolled CSV — commas inside fields are replaced
# by "|" and nothing is quoted, so a field containing a newline would still
# split a row. Kept as-is to preserve the existing output format.
with open("oos-doc2vec.csv", "w", encoding="utf-8") as fw:
    # Header: the original columns followed by v0..v255 for the vector components.
    fw.write(",".join(file.columns) + "," + ",".join([f"v{i}" for i in range(256)]) + "\n")
    for tag in model.docvecs.doctags:
        vec = model.docvecs[tag]
        rest = row_values[first_row_of_url[tag]]
        fw.write(",".join([str(v).replace(",", "|") for v in rest]) + "," + ",".join([str(v) for v in vec]) + "\n")

(67881, 41) 67881


Convert the CSV to a format compatible with the [TensorFlow Embedding Projector](https://projector.tensorflow.org/): one TSV file with the vectors and one with the metadata.

In [14]:
import pandas as pd

fn = "oos-doc2vec.csv"
csv = pd.read_csv(fn, index_col=False)

# Drop rows that lack any of the 256 vector components, then replace the
# remaining missing values (all in non-vector columns) with the string "missing".
csv = csv.dropna(subset=[f"v{i}" for i in range(256)])
csv = csv.fillna("missing")

# DataFrame.filter(regex=...) matches with re.search, so the pattern is
# anchored to select only columns named exactly v<digits>; unanchored, any
# metadata column that merely contains "v<digit>" would end up in the vector
# file. A raw string avoids the invalid "\d" escape sequence.
filt = r"^v\d+$"
tsv1 = csv.filter(regex=filt)  # vector components only
tsv2 = csv.drop(tsv1.columns, axis=1)  # everything else is metadata
tsv2 = tsv2.drop("text", axis=1)  # the text column is left out of the metadata

tsv2 = tsv2.set_index("original_url")

# Vectors: no header and no index. Metadata: header row, with original_url
# written first as the index column.
tsv1.to_csv(fn.replace(".csv", "-vec.tsv"), header=False, sep="\t", index=False)
tsv2.to_csv(fn.replace(".csv", "-meta.tsv"), sep="\t", index=True)

  interactivity=interactivity, compiler=compiler, result=result)


Export the metadata as a CSV file for Weka, using a backtick as the quote character.

In [19]:
# Turn original_url back into an ordinary column and write the metadata as
# CSV, quoting fields with a backtick instead of the default double quote.
weka_frame = tsv2.reset_index()
weka_frame.to_csv("oos.csv", quotechar="`", index=False)