# Out-of-scope Doc2Vec
Creates Doc2Vec representations of the websites in the out-of-scope (OOS) list.

In [1]:
# Imports
import re
from pprint import pprint

import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

Generate Doc2Vec for websites.

In [2]:
# Load the URI table; the cells below read its "original_url" and "text" columns
# through the module-level name `file`.
data_path = "uri_scores.csv"
file = pd.read_csv(data_path)

def iter_urls(file):
    """Yield ``(url, text)`` pairs, one per row of ``file``, in row order.

    Args:
        file: DataFrame with an "original_url" column and a "text" column.

    Yields:
        Tuple of the row's URL (returned as stored) and its text as a ``str``.
        A missing text (NaN/None) is yielded as the empty string, so that it does
        not turn into the literal word "nan" when converted with ``str()``.
    """
    # Walk only the two needed columns instead of materialising a Series per row.
    for url, text in zip(file["original_url"], file["text"]):
        yield url, ("" if pd.isna(text) else str(text))


class TaggedWebpageDocument(object):
    """Re-iterable corpus built from the module-level ``file`` frame.

    Every iteration starts a fresh pass over ``iter_urls(file)`` and yields one
    ``TaggedDocument`` per row: the words are the maximal runs of word characters
    in the row's text, and the single tag is the row's URL.
    """

    def __iter__(self):
        for page_url, page_text in iter_urls(file):
            # Gives the same tokens as replacing each run of non-word, non-space
            # characters with a space, splitting on whitespace and discarding
            # empty strings.
            tokens = re.findall(r"\w+", page_text)
            yield TaggedDocument(tokens, [page_url])


# Train a 256-dimensional Doc2Vec model (context window 7, 4 worker threads) on
# the tagged web pages, then persist it under "oos-doc2vec".
documents = TaggedWebpageDocument()
model = Doc2Vec(
    documents=documents,
    vector_size=256,
    window=7,
    workers=4,
)
model.save("oos-doc2vec")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Reload the model from the "oos-doc2vec" path that the training cell saves to.
model = Doc2Vec.load("oos-doc2vec")

Small tests.

In [3]:
# Spot-check the model: print the ten nearest document tags for a few known URLs.
for probe_url in (
    "https://itunes.apple.com",
    "https://begrep.difi.no",
    "https://www.sharp.fi",
):
    pprint(model.docvecs.most_similar(positive=[probe_url], topn=10))

[('https://geo.itunes.apple.com', 0.9591037631034851),
 ('http://itunes.apple.com', 0.9363000392913818),
 ('http://ax.itunes.apple.com', 0.9265631437301636),
 ('http://www.kaosagency.it', 0.8383361101150513),
 ('http://www.etudiemploi.fr', 0.8317094445228577),
 ('http://www.costaricamts.com', 0.831321656703949),
 ('http://www.castiholding.it', 0.8302295207977295),
 ('http://amzn.com', 0.8297784328460693),
 ('http://store.apple.com', 0.828658938407898),
 ('http://www.ghbellavista.it', 0.8285951614379883)]
[('https://www.hapro.no', 0.8215649127960205),
 ('http://nordlandsnett.no', 0.8142644166946411),
 ('http://kulas-terrasse.backe.no', 0.8098169565200806),
 ('http://econpartner.no', 0.8095749616622925),
 ('http://www.fujitsu-siemens.no', 0.7995861768722534),
 ('http://etikkradet.no', 0.7992410659790039),
 ('https://www.semac.no', 0.7986610531806946),
 ('http://www.juss24.no', 0.7979243397712708),
 ('http://www.asak.no', 0.7958770990371704),
 ('http://lillsk8.blogspot.com', 0.79252320528

Run dimensionality reduction.

In [None]:
# Fit PCA with its default settings on the document vectors and project them.
pca = PCA()
trans = pca.fit_transform(model.docvecs.vectors_docs)
print(trans.shape)

# Plot explained variance (information kept) vs number of components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
fig, ax = plt.subplots()
ax.plot(cumulative_variance)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance')
plt.show()

In [None]:
# Embed the PCA-projected document vectors with t-SNE. The random state is pinned
# so that re-running this cell reproduces the same layout instead of a different
# random embedding each time.
# NOTE(review): `trans` comes from PCA() with default settings, so it does not
# appear to be truncated to fewer components before t-SNE — confirm whether a
# component cut-off was intended.
reduced = TSNE(random_state=0).fit_transform(trans)

# Persist the embedding so it can be reloaded without recomputing it.
np.save("veidemann-reduced.npy", reduced)

In [None]:
# Reload the embedding from the "veidemann-reduced.npy" file written by the t-SNE cell.
reduced = np.load("veidemann-reduced.npy")


Write to CSV for later analysis.

In [4]:
doc_vectors = model.docvecs.vectors_docs

# Report the row count and the vector matrix shape in one call. (Nesting a print()
# inside another print() shows the shape on its own line and then "None" in its
# place, because print() returns None.)
print(len(file), doc_vectors.shape)

# pd.concat(axis=1) pairs rows by index label, so the table and the vector matrix
# must have the same number of rows for each vector to land next to its own URL.
assert len(file) == doc_vectors.shape[0], (
    "row count of `file` differs from the number of document vectors"
)

# Name the vector columns v0, v1, ... using the actual vector width.
vector_df = pd.DataFrame(
    data=doc_vectors,
    columns=[f"v{i}" for i in range(doc_vectors.shape[1])],
)
conc = pd.concat([file, vector_df], axis=1)
conc.to_csv("oos-doc2vec.csv", index=False)
# Nominal columns: 1,2,3,4,6,7,10,11,14,15,23

(69994, 256)
69994 None


In [15]:
# Find low scoring websites that are most similar to high scoring websites.
# The URLs with norvegica_score > 0.5 are selected with a boolean mask instead of
# a row-by-row iterrows() scan; the resulting list and its order are the same.
high_scoring_urls = conc.loc[conc["norvegica_score"] > 0.5, "original_url"].tolist()
pprint(model.docvecs.most_similar(positive=high_scoring_urls, topn=30))

[('http://www.arendal.com', 0.9722272753715515),
 ('http://marka-rogatec-kulinarika.si', 0.966809868812561),
 ('http://www.aspcontentmanagement.com', 0.9621041417121887),
 ('https://www.soehnleshop.dk', 0.9612629413604736),
 ('https://minside.sb1kapital.no', 0.9585440158843994),
 ('http://www.wella.no', 0.9576130509376526),
 ('https://radio.disney.com', 0.9575710296630859),
 ('http://faktaark.naturbase.no', 0.9573759436607361),
 ('https://esmart24web.no', 0.955369234085083),
 ('http://www.careerjet.vn', 0.9534828662872314),
 ('http://www.annelisenorheim.no', 0.9525358080863953),
 ('http://global.alltech.com', 0.9525244235992432),
 ('http://www.anycar.com', 0.9523997902870178),
 ('http://www.everycar.vn', 0.951771080493927),
 ('http://www.everycar.com.my', 0.9504650831222534),
 ('https://www.homeaway.tw', 0.9502320885658264),
 ('http://www.sinomach-pi.cn', 0.9489997625350952),
 ('http://www.christianlouboutinredbottoms.com', 0.9488970041275024),
 ('http://gerbergear.co.uk', 0.9479470252

Convert to a format compatible with the [TensorFlow embedding projector](https://projector.tensorflow.org/).

In [13]:
fn = "oos-doc2vec.csv"
csv = pd.read_csv(fn, index_col=False)

# Raw-string, anchored pattern: selects exactly the columns named v0, v1, ...
# (DataFrame.filter searches the pattern anywhere in the label, so an unanchored
# pattern would also pick up any other column that merely contains "v<digit>").
filt = r"^v\d+$"
vector_columns = list(csv.filter(regex=filt).columns)

# Drop rows that lack any vector component, then label the remaining gaps
# (which can only be in the metadata columns) as "missing".
csv = csv.dropna(subset=vector_columns)
csv = csv.fillna("missing")

vec = csv[vector_columns]
meta = csv.drop(columns=vector_columns)
# meta = meta.drop("text", axis=1)

# Replace line breaks and tabs inside string cells with a space so that each
# record stays on one line with the expected number of tab-separated fields.
# DataFrame.replace(regex=True) only rewrites string values; numeric cells are
# left as they are (the .str accessor would turn them into NaN in columns that
# mix numbers with the "missing" marker).
meta = meta.replace(r"[\t\r\n]", " ", regex=True)

meta = meta.set_index("original_url")

# Vectors go out without a header row; the metadata keeps its header and URL index.
vec.to_csv(fn.replace(".csv", "-vec.tsv"), header=False, sep="\t", index=False)
meta.to_csv(fn.replace(".csv", "-meta.tsv"), sep="\t", index=True)

Export to ARFF for Weka compatibility.

In [14]:
import arff

# Write every column except the raw page text to "oos.arff" under the relation
# name "ooslist"; empty cells are labelled "missing" first.
conc_notxt = conc.drop(columns="text")
conc_notxt = conc_notxt.fillna("missing")
arff.dump("oos.arff", conc_notxt.values, "ooslist", conc_notxt.columns)
# tsv2.reset_index().to_csv("oos.csv", index=False, quotechar="`")