# Out-of-scope Doc2Vec
Creates Doc2Vec representations from the OOS list.

In [3]:
# Imports
import re
from collections import Counter
from pprint import pprint

import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


Generate Doc2Vec for websites.

In [5]:
# CSV holding the crawled URLs; iter_urls() below reads its "original_url"
# and "text" columns.
data_path = "../uri_scores.csv"
file = pd.read_csv(data_path)

def iter_urls(file):
    """Yield a ``(url, text)`` pair for every row of ``file``.

    Parameters
    ----------
    file : pandas.DataFrame
        Table with an ``original_url`` column and a ``text`` column.

    Yields
    ------
    tuple
        ``(url, text)``: ``url`` is the row's ``original_url`` value as-is,
        ``text`` is the row's ``text`` value passed through ``str`` (so a
        missing text comes out as the string ``"nan"``).
    """
    # Walk only the two needed columns. DataFrame.iterrows() would build a
    # full Series for every row, on every pass made over the corpus.
    for url, text in zip(file["original_url"], file["text"]):
        yield url, str(text)


class TaggedWebpageDocument(object):
    """Re-iterable corpus: one ``TaggedDocument`` per row of the global ``file``.

    Every pass re-reads ``file`` through ``iter_urls``. A document's words are
    the non-empty pieces left after punctuation runs are replaced by a space
    and the text is split on whitespace; its single tag is the page URL.
    """

    # Runs of characters that are neither word characters nor whitespace.
    _NON_WORD_RUN = re.compile(r"[^\w\s]+")
    # Runs of whitespace, used as the token separator.
    _WHITESPACE_RUN = re.compile(r"\s+")

    def __iter__(self):
        for url, text in iter_urls(file):
            spaced = self._NON_WORD_RUN.sub(" ", text)
            # Splitting can leave empty strings at the edges; drop them.
            tokens = [piece for piece in self._WHITESPACE_RUN.split(spaced) if piece]
            yield TaggedDocument(tokens, [url])


# `documents` is an object with __iter__ (not a one-shot generator), so the
# corpus can be traversed more than once.
documents = TaggedWebpageDocument()
# Train with a context window of 7 and 256-dimensional vectors, using 4 workers.
model = Doc2Vec(documents, window=7, vector_size=256, workers=4)
# Persist the trained model so it can be reloaded with Doc2Vec.load("oos-doc2vec").
model.save("oos-doc2vec")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Reload the model previously written with model.save("oos-doc2vec"),
# replacing whatever `model` currently refers to.
model = Doc2Vec.load("oos-doc2vec")

Small tests.

In [6]:
# Spot-check the model: print the ten nearest document tags for a few
# hand-picked URLs.
for probe_url in (
    "https://itunes.apple.com",
    "https://begrep.difi.no",
    "https://www.sharp.fi",
):
    neighbours = model.docvecs.most_similar(positive=[probe_url], topn=10)
    pprint(neighbours)

[('http://itunes.apple.com', 0.9402926564216614),
 ('http://ax.itunes.apple.com', 0.938474178314209),
 ('https://geo.itunes.apple.com', 0.9134922027587891),
 ('http://www.ghbellavista.it', 0.8842248320579529),
 ('https://www.amazon.com', 0.8706847429275513),
 ('http://www.etudiemploi.fr', 0.8702532052993774),
 ('http://www.teknomec.it', 0.863261878490448),
 ('http://www.moviepilot.it', 0.8608096837997437),
 ('http://www.vinzetlou.fr', 0.8588129281997681),
 ('http://www.sandrarossi.it', 0.8583096265792847)]
[('https://klubbadmin.nif.no', 0.7878474593162537),
 ('https://www.alleteller.no', 0.7743078470230103),
 ('http://www.trenmedoss.no', 0.7734191417694092),
 ('http://tonstadost.backe.no', 0.7687683701515198),
 ('http://www.ark.no', 0.7686697840690613),
 ('http://www.klimagassregnskap.no', 0.7684811353683472),
 ('https://www.bokselskap.no', 0.7679361701011658),
 ('http://valbobehandling.no', 0.767586350440979),
 ('http://overhallafjos.no', 0.7672936320304871),
 ('http://econpartner.no'

Run dimensionality reduction.

In [None]:
# PCA with default settings (no component limit given) over the matrix of
# document vectors.
pca = PCA()
doc_matrix = model.docvecs.vectors_docs
trans = pca.fit_transform(doc_matrix)

print(trans.shape)

# Cumulative explained-variance ratio as a function of how many leading
# components are kept.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
# t-SNE is stochastic; pin its random state so re-running this cell starts
# from the same seed and the cached array below can be regenerated.
reduced = TSNE(random_state=42).fit_transform(trans)

# Cache the embedding on disk; it is read back with np.load("veidemann-reduced.npy").
np.save("veidemann-reduced.npy", reduced)

In [None]:
# Read back the array previously stored with np.save("veidemann-reduced.npy", ...).
reduced = np.load("veidemann-reduced.npy")


Write to CSV for later analysis.

In [7]:
doc_vectors = model.docvecs.vectors_docs

# Show the number of input rows next to the vector matrix shape. The two row
# counts should agree, because the frames below are aligned on their default
# integer index. (Previously the shape was wrapped in a second print(), which
# printed it on its own line and made this line end in "None".)
print(len(file), doc_vectors.shape)

# Name the vector columns v0..v{n-1} from the actual vector width rather than
# a hard-coded 256, so a change of vector_size cannot break the column list.
vector_df = pd.DataFrame(
    data=doc_vectors,
    columns=[f"v{i}" for i in range(doc_vectors.shape[1])],
)
conc = pd.concat([file, vector_df], axis=1)
conc.to_csv("oos-doc2vec.csv", index=False)

(68675, 256)
68675 None


In [8]:
# Query with every URL whose norvegica_score exceeds 0.5 and print the 30
# closest document tags. NOTE(review): this relies on most_similar leaving
# the query tags themselves out of the result, so that only lower-scoring
# sites are listed - confirm against the gensim version in use.
high_scoring = conc["norvegica_score"] > 0.5
high_scoring_urls = conc.loc[high_scoring, "original_url"].tolist()
pprint(model.docvecs.most_similar(positive=high_scoring_urls, topn=30))

[('http://www.w-h-y.no', 0.9652109146118164),
 ('http://www.christianlouboutinredbottoms.com', 0.9632914662361145),
 ('https://www.mepalshop.dk', 0.9628714323043823),
 ('http://www.shining.no', 0.9627617597579956),
 ('https://www.soehnleshop.dk', 0.9626474976539612),
 ('http://www.wella.no', 0.9598065614700317),
 ('https://www.sharp.eu', 0.9595633745193481),
 ('http://www.jacobyoung.no', 0.9576820135116577),
 ('http://www.careerjet.vn', 0.9563694000244141),
 ('http://sketchingstamperdigi-stamps.blogspot.com', 0.95531165599823),
 ('https://www.scanjet.se', 0.9536997079849243),
 ('http://www.ec.europa.eu', 0.9532003402709961),
 ('http://fotoartikler.no', 0.9499859809875488),
 ('http://scanjet.se', 0.9488911032676697),
 ('http://www.antonsoggiu.com', 0.9487696886062622),
 ('https://www.sgs.ph', 0.9487102031707764),
 ('https://www.palookaville.no', 0.9487078785896301),
 ('https://bamble.nett.itum.tv', 0.9486806988716125),
 ('http://www.careerjet.com.kw', 0.9486024975776672),
 ('http://www.

Convert to a [TensorFlow embedding projector](https://projector.tensorflow.org/)-compatible format (one TSV file of vectors and one TSV file of metadata).

In [9]:
fn = "oos-doc2vec.csv"
csv = pd.read_csv(fn, index_col=False)

# Raw, anchored pattern: DataFrame.filter(regex=...) matches with re.search,
# so an unanchored "v<digits>" would also select any metadata column that
# merely contains such a substring.
filt = r"^v\d+$"
# Take the vector columns from the file itself instead of assuming 256 of them.
vec_columns = list(csv.filter(regex=filt).columns)

# Drop rows without a complete vector, then mark remaining gaps in the
# metadata with the literal string "missing".
csv = csv.dropna(subset=vec_columns)
csv = csv.fillna("missing")

vec = csv[vec_columns]
meta = csv.drop(vec_columns, axis=1)
# Optional: leave the page text out of the metadata file.
# meta = meta.drop("text", axis=1)


def _flatten_newlines(value):
    """Return ``value`` with newlines replaced by spaces if it is a ``str``.

    Any other value is returned unchanged.
    """
    return value.replace("\n", " ") if isinstance(value, str) else value


# Keep every metadata record on one TSV line. This is done value by value
# because the .str accessor turns non-string entries into NaN, which would
# erase the numbers in any column that mixes numbers with "missing".
for column in meta:
    if not pd.api.types.is_numeric_dtype(meta[column]):
        meta[column] = meta[column].map(_flatten_newlines)

meta = meta.set_index("original_url")

# Vectors: no header and no index. Metadata: header row, URL as first column.
vec.to_csv(fn.replace(".csv", "-vec.tsv"), header=False, sep="\t", index=False)
meta.to_csv(fn.replace(".csv", "-meta.tsv"), sep="\t", index=True)

For Weka compatibility

In [10]:
import arff

# Write the combined table, minus the "text" column and with gaps filled by
# the string "missing", to oos.arff under the relation name "ooslist".
without_text = conc.drop(columns="text")
conc_notxt = without_text.fillna("missing")
arff.dump("oos.arff", conc_notxt.values, "ooslist", conc_notxt.columns)

Generate bar charts (nominal columns) and histograms (numeric columns) for the metadata DataFrame.

In [11]:
# 1-based positions of the columns of `meta` that are treated as nominal.
nominal = (1, 2, 3, 4, 6, 7, 10, 11, 14, 15, 23)

# One PNG per plottable column, named after the column, with a log-scaled y axis.
for position in range(1, len(meta.columns) + 1):
    column = meta.iloc[:, position - 1]
    if position in nominal:
        # Nominal column: bar chart of its 20 most frequent values.
        top_labels, top_counts = zip(*Counter(column).most_common(20))
        tick_positions = np.arange(len(top_labels))
        chart = plt.bar(tick_positions, top_counts, 1)
        plt.xticks(tick_positions, top_labels, rotation=90)
    elif column.dtype == np.float64:
        chart = plt.hist(column.values, bins=101, bottom=0)
    elif column.dtype == np.int64:
        # One bin per integer value up to the maximum, capped at 101 bins.
        bin_count = min(max(column.values) + 1, 101)
        chart = plt.hist(column.values, bins=bin_count, bottom=0)
    else:
        # Neither nominal nor float64/int64: no chart for this column.
        continue
    plt.semilogy()
    plt.suptitle(column.name, y=0.95)
    plt.savefig(f"{column.name}.png", bbox_inches="tight")
    # Clear the figure so the next column starts from an empty canvas.
    plt.clf()

<Figure size 432x288 with 0 Axes>