# Load, save word embeddings from doc2vec models

In [10]:
from gensim.models.doc2vec import Doc2Vec
import os

base_dirpath = '/usr0/home/mamille2/erebor/'

model_path = os.path.join(base_dirpath, 'fanfiction-project', 'models', 'academia-detroit-friends', 'PV-DBOW_d100n5mc2t20.model')

In [6]:
model = Doc2Vec.load(model_path)

In [7]:
type(model.wv)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [11]:
model.wv.save_word2vec_format(os.path.join(base_dirpath, 'word_embeddings', 'academia-detroit-friends_embeddings.txt'), binary=False)

# Prepare AO3 data in format for paragraph vector training

Preprocess content (tokenized, lowercased, like aclImdb/alldata-id.txt), extact relationship type metadata as tags

In [13]:
import pandas as pd

# Load in data from DiscourseDB
data = pd.read_csv('/usr2/mamille2/fanfiction-project/data/ao3/friends/friends_discoursedb_data.csv')
print(data.columns)
print(len(data))

Index(['annotations', 'content', 'contributionId', 'contributor',
       'discoursePartIds', 'discourseParts', 'parentId', 'startTime', 'title',
       'type', 'preceding_user', 'preceding_text'],
      dtype='object')
74199


In [14]:
import spacy
nlp = spacy.load('en')

def preprocess_text(text):
    return ' '.join([tok.text for tok in nlp.tokenizer(text.lower().replace('\n', ' '))]).strip()

In [15]:
from tqdm import tqdm_notebook as tqdm
data['preprocessed_content'] = list(map(preprocess_text, tqdm(data['content'])))
data['preprocessed_content']

HBox(children=(IntProgress(value=0, max=74199), HTML(value='')))




0        " so , what do we do with it , benny ? " ray a...
1        " well , i 'm sure someone has noticed that it...
2        " like the dinosaur it came from , for one . "...
3        fraser studied the bone carefully for a moment...
4        " you know this ? " ray was , as usual , amaze...
5        " note the prismoid shaft which tapers gradual...
6        " obvious to you , maybe . so what 'd your guy...
7                       " metatarsal , " benny corrected .
8        " okay , metatarsal . what 'd sinclair want wi...
9                                  " how should i know ? "
10       ray sighed , exasperated . " well , i thought ...
11       " do n't be silly , ray . the inuit have somet...
12       " can we focus here , benny ? this is not the ...
13       " i really do n't know what he wanted with it ...
14       " good luck ! from the looks of this room he '...
15       " good news , benny , " ray said as his friend...
16       " very funny , ray . " after two years , frase.

## Save out documents to their own lines (optional)

In [47]:
# Save paragraphs to their own lines in file
with open('/usr2/mamille2/fanfiction-project/data/ao3/friends/friends_paragraphs.txt', 'wb') as f:
    for p in data['preprocessed_content'].tolist():
        f.write(f'{p}\n'.encode('utf8'))
#         f.write('aééé'.encode('utf8').decode('utf8'))
#         f.write('aééé'.encode('utf8'))

## Extract annotations

In [87]:
pd.set_option('display.max_colwidth', -1)

data['annotations']

In [16]:
import re
# from IPython.core.debugger import set_trace

# Extract relationship type column
def extract_annotation_category(category, annotations):
    annotations_dict = {el[0]: ''.join(el[1:]) for el in [b.split(' (') for b in annotations.split(', ')]}
    try:
        vals = re.findall(r'null=(.*?)[;\)]', annotations_dict[category])
    except:
        set_trace()
    return vals

In [17]:
data['relationship_type'] = [extract_annotation_category('category', x) for x in tqdm(data['annotations'].tolist())]
data['relationship_type']

HBox(children=(IntProgress(value=0, max=74199), HTML(value='')))




0        [F/M, M/M]
1        [F/M, M/M]
2        [F/M, M/M]
3        [M/M, F/M]
4        [M/M, F/M]
5        [F/M, M/M]
6        [M/M, F/M]
7        [M/M, F/M]
8        [M/M, F/M]
9        [M/M, F/M]
10       [M/M, F/M]
11       [M/M, F/M]
12       [M/M, F/M]
13       [F/M, M/M]
14       [M/M, F/M]
15       [M/M, F/M]
16       [M/M, F/M]
17       [M/M, F/M]
18       [M/M, F/M]
19       [M/M, F/M]
20       [M/M, F/M]
21       [F/M, M/M]
22       [M/M, F/M]
23       [M/M, F/M]
24       [M/M, F/M]
25       [M/M, F/M]
26       [M/M, F/M]
27       [M/M, F/M]
28       [M/M, F/M]
29       [F/M, M/M]
            ...    
74169         [F/M]
74170         [F/M]
74171         [F/M]
74172         [F/M]
74173         [F/M]
74174         [F/M]
74175         [F/M]
74176         [F/M]
74177         [F/M]
74178         [F/M]
74179         [F/M]
74180         [F/M]
74181         [F/M]
74182         [F/M]
74183         [F/M]
74184         [F/M]
74185         [F/M]
74186         [F/M]
74187         [F/M]


## Save out annotations, preprocessed content

In [18]:
outpath = '/usr2/mamille2/fanfiction-project/data/ao3/friends/friends_discoursedb_data.pkl'
data.to_pickle(outpath)

# Gensim Doc2Vec with AO3 data (documents tagged with relationship type)
From https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [20]:
%%time

import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
from smart_open import smart_open

fpath = '/usr2/mamille2/fanfiction-project/data/ao3/friends/friends_discoursedb_data.pkl'
test_fraction = 0.1

data = pd.read_pickle(fpath)
total_docs = len(data)

alldocs = []
for line, tags in zip(data['preprocessed_content'], data['relationship_type']):
    tokens = gensim.utils.to_unicode(line).split()
    alldocs.append(TaggedDocument(words, tags))

# train_docs = [doc for doc in alldocs if doc.split == 'train']
# test_docs = [doc for doc in alldocs if doc.split == 'test']

# print('%d docs: %d train, %d test' % (len(alldocs), len(train_docs), len(test_docs)))

from random import shuffle
doc_list = alldocs[:]  
shuffle(doc_list)

print(len(doc_list))

74199
CPU times: user 776 ms, sys: 148 ms, total: 924 ms
Wall time: 927 ms


In [21]:
%%time

# Build models

from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

# cores = multiprocessing.cpu_count()
cores = 20
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05'),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores),
]

for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[1]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[2]])

Doc2Vec(dbow,d100,n5,mc2,t20) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t20) vocabulary scanned & state initialized
Doc2Vec(dm/c,d100,n5,w5,mc2,t20) vocabulary scanned & state initialized
CPU times: user 6min 14s, sys: 1.27 s, total: 6min 16s
Wall time: 6min 16s


## Train models

In [None]:
for model in simple_models: 
    print("Training %s" % model)
    %time model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs) # Adjust epochs in model 
    print()

Training Doc2Vec(dbow,d100,n5,mc2,t20)


KeyboardInterrupt: 


Training Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t20)


## Qualitative word vector evaluation

In [12]:
import random
from IPython.display import HTML

# pick a random word with a suitable number of occurences
while True:
    word = random.choice(simple_models[0].wv.index2word)
    if simple_models[0].wv.vocab[word].count > 10:
        break
        
# or uncomment below line, to just pick a word from the relevant domain:
# word = 'difficult'
similars_per_model = [str(model.wv.most_similar(word, topn=20)).replace('), ','),<br>\n') for model in simple_models]
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in simple_models]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.vocab[word].count))
HTML(similar_table)

most similar words for 'smashing' (20 occurences)


"Doc2Vec(dbow,d100,n5,mc2,t20)","Doc2Vec(""alpha=0.05"",dm/m,d100,n5,w10,mc2,t20)","Doc2Vec(dm/c,d100,n5,w5,mc2,t20)"
"[('mutations', 0.39711183309555054), ('stealthily', 0.39297765493392944), ('fondest', 0.38682976365089417), ('surfer', 0.3744651675224304), ('discouraging', 0.35691094398498535), ('dotted', 0.35466301441192627), ('uahan', 0.3428993225097656), ('rty', 0.342846542596817), ('thirds', 0.34089499711990356), ('shielding', 0.3404473662376404), ('authentic', 0.3356667757034302), ('lent', 0.3335157632827759), ('worshipped', 0.3328370749950409), ('plunged', 0.33239153027534485), ('engaging', 0.33150768280029297), ('slinks', 0.3224559724330902), ('inform', 0.32213205099105835), ('preps', 0.32165539264678955), ('acquaintances', 0.3209088146686554), ('claudia', 0.318649023771286)]","[('mutations', 0.39711180329322815), ('stealthily', 0.39297762513160706), ('fondest', 0.38682979345321655), ('surfer', 0.37446513772010803), ('discouraging', 0.3569110035896301), ('dotted', 0.35466307401657104), ('uahan', 0.3428993225097656), ('rty', 0.34284651279449463), ('thirds', 0.34089499711990356), ('shielding', 0.34044742584228516), ('lent', 0.3335157632827759), ('worshipped', 0.3328370749950409), ('plunged', 0.33239153027534485), ('engaging', 0.33150771260261536), ('slinks', 0.3224559724330902), ('inform', 0.32213202118873596), ('acquaintances', 0.3209088444709778), ('coffee', 0.31975361704826355), ('claudia', 0.3186489939689636), ('advancing', 0.3178688883781433)]","[('hugh', 0.40148839354515076), ('mutations', 0.39711180329322815), ('stealthily', 0.39297765493392944), ('fondest', 0.38682976365089417), ('surfer', 0.3744651675224304), ('orders', 0.35907721519470215), ('discouraging', 0.35691094398498535), ('dotted', 0.35466301441192627), ('uahan', 0.3428993225097656), ('rty', 0.342846542596817), ('thirds', 0.34089499711990356), ('shielding', 0.3404473662376404), ('uninhibited', 0.33743056654930115), ('lent', 0.3335157632827759), ('natural', 0.33288058638572693), ('worshipped', 0.3328370749950409), ('plunged', 0.33239153027534485), ('engaging', 0.33150768280029297), ('human', 0.329820841550827), ('slinks', 0.3224559724330902)]"


# Test Gensim Doc2Vec with AO3 data (document = paragraph)
From https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [49]:
%%time

import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

fpath = '/usr2/mamille2/fanfiction-project/data/ao3/friends/friends_paragraphs.txt'
test_fraction = 0.1
total_docs = 74199

# this data object class suffices as a `TaggedDocument` (with `words` and `tags`) 
# plus adds other state helpful for our later evaluation/reporting
TaggedDocument = namedtuple('TaggedDocument', 'words tags split')

alldocs = []
with smart_open(fpath, 'rb', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # tags are labels; 'tags = [tokens[0]]' would also work at extra memory cost
        split = ['train', 'test'][line_no // int(total_docs * (1-test_fraction))]
        alldocs.append(TaggedDocument(words, tags, split))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']

print('%d docs: %d train, %d test' % (len(alldocs), len(train_docs), len(test_docs)))

from random import shuffle
doc_list = alldocs[:]  
shuffle(doc_list)

In [50]:
%%time

# Build models

from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

# cores = multiprocessing.cpu_count()
cores = 20
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05'),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores),
]

for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[1]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[2]])

Doc2Vec(dbow,d100,n5,mc2,t20) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t20) vocabulary scanned & state initialized
Doc2Vec(dm/c,d100,n5,w5,mc2,t20) vocabulary scanned & state initialized
CPU times: user 9.44 s, sys: 664 ms, total: 10.1 s
Wall time: 10.1 s


## Train models

In [51]:
for model in simple_models: 
    print("Training %s" % model)
    %time model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)
    print()

Training Doc2Vec(dbow,d100,n5,mc2,t20)
CPU times: user 3min 16s, sys: 18.4 s, total: 3min 34s
Wall time: 1min 51s

Training Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t20)
CPU times: user 5min 17s, sys: 55.2 s, total: 6min 12s
Wall time: 2min 47s

Training Doc2Vec(dm/c,d100,n5,w5,mc2,t20)
CPU times: user 8min 27s, sys: 25.1 s, total: 8min 52s
Wall time: 2min 13s



In [56]:
import random
from IPython.display import HTML

# pick a random word with a suitable number of occurences
while True:
    word = random.choice(simple_models[0].wv.index2word)
    if simple_models[0].wv.vocab[word].count > 10:
        break
        
# or uncomment below line, to just pick a word from the relevant domain:
# word = 'spoilt'
similars_per_model = [str(model.wv.most_similar(word, topn=20)).replace('), ','),<br>\n') for model in simple_models]
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in simple_models]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.vocab[word].count))
HTML(similar_table)

most similar words for 'murmur' (13 occurences)


"Doc2Vec(dbow,d100,n5,mc2,t20)","Doc2Vec(""alpha=0.05"",dm/m,d100,n5,w10,mc2,t20)","Doc2Vec(dm/c,d100,n5,w5,mc2,t20)"
"[('unsuspecting', 0.39443328976631165), ('cellphone', 0.3796854019165039), ('iffy', 0.372952401638031), ('excess', 0.3608635663986206), ('redford', 0.3578962981700897), ('hints', 0.3546637296676636), ('activated', 0.34775805473327637), ('happenings', 0.3403438925743103), ('scraping', 0.33970412611961365), ('with', 0.3392728269100189), ('dialled', 0.33752650022506714), ('flavor', 0.33679360151290894), ('è', 0.3287610709667206), ('on-', 0.32636886835098267), ('kämpfer', 0.32536137104034424), ('cowardice', 0.3245278596878052), ('naturedly', 0.3223896622657776), ('employers', 0.32194066047668457), ('absolution', 0.31825727224349976), ('valasus', 0.3180891275405884)]","[('whisper', 0.6813105344772339), ('growl', 0.5926134586334229), ('coward', 0.5922876000404358), ('moment', 0.573410153388977), ('burglar', 0.5642147064208984), ('mover', 0.5640881657600403), ('teenager', 0.5636329054832458), ('minute', 0.5632354021072388), ('halt', 0.5546646118164062), ('sob', 0.5538667440414429), ('skleničku', 0.5502578616142273), ('continent', 0.5448827743530273), ('huff', 0.5434749722480774), ('mockery', 0.5430886745452881), ('word', 0.5400181412696838), ('sniffle', 0.5378166437149048), ('click', 0.5377556681632996), ('volver', 0.5347557663917542), ('participant', 0.5343178510665894), ('bit', 0.5334388613700867)]","[('enrol', 0.6278350949287415), ('twig', 0.6177855730056763), ('snort', 0.6093369126319885), ('halt', 0.6067569255828857), ('fight.’', 0.5970606803894043), ('revolt', 0.591293215751648), ('yawn', 0.5861109495162964), ('sob', 0.5846139192581177), ('wedge', 0.5762349963188171), ('swerve', 0.5714218616485596), ('nestle', 0.5631072521209717), ('caress', 0.5629807710647583), ('miscommunication', 0.5629664659500122), ('sooth', 0.5585294365882874), ('rook', 0.5577542781829834), ('puddle', 0.5549665093421936), ('connoisseur', 0.5526505708694458), ('tweak', 0.5496055483818054), ('yuki', 0.547900378704071), ('swat', 0.546383261680603)]"


## Do close documents seem more related than distant ones?

In [60]:
import random
import numpy as np

doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))

TARGET (37520): «alright ... " the blond stares at nico before suddenly inhaling sharply . " well , there you go ... "»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t20):

MOST (74142, 0.7427983283996582): «nodded again . " okay . "»

MEDIAN (32786, 0.407391756772995): «closes the door behind her , and joey shakes his head . thank god rachel did n’t notice anything different about him .»

LEAST (40699, -0.3952307403087616): «_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _»



# Test Gensim Doc2Vec with provided data
From https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [1]:
%%time 

import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
from smart_open import smart_open
import re

dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'
locale.setlocale(locale.LC_ALL, 'C')
all_lines = []

if sys.version > '3':
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]

# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
    return norm_text

if not os.path.isfile('aclImdb/alldata-id.txt'):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            print("Downloading IMDB archive...")
            url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url)
            with smart_open(filename, 'wb') as f:
                f.write(r.content)
        # if error here, try `tar xfz aclImdb_v1.tar.gz` outside notebook, then re-run this cell
        tar = tarfile.open(filename, mode='r')
        tar.extractall()
        tar.close()
    else:
        print("IMDB archive directory already available without download.")

    # Collect & normalize test/train data
    print("Cleaning up dataset...")
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
    for fol in folders:
        temp = u''
        newline = "\n".encode("utf-8")
        output = fol.replace('/', '-') + '.txt'
        # Is there a better pattern to use?
        txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))
        print(" %s: %i files" % (fol, len(txt_files)))
        with smart_open(os.path.join(dirname, output), "wb") as n:
            for i, txt in enumerate(txt_files):
                with smart_open(txt, "rb") as t:
                    one_text = t.read().decode("utf-8")
                    for c in control_chars:
                        one_text = one_text.replace(c, ' ')
                    one_text = normalize_text(one_text)
                    all_lines.append(one_text)
                    n.write(one_text.encode("utf-8"))
                    n.write(newline)

    # Save to disk for instant re-use on any future runs
    with smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:
        for idx, line in enumerate(all_lines):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line.encode("utf-8"))

assert os.path.isfile("aclImdb/alldata-id.txt"), "alldata-id.txt unavailable"
print("Success, alldata-id.txt is available for next steps.")

Success, alldata-id.txt is available for next steps.
CPU times: user 240 ms, sys: 40 ms, total: 280 ms
Wall time: 281 ms


In [2]:
%%time

import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

# this data object class suffices as a `TaggedDocument` (with `words` and `tags`) 
# plus adds other state helpful for our later evaluation/reporting
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []
with smart_open('aclImdb/alldata-id.txt', 'rb', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        split = ['train', 'test', 'extra', 'extra'][line_no//25000]  # 25k train, 25k test, 25k extra
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))

100000 docs: 25000 train-sentiment, 25000 test-sentiment
CPU times: user 8.72 s, sys: 1.27 s, total: 9.99 s
Wall time: 10 s


In [3]:
from random import shuffle
doc_list = alldocs[:]  
shuffle(doc_list)

In [4]:
%%time
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

# cores = multiprocessing.cpu_count()
cores = 20
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05'),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores),
]

for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dbow,d100,n5,mc2,t20) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t20) vocabulary scanned & state initialized
Doc2Vec(dm/c,d100,n5,w5,mc2,t20) vocabulary scanned & state initialized
CPU times: user 40.8 s, sys: 4.29 s, total: 45.1 s
Wall time: 45.1 s


In [5]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[1]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[2]])

## Train models

In [6]:
for model in simple_models: 
    print("Training %s" % model)
    %time model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)
    print()

Training Doc2Vec(dbow,d100,n5,mc2,t20)
CPU times: user 25min 13s, sys: 58 s, total: 26min 11s
Wall time: 6min 20s
Training Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t20)
CPU times: user 53min 28s, sys: 2min 32s, total: 56min 1s
Wall time: 9min 54s
Training Doc2Vec(dm/c,d100,n5,w5,mc2,t20)
CPU times: user 2h 49s, sys: 2min 42s, total: 2h 3min 31s
Wall time: 9min 46s


In [7]:
import random
from IPython.display import HTML

# pick a random word with a suitable number of occurences
# while True:
#     word = random.choice(simple_models[0].wv.index2word)
#     if simple_models[0].wv.vocab[word].count > 10:
#         break
        
# or uncomment below line, to just pick a word from the relevant domain:
word = 'spoilt'
similars_per_model = [str(model.wv.most_similar(word, topn=20)).replace('), ','),<br>\n') for model in simple_models]
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in simple_models]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.vocab[word].count))
HTML(similar_table)

most similar words for 'spoilt' (97 occurences)


"Doc2Vec(dbow,d100,n5,mc2,t20)","Doc2Vec(""alpha=0.05"",dm/m,d100,n5,w10,mc2,t20)","Doc2Vec(dm/c,d100,n5,w5,mc2,t20)"
"[('ballet', 0.4222918152809143), ('65-minute', 0.41812703013420105), ('nit-wit', 0.38984495401382446), ('action-comedy', 0.3879348337650299), ('pimped', 0.37372422218322754), ('matara', 0.3736003637313843), ('brownstone', 0.37183868885040283), (""dick's"", 0.36779266595840454), (""rodger's"", 0.367227703332901), ('verano', 0.3622084856033325), ('consummately', 0.3592488169670105), (""macready's"", 0.3578265309333801), ('*such*', 0.3577822148799896), ('krassin', 0.3576417863368988), ('risks', 0.35670018196105957), ('callings', 0.35649171471595764), ('aic', 0.35543423891067505), ('distracts', 0.3541957139968872), (""freaks'"", 0.35217738151550293), ('ronreaco', 0.35198500752449036)]","[('spoiled', 0.6510179042816162), ('undermined', 0.5426546335220337), ('dominated', 0.5321130752563477), ('ruined', 0.5287238955497742), ('over-shadowed', 0.5230126976966858), ('marred', 0.5212410688400269), ('replaced', 0.5141274333000183), ('followed', 0.4941771924495697), ('siring', 0.49131008982658386), ('rejected', 0.4876922070980072), ('dwarfed', 0.48443150520324707), ('unencumbered', 0.4830629825592041), ('snatched', 0.479458749294281), ('populated', 0.47769853472709656), ('dazzled', 0.4664037823677063), ('inhabited', 0.46631473302841187), ('entranced', 0.4659886360168457), ('surrounded', 0.46457332372665405), ('bolstered', 0.463314026594162), ('saved', 0.46262553334236145)]","[('spoiled', 0.6605638265609741), ('ruined', 0.5010989904403687), ('disturbed', 0.49336788058280945), ('plagued', 0.48810648918151855), ('wooed', 0.48625805974006653), ('dampened', 0.4841140806674957), ('lightened', 0.4838625490665436), ('shaken', 0.47890323400497437), ('troubled', 0.4785526990890503), ('torpedoed', 0.47636711597442627), ('racked', 0.4725381135940552), ('maltreated', 0.4714895784854889), ('wounded', 0.46761342883110046), ('lessened', 0.4648064076900482), ('drugged', 0.46392497420310974), ('stimulated', 0.46230459213256836), ('breakable', 0.46102404594421387), ('saved', 0.4606311619281769), ('cuckolded', 0.4579746723175049), ('willed', 0.455874502658844)]"
