In [1]:
from gensim.models import Word2Vec, KeyedVectors
import os
# Root directory that contains both the fanfiction corpus
# ('fanfiction-project/data/ao3/...') and the 'word_embeddings' folder.
# Set the FANFIC_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used.
data_dirpath = os.environ.get('FANFIC_DATA_DIR', '/usr0/home/mamille2/erebor')

# Train word2vec on fanfiction data, initialized with Google News embeddings

## Load fanfiction data

In [2]:
# Fandoms whose sentence files are loaded below.
fandoms = [
    'friends',
    'detroit',
]

# Maps fandom name -> list of lines (one sentence string per line) read from
# that fandom's ao3_<fandom>_sentences.txt file.
fanfic_sentences = {}

for fandom in fandoms:
    print(fandom)
    sentences_fpath = os.path.join(
        data_dirpath, 'fanfiction-project/data/ao3', fandom, f'ao3_{fandom}_sentences.txt'
    )
    # Pass the encoding explicitly so the result does not depend on the
    # machine's locale default.
    # NOTE(review): assumes the sentence files are UTF-8 — confirm.
    with open(sentences_fpath, encoding='utf-8') as file_obj:
        fanfic_sentences[fandom] = file_obj.read().splitlines()
    # Number of sentences loaded for this fandom.
    print(len(fanfic_sentences[fandom]))

friends
180838
detroit
3577705


## Create model initialized with Google News 300-d embeddings

In [13]:
# Whitespace-tokenise the Detroit corpus: each sentence string becomes a list
# of tokens. (Index fanfic_sentences with 'friends' instead to build the
# Friends model.)
sentences = list(map(str.split, fanfic_sentences['detroit']))

# The model is created without a corpus (300-dimensional, min_count=5); its
# vocabulary is then built explicitly from the tokenised sentences.
model = Word2Vec(min_count=5, size=300)
model.build_vocab(sentences)

# Vocabulary size after the initial scan.
len(model.wv.vocab)

70559

In [14]:
# Path to the pretrained Google News vectors (loaded below as a binary word2vec file).
pretrained_fpath = os.path.join(data_dirpath, 'word_embeddings', 'GoogleNews-vectors-negative300.bin')
# Earlier attempt, kept for reference: replace model.wv wholesale with the pretrained vectors.
# model.wv = KeyedVectors.load_word2vec_format(os.path.join(data_dirpath, 'word_embeddings', 'GoogleNews-vectors-negative300.bin'), binary=True)
pretrained_wv = KeyedVectors.load_word2vec_format(pretrained_fpath, binary=True)

# Try to extend the model vocabulary with every pretrained word by passing them
# all as a single pseudo-sentence. In the recorded run the printed vocabulary
# size is unchanged (70559 before and after), i.e. no words were added.
# NOTE(review): presumably each word occurs only once in that pseudo-sentence and
# so falls below the min_count=5 the model was created with — confirm against
# the gensim documentation.
model.build_vocab([list(pretrained_wv.vocab.keys())], update=True) # should add words, though doesn't seem to
print(len(model.wv.vocab))

# Pull vectors from the pretrained file into the model.
# NOTE(review): presumably only words already in the model's vocabulary are
# affected, and lockf=1.0 leaves the imported vectors trainable (0.0 would
# freeze them) — confirm for the installed gensim version.
model.intersect_word2vec_format(pretrained_fpath, lockf=1.0, binary=True)

70559


In [15]:
# Re-tokenise the corpus used for this run ('detroit'; switch the key to
# 'friends' for the Friends model) and train for five epochs over it.
sentences = [line.split() for line in fanfic_sentences['detroit']]
model.train(sentences, epochs=5, total_examples=len(sentences))

(183449919, 265134090)

In [16]:
# Save the trained model under word_embeddings/.
# For the Friends run the target file was 'friends_GoogleNews_300d.model'.
model_fpath = os.path.join(data_dirpath, 'word_embeddings', 'detroit_GoogleNews_300d.model')
model.save(model_fpath)

In [12]:
# Spot-check: words most similar to 'woman' in whichever model is currently bound to `model`.
model.wv.most_similar('woman')

[('man', 0.6227999925613403),
 ('girl', 0.5778118371963501),
 ('person', 0.5152010917663574),
 ('guy', 0.5010389089584351),
 ('boy', 0.45616209506988525),
 ('brunette', 0.345905601978302),
 ('lady', 0.33502817153930664),
 ('creature', 0.3311230540275574),
 ('goddess', 0.33085495233535767),
 ('chick', 0.3299996256828308)]

In [15]:
# Spot-check for 'gay'. The identical query in the next cell has a different
# recorded output, so the two were evidently run against different model states.
# NOTE(review): which model/run produced this output is not recorded — label it.
model.wv.most_similar('gay')

[('bisexual', 0.379432737827301),
 ('funny', 0.35798341035842896),
 ('bad', 0.3567838668823242),
 ('jealous', 0.33581438660621643),
 ('crazy', 0.3278224468231201),
 ('good', 0.3259364068508148),
 ('embarrassed', 0.32154279947280884),
 ('weird', 0.31626904010772705),
 ('sorry', 0.3158738613128662),
 ('lucky', 0.29834669828414917)]

In [19]:
# Same 'gay' query as the previous cell; the recorded neighbours and scores
# differ, so this output evidently comes from a different model state.
# NOTE(review): which model/run produced this output is not recorded — label it.
model.wv.most_similar('gay')

[('trans', 0.6030031442642212),
 ('bisexual', 0.5694502592086792),
 ('sexy', 0.5545810461044312),
 ('kinky', 0.5317354798316956),
 ('badass', 0.5257949233055115),
 ('awesome', 0.521049439907074),
 ('gross', 0.5168409943580627),
 ('homophobic', 0.5141972303390503),
 ('lesbian', 0.5126327276229858),
 ('creepy', 0.5085291862487793)]

In [18]:
# Spot-check for 'heterosexual'. The identical query in the next cell has a
# different recorded output, so the two were evidently run against different
# model states.
# NOTE(review): which model/run produced this output is not recorded — label it.
model.wv.most_similar('heterosexual')

[('saddest', 0.2593832015991211),
 ('popular', 0.24050481617450714),
 ('undressed', 0.23664337396621704),
 ('cutest', 0.23346945643424988),
 ('dumbstruck', 0.23208709061145782),
 ('sexiest', 0.23113837838172913),
 ('protective', 0.2294725924730301),
 ('valid', 0.22827377915382385),
 ('ridiculous', 0.22641275823116302),
 ('inappropriate', 0.2237069308757782)]

In [20]:
# Same 'heterosexual' query as the previous cell; the recorded neighbours and
# scores differ, so this output evidently comes from a different model state.
# NOTE(review): which model/run produced this output is not recorded — label it.
model.wv.most_similar('heterosexual')

[('homosexual', 0.6994242668151855),
 ('lesbians', 0.6558328866958618),
 ('monogamous', 0.6392804384231567),
 ('bisexual', 0.6358998417854309),
 ('hetero', 0.6213570833206177),
 ('polyamorous', 0.6048110723495483),
 ('cohabiting', 0.5632133483886719),
 ('unmarried', 0.5617387294769287),
 ('monogamy', 0.5519148111343384),
 ('lesbian', 0.5502726435661316)]

In [26]:
# Spot-check: words most similar to 'wrong' in whichever model is currently bound to `model`.
model.wv.most_similar('wrong')

[('different', 0.33970576524734497),
 ('else', 0.3143661916255951),
 ('okay', 0.3065173923969269),
 ('weird', 0.30335676670074463),
 ('crazy', 0.2892380356788635),
 ('fine', 0.286884605884552),
 ('hell', 0.2854064106941223),
 ('happening', 0.2843957245349884),
 ('upset', 0.2835720479488373),
 ('mean', 0.2709137201309204)]

## Save embeddings in txt format

In [18]:
# Export the word vectors of the current `model` in word2vec text format
# (binary=False) as the Detroit embeddings file.
detroit_txt_fpath = os.path.join(data_dirpath, 'word_embeddings', 'detroit_GoogleNews_300d.txt')
model.wv.save_word2vec_format(detroit_txt_fpath, binary=False)

In [23]:
# Rebind `model` to the previously saved Friends model so it can be exported.
# After this cell runs, any cell that uses `model` operates on the Friends
# model rather than the one trained earlier in the notebook.
model = Word2Vec.load(os.path.join(data_dirpath, 'word_embeddings', 'friends_GoogleNews_300d.model'))

In [24]:
# Export the word vectors of the current `model` in word2vec text format
# (binary=False) as the Friends embeddings file.
friends_txt_fpath = os.path.join(data_dirpath, 'word_embeddings', 'friends_GoogleNews_300d.txt')
model.wv.save_word2vec_format(friends_txt_fpath, binary=False)