In [None]:
import pandas as pd
import glob
import logging
import nltk
import re
import gensim
import pickle
import numpy as np
import csv
import itertools
import os

In [None]:
# Emit INFO-level log records with timestamp, level and message
# (gensim reports its progress through the logging module).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from nltk.tokenize import sent_tokenize
#tokenizer = nltk.data.load('tokenizers/punkt/dutch.pickle')
from nltk.tokenize.treebank import TreebankWordTokenizer
# Word-level tokenizer instance shared by the rest of the notebook.
tokenizer = TreebankWordTokenizer()

In [None]:
# Input locations, relative to the notebook's working directory.
# NOTE(review): `path` presumably holds the raw newspaper files — confirm.
path = "../data/newspapers/test"
# Folder whose files are globbed and read line by line as sentences.
path2 = "../data/sentences/test"

In [None]:
# glob returns matches in arbitrary order; sort them so that the processing
# order (and therefore which file seeds the vocabulary) is reproducible.
allFiles = sorted(glob.glob(path2 + "/*"))

In [None]:
# Train online: extend the vocabulary and continue training one file at a time.

model = gensim.models.Word2Vec(min_count=50, size = 200, iter=10, window = 10, workers = 5, sg=1)
for counter, file in enumerate(allFiles):
    print(counter)
    sentences = gensim.models.word2vec.LineSentence(file)
    # Detect frequent bigrams in this file and rewrite the sentences with them merged.
    bigram_transformer = gensim.models.Phrases(sentences)
    bigram = gensim.models.phrases.Phraser(bigram_transformer)
    corpus = list(bigram[sentences])
    # The first file creates the vocabulary; every later file extends it.
    model.build_vocab(corpus, update=(counter > 0))
    # Train on this file's corpus right away. Training once after the loop
    # would only ever see the corpus of the last file.
    model.train(corpus , total_examples=model.corpus_count, epochs=model.iter)

In [None]:
def getSentencesForYear(year):
    """Collect the sentences of every file that belongs to one year.

    A file from the module-level ``allFiles`` list belongs to ``year`` when
    its base name starts with ``str(year)``.

    Parameters
    ----------
    year : int or str
        Year prefix to look for in the file names.

    Returns
    -------
    list
        The sentences read from all matching files, in ``allFiles`` order.
        Empty when no file matches.
    """
    prefix = str(year)
    corpus = []
    for file_ in allFiles:
        if os.path.basename(file_).startswith(prefix):
            # Read the file currently being inspected (not a leftover global)
            # and accumulate, so several files for the same year all count.
            corpus.extend(gensim.models.word2vec.LineSentence(file_))
    return corpus

def getSentencesInRange(startY, endY):
    """Return the sentences of every year in the half-open range [startY, endY).

    The per-year results of ``getSentencesForYear`` are concatenated into one
    flat list, in ascending year order.
    """
    collected = []
    for year in range(startY, endY):
        collected.extend(getSentencesForYear(year))
    return collected

def train_models(y0=1950, yN=1956, yearsInModel=1, stepYears=1, modelFolder='../models'):
    """Train and save one skip-gram Word2Vec model per time slice.

    Slices start at ``y0`` and advance by ``stepYears``; each covers the years
    ``[start, start + yearsInModel)``. The last slice ends at ``yN``.

    Parameters
    ----------
    y0 : int
        First year of the first slice.
    yN : int
        Exclusive upper bound: no slice extends beyond this year.
    yearsInModel : int
        Number of consecutive years pooled into one model.
    stepYears : int
        Distance in years between the starts of consecutive slices.
    modelFolder : str
        Directory that receives the ``<start>_<end>.w2v`` files
        (word2vec binary format). Created if it does not exist.
    """
    if not os.path.isdir(modelFolder):
        os.makedirs(modelFolder)

    for year in range(y0, yN - yearsInModel + 1, stepYears):
        startY = year
        endY = year + yearsInModel
        modelName = modelFolder + '/%d_%d.w2v' % (startY, endY)
        print('Building Model: ', modelName)

        sentences = getSentencesInRange(startY, endY)
        # Detect frequent bigrams and rewrite the sentences with them merged.
        bigram_transformer = gensim.models.Phrases(sentences)
        bigram = gensim.models.phrases.Phraser(bigram_transformer)
        corpus = list(bigram[sentences])

        # Start from a fresh model for every slice: init_sims(replace=True)
        # below discards the raw vectors, after which gensim documents the
        # model as no longer trainable, so it must not be reused.
        model = gensim.models.Word2Vec(min_count=50, size=200, iter=10, window=10, workers=5, sg=1)
        model.build_vocab(corpus)
        model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
        print('....saving')
        # Keep only the L2-normalised vectors to save memory before export.
        model.init_sims(replace=True)
        model.wv.save_word2vec_format(modelName, binary=True)

    
    

In [None]:
# Train and save one model per time slice (slice settings live in train_models).
train_models()

In [None]:
# The file was written with save_word2vec_format(binary=True), so it has to be
# read back with KeyedVectors.load_word2vec_format; Word2Vec.load only reads
# models stored through .save().
model = gensim.models.KeyedVectors.load_word2vec_format('../models/1950_1951.w2v', binary=True)

In [None]:
# Load the vectors stored in '../models/1950_1951.w2v' (word2vec binary format).
model = gensim.models.KeyedVectors.load_word2vec_format('../models/1950_1951.w2v', binary=True)


In [None]:
# CBOW variant (sg=0) trained on whatever `corpus` currently holds.
# NOTE(review): `corpus` is not defined in this cell; it comes from an earlier cell.
model = gensim.models.Word2Vec(
    min_count=50,
    size=200,
    iter=5,
    alpha=0.025,
    window=10,
    workers=5,
    sg=0,
)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)

In [None]:
# Show the vocabulary size and a short alphabetical sample instead of
# dumping the entire vocabulary dict into the cell output.
len(model.wv.vocab), sorted(model.wv.vocab)[:20]

In [None]:
# Nearest neighbours of the token "verenigde_staten".
model.most_similar("verenigde_staten")

In [None]:
# Similarity score between the two tokens.
model.similarity('amerika','verenigde_staten')

In [None]:
# The 20 nearest neighbours of 'amerika'.
model.most_similar('amerika', topn=20)

In [None]:
# Ask which of the candidate tokens is most similar to 'amerika'.
candidates = ['verenigde_staten','china','japan']
model.most_similar_to_given('amerika', candidates)

In [None]:
# Print each candidate next to its similarity with 'amerika'.
for c in candidates:
    print(c, model.similarity('amerika',c))

In [None]:
# Words that are closer to 'koud' than 'droog' is.
model.words_closer_than('koud','droog')

In [None]:
import numpy as np
from sklearn.decomposition import PCA

In [None]:
# Two lists of tokens to look up in the model.
# NOTE(review): capitals[i] appears intended to pair with countries[i] — confirm.
countries = [
    "china",
    "rusland",
    "frankrijk",
    "duitsland",
]
capitals = [
    "peking",
    "moskou",
    "parijs",
    "berlijn",
]
             

In [None]:
# One embedding vector per location: all countries first, then all capitals.
X = [model[loc] for loc in countries + capitals]

In [None]:
# Project the vectors in X onto their first two principal components.
pca = PCA(n_components=2)
xy_coords = pca.fit_transform(X)
# Split the (x, y) rows into one tuple per axis for plotting.
loc_x, loc_y = zip(*xy_coords)

In [None]:
# Inspect the x-coordinates produced by the PCA projection.
loc_x

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(loc_x, loc_y)

for _, location in enumerate(countries+capitals):
    ax.annotate(location, (loc_x[_]+.05, loc_y[_]-.05))

plt.title("Countries and their Capitals")
plt.show()

In [None]:
# Tokens whose similarity to the pronoun vectors is compared in a later cell.
descriptions = [
    'sieraden',
    'natuurkunde',
    'aardig',
    'genie',
    'leider',
    'karakter',
    'zaken',
    'rijk',
    'wapen',
    'gek',
]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Each vector is reshaped to a single row before being passed to cosine_similarity.
she = model['zij'].reshape(1,-1)
he = model['hij'].reshape(1,-1)

# For every description word, print its cosine similarity to each pronoun vector.
for word in descriptions:
    our_vector = model[word].reshape(1,-1)
    for suffix, pronoun_vector in (("_she", she), ("_he", he)):
        print(word+suffix, cosine_similarity(our_vector, pronoun_vector))

In [None]:
# Analogy query: tokens similar to 'vrouw' and 'professor' but dissimilar to 'man'.
model.most_similar_cosmul(positive=['vrouw','professor'], negative=['man'])

In [None]:
# Analogy query: tokens similar to 'ali' and 'dokter' but dissimilar to 'mark'.
model.most_similar_cosmul(positive=['ali','dokter'], negative=['mark'])

In [None]:
# Analogy query: tokens similar to 'zij' and 'dief' but dissimilar to 'hij'.
model.most_similar_cosmul(positive=['zij','dief'], negative=['hij'])

In [None]:
import json
import random
import numpy as np

import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
from debiaswe.data import load_professions
import numpy.linalg as la

In [6]:
# %load debias.py
from __future__ import print_function, division
import we
import json
import numpy as np
import argparse
import sys
if sys.version_info[0] < 3:
    import io
    open = io.open
"""
Hard-debias embedding

Man is to Computer Programmer as Woman is to Homemaker? Debiasing Word Embeddings
Tolga Bolukbasi, Kai-Wei Chang, James Zou, Venkatesh Saligrama, and Adam Kalai
2016
"""


def debias(E, gender_specific_words, definitional, equalize):
    """Hard-debias the embedding ``E`` in place.

    Steps, as performed below:

    1. Take the first PCA component returned by ``we.doPCA(definitional, E)``
       as the gender direction.
    2. For every word not listed in ``gender_specific_words``, pass its vector
       through ``we.drop`` with that direction, then re-normalise ``E``.
    3. For every pair in ``equalize`` (tried in lower, title and upper case)
       whose two words are both in ``E.index``, replace the two vectors by
       ``y + z * direction`` and ``y - z * direction``, where ``y`` is their
       mean after ``we.drop``, then re-normalise ``E``.

    Parameters
    ----------
    E : embedding object exposing ``words``, ``vecs``, ``index``, ``v(word)``
        and ``normalize()``; modified in place.
    gender_specific_words : iterable of str
        Words that are left untouched in step 2.
    definitional : passed through unchanged to ``we.doPCA``.
    equalize : iterable of (str, str)
        Word pairs to equalise in step 3.

    Returns
    -------
    None
    """
    gender_direction = we.doPCA(definitional, E).components_[0]
    specific_set = set(gender_specific_words)
    for i, w in enumerate(E.words):
        if w not in specific_set:
            E.vecs[i] = we.drop(E.vecs[i], gender_direction)
    E.normalize()
    # Try each pair in three casings; the set removes duplicate variants.
    candidates = {x for e1, e2 in equalize for x in [(e1.lower(), e2.lower()),
                                                     (e1.title(), e2.title()),
                                                     (e1.upper(), e2.upper())]}
    print(candidates)
    for (a, b) in candidates:
        if (a in E.index and b in E.index):
            y = we.drop((E.v(a) + E.v(b)) / 2, gender_direction)
            # Clamp at zero: rounding can push ||y|| marginally above 1, and
            # the square root of a negative number would fill both vectors
            # with NaN.
            z = np.sqrt(max(0.0, 1 - np.linalg.norm(y)**2))
            # Keep each word on the side of the direction it was on before.
            if (E.v(a) - E.v(b)).dot(gender_direction) < 0:
                z = -z
            E.vecs[E.index[a]] = z * gender_direction + y
            E.vecs[E.index[b]] = -z * gender_direction + y
    E.normalize()

