<a href="https://colab.research.google.com/github/nicolaiberk/GermanNPEmbs/blob/main/emb_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
## estimate word embeddings from newspaper data
## code adapted from https://github.com/damian0604/embeddingworkshop/blob/main/04exercise.ipynb
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import string
import re
import os
import pandas as pd
import csv
import sys
import ast
import time


# tqdm allows you to display progress bars in loops
from tqdm import tqdm
from datetime import datetime

import gensim

csv.field_size_limit(sys.maxsize)

# lets get more output
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

drivepath = 'drive/MyDrive/Bild/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# get full set of news articles
if not os.path.isfile('newspapers/_bild_articles.csv') and not os.path.isfile(drivepath+'uniquesentences.txt'):
    os.system('mkdir newspapers')
    os.system('wget -O newspapers/articles.zip https://www.dropbox.com/sh/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa?dl=0')
    os.system('unzip newspapers/articles.zip -d newspapers')
    os.system('rm newspapers/articles.zip')

In [None]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # load all texts
  if 'artcls' not in locals():
    for filename in tqdm(os.listdir('newspapers')):
      if 'artcls' in locals():
        print(f'\nLoaded {artcls.shape[0]} articles')
        artcls = artcls.append(pd.read_csv('newspapers/'+filename))
      else:
        artcls = pd.read_csv('newspapers/'+filename)
    print(f'Loaded {artcls.shape[0]} articles, done.')

    artcls = artcls.reset_index()


  # keep only if string
  stringvar = [str == type(i) for i in artcls.text]
  artcls = artcls[stringvar]
  del(stringvar)

  print(artcls.text[0])

In [None]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # subset
  artcls = artcls.text

In [None]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # cut into sentences
  print('\nCutting into sentences:')
  uniquesentences = set()
  trans = str.maketrans('', '', string.punctuation) # translation scheme for removing punctuation
  for review in tqdm(artcls):
    sentences = sent_tokenize(review) 
    for sentence in sentences:
      sent_trans = sentence.translate(trans).lower()
      if sent_trans not in uniquesentences:
        uniquesentences.add(sent_trans)

  del(artcls)

In [None]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # extract 
  print(f"We now have {len(uniquesentences)} unique sentences.")

In [None]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  with open('uniquesentences.txt', 'w') as fo:
    writer = csv.writer(fo)
    for sentence in tqdm(uniquesentences):
      writer.writerow([sentence])

In [None]:
if os.path.isfile(drivepath+'uniquesentences.txt'):
  with open(drivepath+'uniquesentences.txt') as fi:
    uniquesentences = fi.readlines()
  print(f"We now have {len(uniquesentences)} unique sentences.")

We now have 42302049 unique sentences.


In [None]:
tokenizedsentences = (sentence.split() for sentence in uniquesentences) # iterator for vocab definition

In [None]:
inp = drivepath+"uniquesentences.txt"

In [None]:
print(f"Started setting up the model at {datetime.now()}")
model = gensim.models.Word2Vec(size=300, min_count=100, window = 5, workers = 4) # we want 300 dimensions and not overdo it with the features
model.build_vocab(tokenizedsentences)
print(f"Finished vocabulary definition at {datetime.now()}")

In [None]:
del(uniquesentences)

In [None]:
from gensim.models.word2vec import LineSentence

print(f"Started training at {datetime.now()}")
model.train(LineSentence(inp), total_examples=model.corpus_count,  epochs=5)
print(f"Finished training at {datetime.now()}")

In [None]:
print('Saving model:')
model.save(drivepath+"np_emb")
print('Model finished!')

2021-08-09 20:47:49,576 : INFO : saving Word2Vec object under drive/MyDrive/Bild/np_emb, separately None
2021-08-09 20:47:49,584 : INFO : storing np array 'vectors' to drive/MyDrive/Bild/np_emb.wv.vectors.npy


Saving model:


2021-08-09 20:47:51,228 : INFO : not storing attribute vectors_norm
2021-08-09 20:47:51,230 : INFO : storing np array 'syn1neg' to drive/MyDrive/Bild/np_emb.trainables.syn1neg.npy
2021-08-09 20:47:53,564 : INFO : not storing attribute cum_table
2021-08-09 20:47:54,312 : INFO : saved drive/MyDrive/Bild/np_emb


Model finished!


In [4]:
# Store just the words + their trained embeddings.
model = gensim.models.Word2Vec.load(drivepath+"np_emb")
word_vectors = model.wv
word_vectors.save(drivepath+"word2vec.wordvectors")

2021-08-10 09:51:10,824 : INFO : loading Word2Vec object from drive/MyDrive/Bild/np_emb
2021-08-10 09:51:12,687 : INFO : loading wv recursively from drive/MyDrive/Bild/np_emb.wv.* with mmap=None
2021-08-10 09:51:12,689 : INFO : loading vectors from drive/MyDrive/Bild/np_emb.wv.vectors.npy with mmap=None
2021-08-10 09:51:15,523 : INFO : setting ignored attribute vectors_norm to None
2021-08-10 09:51:15,524 : INFO : loading vocabulary recursively from drive/MyDrive/Bild/np_emb.vocabulary.* with mmap=None
2021-08-10 09:51:15,526 : INFO : loading trainables recursively from drive/MyDrive/Bild/np_emb.trainables.* with mmap=None
2021-08-10 09:51:15,527 : INFO : loading syn1neg from drive/MyDrive/Bild/np_emb.trainables.syn1neg.npy with mmap=None
2021-08-10 09:51:18,484 : INFO : setting ignored attribute cum_table to None
2021-08-10 09:51:18,485 : INFO : loaded drive/MyDrive/Bild/np_emb
2021-08-10 09:51:18,953 : INFO : saving Word2VecKeyedVectors object under drive/MyDrive/Bild/word2vec.wordve

Assess model validity

In [6]:
from gensim.models import KeyedVectors
wv = KeyedVectors.load(drivepath+"word2vec.wordvectors", mmap='r')

2021-08-10 09:51:40,140 : INFO : loading Word2VecKeyedVectors object from drive/MyDrive/Bild/word2vec.wordvectors
2021-08-10 09:51:40,641 : INFO : loading vectors from drive/MyDrive/Bild/word2vec.wordvectors.vectors.npy with mmap=r
2021-08-10 09:51:40,662 : INFO : setting ignored attribute vectors_norm to None
2021-08-10 09:51:40,663 : INFO : loaded drive/MyDrive/Bild/word2vec.wordvectors


In [10]:
word_vectors.most_similar('flüchtling', topn=10)  # get other similar words

[('kriegsflüchtling', 0.7599212527275085),
 ('migrant', 0.7159389853477478),
 ('asylsuchender', 0.6973713040351868),
 ('afghane', 0.669090747833252),
 ('syrer', 0.6585391759872437),
 ('asylbewerber', 0.6557232141494751),
 ('häftling', 0.6416419744491577),
 ('flüchtlingskind', 0.6333128213882446),
 ('kurde', 0.6098465919494629),
 ('eritreer', 0.6053134202957153)]

In [11]:
word_vectors.most_similar('immigration', topn=10)

[('einwanderung', 0.8021354079246521),
 ('migration', 0.7246120572090149),
 ('zuwanderung', 0.670327365398407),
 ('einwanderung“', 0.6148055791854858),
 ('migration“', 0.6068947315216064),
 ('arbeitsmigration', 0.5892990827560425),
 ('masseneinwanderung', 0.5891662836074829),
 ('sekundärmigration', 0.5775723457336426),
 ('armutsmigration', 0.5755099058151245),
 ('immigranten', 0.5382981896400452)]

In [13]:
word_vectors.most_similar(positive=["frau","könig"],negative=["mann"])

[('königin', 0.6508051156997681),
 ('gemahlin', 0.6460778713226318),
 ('prinzessin', 0.6355438232421875),
 ('gattin', 0.6281205415725708),
 ('kaiserin', 0.5999823808670044),
 ('mätresse', 0.5790205597877502),
 ('fürstin', 0.5617177486419678),
 ('hofdame', 0.5549862384796143),
 ('kronprinzessin', 0.5427185297012329),
 ('ehefrau', 0.5426924228668213)]

In [17]:
word_vectors.most_similar(positive=["sie","arzt"],negative=["er"]) # surprisingly unbiased on gender

[('gynäkologen', 0.6619372367858887),
 ('frauenarzt', 0.6503668427467346),
 ('hausarzt', 0.647055983543396),
 ('kinderarzt', 0.6321763396263123),
 ('urologen', 0.6317317485809326),
 ('ärzte', 0.6316326856613159),
 ('orthopäden', 0.629375696182251),
 ('mediziner', 0.6273952722549438),
 ('therapeuten', 0.6148329973220825),
 ('kardiologen', 0.6140260696411133)]

In [18]:
word_vectors.most_similar(positive=["frau","arzt"],negative=["mann"]) # surprisingly unbiased on gender

[('ärztin', 0.7049057483673096),
 ('therapeutin', 0.6990090608596802),
 ('gynäkologin', 0.6784191727638245),
 ('patientin', 0.6637605428695679),
 ('hebamme', 0.6561906933784485),
 ('frauenärztin', 0.6545000672340393),
 ('kinderärztin', 0.6402939558029175),
 ('zahnärztin', 0.6391436457633972),
 ('hausärztin', 0.634697675704956),
 ('pflegerin', 0.63043212890625)]

In [16]:
word_vectors.most_similar(positive=["marokkaner","polizist"],negative=["deutscher"])

[('mann', 0.6420572996139526),
 ('messerstecher', 0.6257760524749756),
 ('afghane', 0.6227939128875732),
 ('angeklagte', 0.6134068965911865),
 ('algerier', 0.611687421798706),
 ('wachmann', 0.6098835468292236),
 ('sicherheitsmann', 0.6046900749206543),
 ('tunesier', 0.5988072156906128),
 ('taxifahrer', 0.5979149341583252),
 ('eritreer', 0.5968226194381714)]