<a href="https://colab.research.google.com/github/nicolaiberk/GermanNPEmbs/blob/main/emb_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime so later cells can read and
# write files below /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## estimate word embeddings from newspaper data
## code adapted from https://github.com/damian0604/embeddingworkshop/blob/main/04exercise.ipynb
import nltk
# fetch the 'punkt' tokenizer data (needed by sent_tokenize below)
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import string
import re
import os
import pandas as pd
import csv
import sys
import ast
import time


# tqdm allows you to display progress bars in loops
from tqdm import tqdm
from datetime import datetime

import gensim

# raise the csv module's per-field size cap to sys.maxsize
csv.field_size_limit(sys.maxsize)

# lets get more output
# (INFO-level logging with timestamps; gensim reports its progress through logging)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Folder that holds the sentence file and the saved model.
# NOTE(review): this is a relative path while Drive is mounted at /content/drive,
# so it presumably relies on the working directory being /content -- confirm.
drivepath = 'drive/MyDrive/Bild/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Fetch and unpack the raw article archive, but only when neither the extracted
# articles nor the already-prepared sentence file on Drive are available.
if not (os.path.isfile('newspapers/_bild_articles.csv') or os.path.isfile(drivepath+'uniquesentences.txt')):
    # The shell steps run in order; their exit codes are not inspected.
    for shell_cmd in (
        'mkdir newspapers',
        'wget -O newspapers/articles.zip https://www.dropbox.com/sh/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa?dl=0',
        'unzip newspapers/articles.zip -d newspapers',
        'rm newspapers/articles.zip',
    ):
        os.system(shell_cmd)

In [4]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # load all texts
  # Reload unless an article DataFrame is already in memory. Checking the type
  # (not just the name) matters: a later cell rebinds `artcls` to the text
  # column, so re-running this cell must not reuse that Series.
  if not isinstance(globals().get('artcls'), pd.DataFrame):
    frames = []
    n_loaded = 0
    for filename in tqdm(os.listdir('newspapers')):
      frame = pd.read_csv(os.path.join('newspapers', filename))
      frames.append(frame)
      n_loaded += frame.shape[0]
      print(f'\nLoaded {n_loaded} articles')
    if not frames:
      raise FileNotFoundError("no article files found in 'newspapers'")
    # One concat instead of DataFrame.append per file: append copied the whole
    # frame on every call and no longer exists in pandas >= 2.0.
    artcls = pd.concat(frames, ignore_index=True)
    del frames
    print(f'Loaded {artcls.shape[0]} articles, done.')


  # keep only if string
  # (drops rows whose text is missing/NaN or otherwise not a str); the index is
  # renumbered afterwards so positions and labels agree again
  artcls = artcls[artcls.text.map(lambda text: isinstance(text, str))].reset_index(drop=True)

  # show one example; positional access so it works whichever rows were dropped
  if len(artcls):
    print(artcls.text.iloc[0])

In [5]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # From here on only the article bodies are needed: rebind `artcls` to the
  # 'text' column.
  artcls = artcls['text']

In [6]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # cut into sentences
  print('\nCutting into sentences:')
  uniquesentences = set()
  trans = str.maketrans('', '', string.punctuation) # translation scheme for removing punctuation
  for review in tqdm(artcls):
    # The articles are German, so use the German Punkt model; the default
    # ('english') does not know German abbreviations and splits on them.
    for sentence in sent_tokenize(review, language='german'):
      # strip ASCII punctuation and lowercase; the set takes care of duplicates
      uniquesentences.add(sentence.translate(trans).lower())

  # the articles are no longer needed once the sentences are collected
  del(artcls)

In [7]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # Report how many distinct sentences were collected.
  n_unique = len(uniquesentences)
  print(f"We now have {n_unique} unique sentences.")

In [8]:
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # Write the sentences to the Drive location that the guards, the loading cell
  # and the training cell all use. The file previously went to the working
  # directory, so none of them could find it.
  target = drivepath+'uniquesentences.txt'
  # Write to a temporary name first and rename at the end, so an interrupted
  # run cannot leave a truncated file that makes every guard skip regeneration.
  partial = target+'.tmp'
  with open(partial, 'w', encoding='utf-8') as fo:
    for sentence in tqdm(uniquesentences):
      # One sentence per line, plain text: the file is consumed line by line and
      # split on whitespace, so inner newlines/tabs are collapsed to single
      # spaces (csv quoting would inject '"' tokens and split such sentences).
      line = ' '.join(sentence.split())
      if line:  # skip sentences that are empty after punctuation removal
        fo.write(line+'\n')
  os.replace(partial, target)

In [9]:
# When the prepared sentence file exists on Drive, read it fully into memory,
# one list entry per line (line endings are kept).
sentence_file = drivepath+'uniquesentences.txt'
if os.path.isfile(sentence_file):
  with open(sentence_file) as fi:
    uniquesentences = list(fi)
  print(f"We now have {len(uniquesentences)} unique sentences.")

We now have 42302049 unique sentences.


In [10]:
# Lazy, single-pass iterator of whitespace-split token lists; it is used once
# for the vocabulary definition.
tokenizedsentences = map(str.split, uniquesentences)

In [11]:
# Path of the one-sentence-per-line corpus file that training streams from.
inp = f"{drivepath}uniquesentences.txt"

In [12]:
print(f"Started setting up the model at {datetime.now()}")
# gensim 4.0 renamed Word2Vec's `size` argument to `vector_size`; choose the
# keyword that the installed version understands so the cell runs on both.
dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
# we want 300 dimensions and not overdo it with the features:
# words seen fewer than 100 times are dropped from the vocabulary
model = gensim.models.Word2Vec(min_count=100, window=5, workers=4, **{dim_kwarg: 300})
# a single pass over the token lists is enough to count words
model.build_vocab(tokenizedsentences)
print(f"Finished vocabulary definition at {datetime.now()}")

2021-08-09 18:35:59,982 : INFO : collecting all words and their counts
2021-08-09 18:35:59,984 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-08-09 18:36:00,074 : INFO : PROGRESS: at sentence #10000, processed 165872 words, keeping 34843 word types


Started setting up the model at 2021-08-09 18:35:59.980693


2021-08-09 18:36:00,173 : INFO : PROGRESS: at sentence #20000, processed 331065 words, keeping 56563 word types
2021-08-09 18:36:00,271 : INFO : PROGRESS: at sentence #30000, processed 499095 words, keeping 74820 word types
2021-08-09 18:36:00,366 : INFO : PROGRESS: at sentence #40000, processed 666377 words, keeping 90934 word types
2021-08-09 18:36:00,461 : INFO : PROGRESS: at sentence #50000, processed 832256 words, keeping 105399 word types
2021-08-09 18:36:00,558 : INFO : PROGRESS: at sentence #60000, processed 999772 words, keeping 118979 word types
2021-08-09 18:36:00,642 : INFO : PROGRESS: at sentence #70000, processed 1167804 words, keeping 131659 word types
2021-08-09 18:36:00,737 : INFO : PROGRESS: at sentence #80000, processed 1335111 words, keeping 143689 word types
2021-08-09 18:36:00,828 : INFO : PROGRESS: at sentence #90000, processed 1501279 words, keeping 155108 word types
2021-08-09 18:36:00,919 : INFO : PROGRESS: at sentence #100000, processed 1667251 words, keeping

Finished vocabulary definition at 2021-08-09 18:44:23.010471


In [13]:
# Drop the in-memory sentences; training reads the corpus from the file instead.
del uniquesentences

In [14]:
from gensim.models.word2vec import LineSentence

# Train for five epochs on the sentence file, using the sentence count recorded
# during vocabulary building as the expected number of examples.
print(f"Started training at {datetime.now()}")
corpus_stream = LineSentence(inp)
n_examples = model.corpus_count
model.train(corpus_stream, total_examples=n_examples, epochs=5)
print(f"Finished training at {datetime.now()}")

2021-08-09 18:44:27,095 : INFO : training model with 4 workers on 175107 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5


Started training at 2021-08-09 18:44:27.094828


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
2021-08-09 19:23:10,414 : INFO : EPOCH 2 - PROGRESS: at 55.69% examples, 374837 words/s, in_qsize 6, out_qsize 1
2021-08-09 19:23:11,434 : INFO : EPOCH 2 - PROGRESS: at 55.77% examples, 374885 words/s, in_qsize 7, out_qsize 0
2021-08-09 19:23:12,435 : INFO : EPOCH 2 - PROGRESS: at 55.84% examples, 374913 words/s, in_qsize 7, out_qsize 0
2021-08-09 19:23:13,479 : INFO : EPOCH 2 - PROGRESS: at 55.92% examples, 374940 words/s, in_qsize 7, out_qsize 1
2021-08-09 19:23:14,497 : INFO : EPOCH 2 - PROGRESS: at 55.99% examples, 374990 words/s, in_qsize 6, out_qsize 1
2021-08-09 19:23:15,509 : INFO : EPOCH 2 - PROGRESS: at 56.07% examples, 375050 words/s, in_qsize 7, out_qsize 0
2021-08-09 19:23:16,512 : INFO : EPOCH 2 - PROGRESS: at 56.15% examples, 375105 words/s, in_qsize 8, out_qsize 0
2021-08-09 19:23:17,539 : INFO : EPOCH 2 - PROGRESS: at 56.22% examples, 375140 words/s, in_qsize 7, out_qsize 0
2021-08-09 19:2

Finished training at 2021-08-09 20:47:49.419290


In [15]:
# Persist the full trained model next to the corpus on Drive.
model_path = drivepath+"np_emb"
print('Saving model:')
model.save(model_path)
print('Model finished!')

2021-08-09 20:47:49,576 : INFO : saving Word2Vec object under drive/MyDrive/Bild/np_emb, separately None
2021-08-09 20:47:49,584 : INFO : storing np array 'vectors' to drive/MyDrive/Bild/np_emb.wv.vectors.npy


Saving model:


2021-08-09 20:47:51,228 : INFO : not storing attribute vectors_norm
2021-08-09 20:47:51,230 : INFO : storing np array 'syn1neg' to drive/MyDrive/Bild/np_emb.trainables.syn1neg.npy
2021-08-09 20:47:53,564 : INFO : not storing attribute cum_table
2021-08-09 20:47:54,312 : INFO : saved drive/MyDrive/Bild/np_emb


Model finished!


In [16]:
# Store just the words + their trained embeddings.
# NOTE(review): unlike the full model, this is saved under a relative path in
# the working directory rather than under `drivepath` -- confirm that is intended.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

2021-08-09 20:47:54,325 : INFO : saving Word2VecKeyedVectors object under word2vec.wordvectors, separately None
2021-08-09 20:47:54,328 : INFO : storing np array 'vectors' to word2vec.wordvectors.vectors.npy
2021-08-09 20:47:54,796 : INFO : not storing attribute vectors_norm
2021-08-09 20:47:55,282 : INFO : saved word2vec.wordvectors


Assess model validity

In [17]:
# Reload the vectors saved above from disk, passing mmap='r' to the loader.
# NOTE(review): `wv` is not referenced by the query that follows, which uses the
# in-memory `word_vectors` instead.
from gensim.models import KeyedVectors
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

2021-08-09 20:47:55,298 : INFO : loading Word2VecKeyedVectors object from word2vec.wordvectors
2021-08-09 20:47:56,362 : INFO : loading vectors from word2vec.wordvectors.vectors.npy with mmap=r
2021-08-09 20:47:56,366 : INFO : setting ignored attribute vectors_norm to None
2021-08-09 20:47:56,368 : INFO : loaded word2vec.wordvectors


In [19]:
# Face-validity check: list the 20 nearest neighbours of 'flüchtling' (refugee).
# NOTE(review): this queries the in-memory `word_vectors`, not the `wv` object
# reloaded from disk in the previous cell.
word_vectors.most_similar('flüchtling', topn=20)  # get other similar words

[('kriegsflüchtling', 0.7599212527275085),
 ('migrant', 0.7159389853477478),
 ('asylsuchender', 0.6973713040351868),
 ('afghane', 0.669090747833252),
 ('syrer', 0.6585391759872437),
 ('asylbewerber', 0.6557232141494751),
 ('häftling', 0.6416419744491577),
 ('flüchtlingskind', 0.6333128213882446),
 ('kurde', 0.6098465919494629),
 ('eritreer', 0.6053134202957153),
 ('dschihadist', 0.588051438331604),
 ('islamist', 0.5863703489303589),
 ('soldat', 0.5796210765838623),
 ('tourist', 0.5790219902992249),
 ('muslim', 0.5779893398284912),
 ('terrorist', 0.575116753578186),
 ('iraker', 0.5727214813232422),
 ('bundeswehrsoldat', 0.5690356492996216),
 ('türke', 0.5672888159751892),
 ('passagier', 0.5633428692817688)]