<a href="https://colab.research.google.com/github/nicolaiberk/GermanNPEmbs/blob/main/emb_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to the Colab runtime at /content/drive.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## Estimate word embeddings from newspaper data.
## Code adapted from https://github.com/damian0604/embeddingworkshop/blob/main/04exercise.ipynb

# --- standard library ---
import ast
import csv
import logging
import os
import re
import string
import sys
import time
from datetime import datetime

# --- third party ---
import gensim
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm import tqdm  # progress bars for long-running loops

# Fetch the Punkt data that sent_tokenize relies on.
nltk.download('punkt')

# Lift the csv module's per-field size limit to the largest value it accepts
# (presumably because individual article texts are very long).
csv.field_size_limit(sys.maxsize)

# Show INFO-level log records so library progress messages appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Folder used for the sentence file and the saved model; relative to the working directory.
drivepath = 'drive/MyDrive/Bild/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# get full set of news articles
# Fetch the raw article archive unless the articles are already unpacked locally
# or the derived sentence file already exists in the Drive folder.
if not os.path.isfile('newspapers/_bild_articles.csv') and not os.path.isfile(drivepath+'uniquesentences.txt'):
    # Pure-Python directory creation: also succeeds when the folder is left over from an earlier attempt.
    os.makedirs('newspapers', exist_ok=True)

    # Stop immediately if the download fails instead of unzipping a missing or partial archive.
    if os.system('wget -O newspapers/articles.zip https://www.dropbox.com/sh/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa?dl=0') != 0:
        raise RuntimeError('Download of newspapers/articles.zip failed (wget returned a non-zero status).')

    # unzip reports warnings with a non-zero status too, so surface the status rather than abort.
    unzip_status = os.system('unzip newspapers/articles.zip -d newspapers')
    if unzip_status != 0:
        print(f'unzip returned status {unzip_status}; check the contents of newspapers/ before continuing.')

    # The archive is no longer needed once it has been unpacked.
    os.remove('newspapers/articles.zip')

In [4]:
# Read all article files into one DataFrame `artcls` and drop rows without a string text.
# Skipped entirely once the sentence file exists in the Drive folder.
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  # Load every file in newspapers/ once; skipped when `artcls` already exists from an earlier run.
  if 'artcls' not in locals():
    frames = []
    n_loaded = 0
    for filename in tqdm(os.listdir('newspapers')):
      frames.append(pd.read_csv('newspapers/'+filename))
      n_loaded += frames[-1].shape[0]
      print(f'\nLoaded {n_loaded} articles')
    if not frames:
      raise FileNotFoundError("No article files found in 'newspapers'.")
    # One concat at the end: DataFrame.append was removed in pandas 2.0 and
    # re-copied the growing frame on every call.
    artcls = pd.concat(frames)
    del(frames)
    print(f'Loaded {artcls.shape[0]} articles, done.')

    # Replace the per-file row labels with one running index (old labels are kept in an 'index' column).
    artcls = artcls.reset_index()


  # keep only rows whose text is a string
  stringvar = [str == type(i) for i in artcls.text]
  artcls = artcls[stringvar]
  del(stringvar)

  # Positional access: label 0 no longer exists if the first row was dropped by the filter above.
  print(artcls.text.iloc[0])

In [5]:
# Reduce `artcls` to its text column; nothing to do once the sentence file exists.
sentences_cached = os.path.isfile(drivepath+'uniquesentences.txt')
if not sentences_cached:
  artcls = artcls['text']

In [6]:
# Split every article into sentences, normalise them, and keep each distinct sentence once.
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  print('\nCutting into sentences:')
  uniquesentences = set()
  trans = str.maketrans('', '', string.punctuation) # translation table that deletes ASCII punctuation
  for article in tqdm(artcls):
    # The corpus appears to be German newspaper text (cf. the 'flüchtling' query at the end
    # of the notebook), so use the German Punkt model instead of the English default.
    for sentence in sent_tokenize(article, language='german'):
      # strip punctuation and lowercase; a set ignores repeats, so no membership test is needed
      uniquesentences.add(sentence.translate(trans).lower())

  # the article texts are not used after this point
  del(artcls)

In [7]:
# Report how many distinct sentences were collected (only when they were just computed).
sentences_cached = os.path.isfile(drivepath+'uniquesentences.txt')
if not sentences_cached:
  n_unique = len(uniquesentences)
  print(f"We now have {n_unique} unique sentences.")

In [8]:
# Persist the unique sentences, one per line, at the Drive path that this guard,
# the reload cell and `inp` all refer to (the file used to be written to the
# working directory, where none of them looked for it).
if not os.path.isfile(drivepath+'uniquesentences.txt'):
  target = drivepath+'uniquesentences.txt'
  partial = target+'.tmp'
  # Write to a temporary name first so an interrupted run cannot leave a
  # truncated file that the guard above would mistake for a finished one.
  with open(partial, 'w', encoding='utf-8') as fo:
    for sentence in tqdm(uniquesentences):
      # Collapse all whitespace (including embedded line breaks) so that each
      # sentence occupies exactly one line; plain text avoids csv quoting.
      line = ' '.join(sentence.split())
      if line:
        fo.write(line + '\n')
  os.replace(partial, target)

In [10]:
# Reload the sentences from the Drive folder when the file is available there.
sentence_file = drivepath+'uniquesentences.txt'
if os.path.isfile(sentence_file):
  with open(sentence_file) as fi:
    uniquesentences = list(fi)  # one entry per line of the file
  n_unique = len(uniquesentences)
  print(f"We now have {n_unique} unique sentences.")

We now have 42302049 unique sentences.


In [8]:
# Lazy, single-pass iterator of whitespace-split sentences for the vocabulary scan.
tokenizedsentences = map(str.split, uniquesentences)

In [9]:
# Path of the sentence file that the training step reads from.
inp = f"{drivepath}uniquesentences.txt"

In [10]:
# Create the Word2Vec model and scan the tokenised sentences to define its vocabulary.
print(f"Started setting up the model at {datetime.now()}")
# gensim 4 renamed the dimensionality argument from `size` to `vector_size`;
# pick whichever name the installed release accepts.
dim_arg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
# we want 300 dimensions and not overdo it with the features (words seen fewer than 100 times are ignored)
model = gensim.models.Word2Vec(min_count=100, window=5, workers=4, **{dim_arg: 300})
model.build_vocab(tokenizedsentences)
print(f"Finished vocabulary definition at {datetime.now()}")

2021-08-09 14:21:31,626 : INFO : collecting all words and their counts
2021-08-09 14:21:31,629 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-08-09 14:21:31,731 : INFO : PROGRESS: at sentence #10000, processed 165871 words, keeping 34842 word types


Started setting up the model at 2021-08-09 14:21:31.622141


2021-08-09 14:21:31,863 : INFO : PROGRESS: at sentence #20000, processed 331064 words, keeping 56562 word types
2021-08-09 14:21:31,963 : INFO : PROGRESS: at sentence #30000, processed 499094 words, keeping 74819 word types
2021-08-09 14:21:32,064 : INFO : PROGRESS: at sentence #40000, processed 666376 words, keeping 90933 word types
2021-08-09 14:21:32,165 : INFO : PROGRESS: at sentence #50000, processed 832255 words, keeping 105398 word types
2021-08-09 14:21:32,260 : INFO : PROGRESS: at sentence #60000, processed 999771 words, keeping 118978 word types
2021-08-09 14:21:32,347 : INFO : PROGRESS: at sentence #70000, processed 1167867 words, keeping 131658 word types
2021-08-09 14:21:32,443 : INFO : PROGRESS: at sentence #80000, processed 1335164 words, keeping 143691 word types
2021-08-09 14:21:32,533 : INFO : PROGRESS: at sentence #90000, processed 1501313 words, keeping 155107 word types
2021-08-09 14:21:32,622 : INFO : PROGRESS: at sentence #100000, processed 1667311 words, keeping

Finished vocabulary definition at 2021-08-09 14:30:39.520239


In [None]:
# The in-memory sentences are not referenced by name after the vocabulary scan; drop the name.
del uniquesentences

In [14]:
from gensim.models.word2vec import LineSentence

# Train for five epochs on the sentence file at `inp`, read through LineSentence.
N_EPOCHS = 5
print(f"Started training at {datetime.now()}")
corpus = LineSentence(inp)
model.train(corpus, total_examples=model.corpus_count, epochs=N_EPOCHS)
print(f"Finished training at {datetime.now()}")

2021-08-09 14:37:48,912 : INFO : training model with 4 workers on 175107 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5


Started training at 2021-08-09 14:37:48.911598


2021-08-09 14:37:49,933 : INFO : EPOCH 1 - PROGRESS: at 0.06% examples, 316279 words/s, in_qsize 7, out_qsize 0
2021-08-09 14:38:02,275 : INFO : EPOCH 1 - PROGRESS: at 0.12% examples, 48567 words/s, in_qsize 4, out_qsize 2
2021-08-09 14:38:03,276 : INFO : EPOCH 1 - PROGRESS: at 0.18% examples, 70202 words/s, in_qsize 6, out_qsize 1
2021-08-09 14:38:04,302 : INFO : EPOCH 1 - PROGRESS: at 0.25% examples, 89388 words/s, in_qsize 7, out_qsize 0
2021-08-09 14:38:05,326 : INFO : EPOCH 1 - PROGRESS: at 0.31% examples, 105695 words/s, in_qsize 5, out_qsize 0
2021-08-09 14:38:06,330 : INFO : EPOCH 1 - PROGRESS: at 0.38% examples, 121151 words/s, in_qsize 7, out_qsize 0
2021-08-09 14:38:07,332 : INFO : EPOCH 1 - PROGRESS: at 0.45% examples, 135759 words/s, in_qsize 6, out_qsize 0
2021-08-09 14:38:08,415 : INFO : EPOCH 1 - PROGRESS: at 0.52% examples, 146615 words/s, in_qsize 6, out_qsize 3
2021-08-09 14:38:09,430 : INFO : EPOCH 1 - PROGRESS: at 0.59% examples, 158383 words/s, in_qsize 7, out_qsi

KeyboardInterrupt: ignored

In [None]:
# Store the full model in the Drive folder.
print('Saving model:')
model_path = drivepath+"np_emb"
model.save(model_path)
print('Model finished!')

In [None]:
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

Assess model validity

In [None]:
from gensim.models import KeyedVectors

# Read the stored vectors back, memory-mapped read-only.
vectors_file = "word2vec.wordvectors"
wv = KeyedVectors.load(vectors_file, mmap='r')

In [None]:
# Query the vectors reloaded from disk (`wv`) rather than the in-memory object left over
# from training, so this check also works in a session where the model was not just trained.
wv.most_similar('flüchtling', topn=10)  # ten nearest neighbours of the query word