In [1]:
## estimate word embeddings from newspaper data
## code adapted from https://github.com/damian0604/embeddingworkshop/blob/main/04exercise.ipynb
import nltk
# download the 'punkt' tokenizer data; it is needed for sentence splitting with sent_tokenize
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import string
import re
import os
import pandas as pd


# tqdm displays progress bars for loops
from tqdm import tqdm
from datetime import datetime

import gensim

# more verbose output: print timestamped log messages of level INFO and above
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# get full set of news articles
!rm sample_data -r
!mkdir newspapers
!wget -O newspapers/articles.zip https://www.dropbox.com/sh/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa?dl=0
!unzip newspapers/articles.zip -d newspapers
!rm newspapers/articles.zip

--2021-07-21 11:39:46--  https://www.dropbox.com/sh/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /sh/raw/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa [following]
--2021-07-21 11:39:46--  https://www.dropbox.com/sh/raw/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc61d53532e3c4c0e875eaf0c351.dl.dropboxusercontent.com/zip_download_get/A2Fc0D8YvMcR4I0Gt-Gr2_Cxe15W_-nJpwg4zXuhx2TLW6bGPERJWupAuT58O1JZVZ-QNo-EjEgGR0TlopF4bf-ZJ6g-drSFk-T6weF4gnpZzw# [following]
--2021-07-21 11:39:47--  https://uc61d53532e3c4c0e875eaf0c351.dl.dropboxusercontent.com/zip_download_get/A2Fc0D8YvMcR4I0Gt-Gr2_Cxe15W_-nJpwg4zXuhx2TLW6bGPERJWupAuT58O1JZVZ-QNo-EjEgGR

In [3]:
# load all texts
# Every file in 'newspapers' is read as CSV and the pieces are combined with a
# single pd.concat at the end. Appending to a growing DataFrame inside the loop
# copies all previously loaded rows on every iteration, and DataFrame.append
# no longer exists in pandas >= 2.0.
if 'artcls' in locals():
    del artcls  # free a copy left over from an earlier run before loading again

frames = []    # one DataFrame per file
n_loaded = 0   # running number of rows read so far
for filename in tqdm(os.listdir('newspapers')):
    if frames:
        # same progress message as before: rows loaded prior to this file
        print(f'\nLoaded {n_loaded} articles')
    frame = pd.read_csv(os.path.join('newspapers', filename))
    frames.append(frame)
    n_loaded += frame.shape[0]

if not frames:
    raise FileNotFoundError("no article files found in 'newspapers'")

# keep each file's own row labels (no ignore_index), exactly like the former
# append-based code, so later cells see the same index and columns
artcls = pd.concat(frames)
del frames, frame  # the combined frame is all we need from here on
print(f'Loaded {artcls.shape[0]} articles, done.')

  9%|▉         | 1/11 [00:06<01:09,  6.97s/it]


Loaded 263266 articles


 18%|█▊        | 2/11 [00:19<01:17,  8.57s/it]


Loaded 575291 articles


 27%|██▋       | 3/11 [00:26<01:05,  8.22s/it]


Loaded 794334 articles


 36%|███▋      | 4/11 [00:28<00:44,  6.40s/it]


Loaded 893180 articles


 45%|████▌     | 5/11 [00:37<00:42,  7.05s/it]


Loaded 1220020 articles


 55%|█████▍    | 6/11 [00:39<00:28,  5.60s/it]


Loaded 1285387 articles


 64%|██████▎   | 7/11 [00:47<00:25,  6.29s/it]


Loaded 1436035 articles


 73%|███████▎  | 8/11 [00:49<00:14,  4.97s/it]


Loaded 1472453 articles


 82%|████████▏ | 9/11 [00:51<00:08,  4.25s/it]


Loaded 1545861 articles


 91%|█████████ | 10/11 [00:53<00:03,  3.41s/it]


Loaded 1634513 articles


100%|██████████| 11/11 [01:08<00:00,  6.24s/it]

Loaded 2474182 articles, done.





In [4]:
# move the current index into a regular column and renumber the rows 0..n-1
artcls = artcls.reset_index()
# show the text of the row labelled 0 as a quick sanity check
artcls.loc[0, "text"]

'was fehlt ... ... der Vorsatz    Der Vorsatz hat keinen guten Klang: Mit Vorsatz gehandelt zu haben, wird einem meist im Gericht vorgeworfen. Auch die guten Vorsätze haben immer den unangenehmen Beigeschmack von Schuld und Sühne - nicht zuletzt, weil sie zu 90 Prozent gebrochen werden. Woher der Brauch kommt, sich im neuen Jahr eine Änderung des Verhaltens vorzunehmen, ist unklar.    Am wahrscheinlichsten ist ein christlicher Ursprung, wie bei vielen Festtagsbräuchen - immerhin stammt das Wort Silvester vom Namenstag des Papstes Silvester (lateinisch für "Waldmensch"), der am 31. Dezember 335 starb. Möglicherweise sind die guten Vorsätze also eine katholische Erfindung: Die Sünden werden vergeben, aber nur, wenn man Besserung gelobt.    Die Wortherkunft der guten Vorsätze ist leichter zu bestimmen: Die Wurzel des Guten liegt im germanischen "goda" (passend, geeignet), das sich im 8. Jahrhundert zu "guot" (Besitz, Vermögen) weiterentwickelte. Vorsätze hießen im Mittelhochdeutschen "vür

In [5]:
# (number of rows, number of columns) of the combined article table
artcls.shape

(2474182, 11)

In [None]:
# keep only rows whose text is an actual string
# (e.g. missing values are read by pandas as NaN, which sent_tokenize cannot handle)
is_text = [isinstance(text, str) for text in artcls.text]
artcls = artcls[is_text]

# cut into sentences
trans = str.maketrans('', '', string.punctuation)  # translation scheme for removing punctuation
html_tag = re.compile(r"<.*?>")                    # compiled once instead of once per sentence
uniquesentences = set()
for review in tqdm(artcls.text):
    # the articles are German, so use the German Punkt model rather than the
    # English default (it ships with the same 'punkt' download)
    for sentence in sent_tokenize(review, language='german'):
        # remove HTML tags in there
        sentence = html_tag.sub(" ", sentence)
        sentence = sentence.translate(trans).lower()
        # a set ignores duplicates by itself, so no membership test is needed;
        # sentences that are empty after cleaning carry no tokens and are skipped
        if sentence.strip():
            uniquesentences.add(sentence)

print(f"We now have {len(uniquesentences)} unique sentences.")

 15%|█▌        | 341824/2214853 [07:07<48:40, 641.26it/s]

In [None]:
# drop the reference to the article table so its memory can be reclaimed
del artcls

In [None]:
# We never need all token lists in memory at once, so sentences are tokenized
# lazily. A plain generator can only be consumed once, which would make
# re-running the training cell (or training for several epochs) silently see an
# empty corpus. This small wrapper creates a fresh generator every time it is
# iterated, so it stays memory-friendly but can be passed over repeatedly.
class TokenizedSentences:
    """Re-iterable, lazily tokenized view over a collection of sentence strings.

    Each iteration yields one list of whitespace-separated tokens per sentence.
    """

    def __init__(self, sentences):
        # only a reference is kept; nothing is tokenized up front
        self.sentences = sentences

    def __iter__(self):
        return (sentence.split() for sentence in self.sentences)


# Two names are kept because one is used to build the vocabulary and the other
# to train; both can refer to the same re-iterable object.
tokenizedsentences = TokenizedSentences(uniquesentences)
tokenizedsentences2 = tokenizedsentences

In [10]:
# Step 1: create an untrained model and scan the corpus once for the vocabulary.
setup_started = datetime.now()
print(f"Started setting up the model at {setup_started}")
model = gensim.models.Word2Vec(vector_size=300)  # 300-dimensional word vectors
model.build_vocab(tokenizedsentences)

# Step 2: a single training pass over the second corpus iterable.
training_started = datetime.now()
print(f"Started training at {training_started}")
model.train(
    tokenizedsentences2,
    total_examples=model.corpus_count,
    epochs=1,
)
# More epochs generally give a better model, but a generator can only be passed
# over once; with a list (or another re-iterable corpus) one could instead pass
# epochs=model.epochs to the train call above.
training_finished = datetime.now()
print(f"Finished training at {training_finished}")

(257797, 6)

In [None]:
# write the trained model to the file "np_emb" (relative to the current working directory)
model.save("np_emb")