In [1]:
## Estimate word embeddings from newspaper data.
## Code adapted from https://github.com/damian0604/embeddingworkshop/blob/main/04exercise.ipynb

# standard library
import logging
import os
import re
import string
from datetime import datetime

# third-party
import gensim
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
# tqdm allows you to display progress bars in loops
from tqdm import tqdm

# sent_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases read it
# from the 'punkt' package, recent ones look for 'punkt_tab' instead, so fetch both
# to keep the notebook working whichever NLTK version is installed.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

# let's get more output: show INFO-level log messages, each prefixed with a timestamp
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# get full set of news articles
!rm sample_data -r
!mkdir newspapers
!wget -O newspapers/articles.zip https://www.dropbox.com/sh/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa?dl=0
!unzip newspapers/articles.zip -d newspapers
!rm newspapers/articles.zip

rm: cannot remove 'sample_data': No such file or directory
mkdir: cannot create directory ‘newspapers’: File exists
--2021-07-21 11:17:19--  https://www.dropbox.com/sh/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6018:18::a27d:312
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /sh/raw/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa [following]
--2021-07-21 11:17:19--  https://www.dropbox.com/sh/raw/r6k4qk9flgz0agu/AAA5ZLsuOwk9UWiEsLAOFmDSa
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca9a52c38306bc1845c7f1e2b3a.dl.dropboxusercontent.com/zip_download_get/A2EPQsvytsO3ASgEJkkrQvLD6QIxOhemBYiePmfMEt2CJLAEL8AEy0_zqKlDSvW17dOjtX1jCx97gedws4uEwYTksuyXxwls1-jSy0AWCBJHuw# [following]
--2021-07-21 11:17:20--  https://uca9a52c38306bc1845c7f1e2b3a.dl.drop

In [None]:
# load all texts
# Read every file once, collect the frames and concatenate a single time at the end.
# (DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied all rows loaded so far on every pass.)
frames = []
n_loaded = 0
# sorted(): os.listdir returns names in arbitrary order; sorting keeps the row order stable across runs
for filename in tqdm(sorted(os.listdir('newspapers'))):
  frame = pd.read_csv(os.path.join('newspapers', filename))
  frames.append(frame)
  n_loaded += frame.shape[0]
  print(f'Loaded {n_loaded} articles', end = '\r')

if not frames:
  raise FileNotFoundError("no article files found in 'newspapers'")

# ignore_index=True gives every article its own row label instead of repeating
# the labels 0..n of each individual file
artcls = pd.concat(frames, ignore_index=True)
# drop the per-file frames so only the combined frame stays in memory
del frames, frame
print(f'Loaded {artcls.shape[0]} articles, done.')


  0%|          | 0/11 [00:00<?, ?it/s][A
  9%|▉         | 1/11 [00:10<01:43, 10.31s/it][A

Loaded 263266 articles



 18%|█▊        | 2/11 [00:57<03:11, 21.23s/it][A

Loaded 575291 articles



 27%|██▋       | 3/11 [01:22<03:01, 22.64s/it][A

Loaded 794334 articles



 36%|███▋      | 4/11 [01:28<02:03, 17.65s/it][A

Loaded 893180 articles



 45%|████▌     | 5/11 [02:00<02:10, 21.74s/it][A

Loaded 1220020 articles



 55%|█████▍    | 6/11 [02:06<01:26, 17.23s/it][A

Loaded 1285387 articles



 64%|██████▎   | 7/11 [02:20<01:04, 16.16s/it][A

Loaded 1436035 articles


In [None]:
# show the text of the first article; .iloc selects by position, so this is a single
# string even when row labels repeat or when label 0 is not present
artcls.text.iloc[0]

In [None]:
# (number of rows, number of columns) of the loaded articles
artcls.shape

In [12]:

# keep only the rows whose text really is a string (anything else cannot be tokenized)
is_text = artcls.text.map(lambda value: isinstance(value, str))
artcls = artcls[is_text.to_numpy()]  # plain boolean array: select by position, no label alignment

# cut into sentences
trans = str.maketrans('', '', string.punctuation) # translation scheme for removing punctuation
html_tag = re.compile(r"<.*?>")  # compiled once instead of on every sentence
uniquesentences = set()
for review in tqdm(artcls.text):
    for sentence in sent_tokenize(review):
        # replace HTML tags with a space, strip punctuation, then lower-case
        sentence = html_tag.sub(" ", sentence).translate(trans).lower()
        # The set takes care of duplicates by itself, so no membership test is needed
        # (the old test compared the not-yet-lower-cased sentence and so rarely matched).
        # Sentences that have no tokens left after cleaning are skipped.
        if sentence.strip():
            uniquesentences.add(sentence)

print(f"We now have {len(uniquesentences)} unique sentences.")

 93%|█████████▎| 239699/257797 [04:31<00:24, 738.78it/s]

KeyboardInterrupt: ignored

In [None]:
# We do not need a list of lists of tokens in memory, so the sentences are tokenized lazily.
# A plain generator can only be passed over once: it is already exhausted when the training
# cell is re-run, and it cannot be used for more than one epoch. The small class below splits
# the sentences on the fly as well, but starts a fresh pass every time it is iterated.
class TokenizedSentences:
    """Restartable, lazy view of a collection of sentences as lists of tokens.

    Each iteration walks over `sentences` again and yields `sentence.split()`,
    i.e. the whitespace-separated tokens of one sentence at a time.
    """

    def __init__(self, sentences):
        # keep a reference only; nothing is tokenized until iteration starts
        self.sentences = sentences

    def __iter__(self):
        for sentence in self.sentences:
            yield sentence.split()


# Two names are kept because the following cells use one for building the vocabulary
# and one for training; both can be iterated as often as needed.
tokenizedsentences = TokenizedSentences(uniquesentences)
tokenizedsentences2 = TokenizedSentences(uniquesentences)

In [10]:
# Settings for this run
EMBEDDING_DIM = 300  # we want 300 dimensions
N_EPOCHS = 1         # a single pass over the sentences

# Step 1: create an empty model and let it collect the vocabulary
print(f"Started setting up the model at {datetime.now()}")
model = gensim.models.Word2Vec(vector_size=EMBEDDING_DIM)
model.build_vocab(corpus_iterable=tokenizedsentences)

# Step 2: train on the second pass over the sentences
print(f"Started training at {datetime.now()}")
model.train(
    corpus_iterable=tokenizedsentences2,
    total_examples=model.corpus_count,
    epochs=N_EPOCHS,
)
# The model gets better with more epochs (e.g. epochs=model.epochs), but that needs
# input that can be iterated more than once; a generator can only be passed over once.
print(f"Finished training at {datetime.now()}")

(257797, 6)

In [None]:
# store the trained model under the name "np_emb" (relative to the current working directory)
model.save("np_emb")