# COVID19 Word2Vec generator

This notebook trains a (Gensim) word2vec model and saves it as a dataset for future use. You could use it for things like looking up words that are used in similar contexts (near-synonyms) in the original documents. It uses my [preprocessed COVID NLP dataset](https://www.kaggle.com/donkeys/covid-nlp-preprocess) as input and produces the [word2vec dataset](https://www.kaggle.com/donkeys/covid-word2vec) as output.

In [1]:
import numpy as np
import pandas as pd

import os

import kaggle_uploader
kaggle_uploader.__version__

'0.3.0'

In [2]:
class COVDoc:
    """Holder for one document in its original and preprocessed forms.

    Every attribute starts as ``None`` and is expected to be filled in by the
    caller. Only ``filepath_orig`` and ``text_orig`` are used by this class
    itself (see :meth:`load_orig`); the meaning of the other fields is
    implied by their names only.
    """

    def __init__(self):
        self.filepath_proc = None
        # Path of the original JSON file; read by load_orig().
        self.filepath_orig = None
        self.text_proc = None
        # Filled in by load_orig(); stays None until that is called.
        self.text_orig = None
        self.tokenized_proc = None
        self.doc_type = None

    def load_orig(self):
        """Lazy-load the original text into ``self.text_orig`` to save memory.

        Reads the JSON file at ``self.filepath_orig`` and concatenates every
        entry of its ``"body_text"`` list, each one prefixed by a single
        space. Returns nothing; the result is stored on the instance.

        Raises:
            OSError: if the file cannot be opened.
            KeyError: if the JSON document has no ``"body_text"`` key.
        """
        # Use this instance's own path. The previous version read
        # `doc.filepath_orig`, which depended on an unrelated global `doc`.
        with open(self.filepath_orig) as f:
            d = json.load(f)
        # NOTE(review): each entry is formatted with str(); if the entries are
        # dicts (e.g. {"text": ...}) their repr ends up in the text - confirm
        # whether paragraph["text"] was intended.
        self.text_orig = "".join(f" {paragraph}" for paragraph in d["body_text"])

In [3]:
import glob, os, json

# Shared accumulator: every load_docs() call appends its paragraphs here.
paragraphs = []


def load_docs(base_path, doc_type):
    """Collect paragraph texts from all JSON files matching a glob pattern.

    Each matched file is parsed as JSON, and the ``"text"`` value of every
    entry in its ``"body_text"`` list is appended to the module-level
    ``paragraphs`` list (so calling this twice with the same pattern adds the
    same paragraphs twice).

    Args:
        base_path: glob pattern selecting the files to read.
        doc_type: label for the collection; currently not used.

    Returns:
        Base names of the matched files, in the order returned by ``glob``.
    """
    matched_paths = glob.glob(base_path)
    for json_path in matched_paths:
        with open(json_path) as handle:
            parsed = json.load(handle)
            paragraphs.extend(entry["text"] for entry in parsed["body_text"])
    return [os.path.basename(json_path) for json_path in matched_paths]

In [4]:
# List the top level of the attached preprocessed input dataset.
!ls /kaggle/input/covid-nlp-preprocess

output


In [5]:
# Peek at the first few file names of one subset. `head` closes the pipe
# after ten lines, which is why `ls` reports a harmless "Broken pipe" below.
!ls /kaggle/input/covid-nlp-preprocess/output/paragraphs/biorxiv_medrxiv | head

0001418189999fea7f7cbe3e82703d71c85a6fe5.json
00016663c74157a66b4d509d5c4edffd5391bbe0.json
002c9e9bed0d874c169d9f77a135f12e41b733ee.json
01213acdd86020357259f2a1094bc43f9bb79796.json
0131ce11f9dbeac6ad5f732ab5d268674da53290.json
014fcb209d3870dce737d4d50e3ec85044cfd2f6.json
01626763ff19226d69dedacfe5fa22f2f0dd0018.json
018fb5e62fbbcae07d57d94d29ac630dcc4dccf9.json
019d4817c1bb20299f7bcd20248bd85ad0f59a2e.json
01b1b409f426cc712ba8e1876d0ac34bab8689e1.json
ls: write error: Broken pipe


In [6]:
# NOTE(review): this hard-coded file is not present in the attached dataset
# version (see the "No such file" error in the output); use a name from the
# directory listing instead. The second `| head` is redundant.
!head /kaggle/input/covid-nlp-preprocess/output/paragraphs/biorxiv_medrxiv/00340eea543336d54adda18236424de6a5e91c9d.json | head

head: cannot open '/kaggle/input/covid-nlp-preprocess/output/paragraphs/biorxiv_medrxiv/00340eea543336d54adda18236424de6a5e91c9d.json' for reading: No such file or directory


In [7]:
# Load the biorxiv/medrxiv subset. load_docs() returns the file names and, as
# a side effect, appends every paragraph to the global `paragraphs` list, so
# re-running this cell without resetting `paragraphs` adds duplicates.
med_docs = load_docs("/kaggle/input/covid-nlp-preprocess/output/paragraphs/biorxiv_medrxiv/*.json", "medx")
len(med_docs)

2087

In [8]:
# Load the comm_use_subset files; their paragraphs are appended to the global
# `paragraphs` list by load_docs().
comuse_docs = load_docs("/kaggle/input/covid-nlp-preprocess/output/paragraphs/comm_use_subset/*.json", "comuse")
len(comuse_docs)



8682

In [9]:
# Load the noncomm_use_subset files; their paragraphs are appended to the
# global `paragraphs` list by load_docs().
noncom_docs = load_docs("/kaggle/input/covid-nlp-preprocess/output/paragraphs/noncomm_use_subset/*.json", "noncom")
len(noncom_docs)

2102

In [10]:
# Load the custom_license files; their paragraphs are appended to the global
# `paragraphs` list by load_docs().
custom_docs = load_docs("/kaggle/input/covid-nlp-preprocess/output/paragraphs/custom_license/*.json", "custom")
len(custom_docs)

27073

In [11]:
from gensim.models.word2vec import Word2Vec 

# Train word2vec on every paragraph collected by the load_docs() calls above.
# Per the gensim docs: size = vector dimensionality, window = context width,
# min_count = drop words seen fewer than this many times, workers = threads.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4) - adjust if the environment is upgraded. No seed is set and
# several workers are used, so vectors will presumably differ between runs.
model = Word2Vec(paragraphs, size=300, window=5, min_count=5, workers=4)


In [12]:
# Uncomment to inspect the learned vocabulary:
#model.wv.vocab
# Keep a separate handle on the word vectors; they are queried and pickled below.
word_vectors = model.wv
# Do not delete `model` here: a later cell still calls model.save(...).
#del model



In [13]:
# Sanity check: the 50 words whose vectors are closest to "patient".
word_vectors.most_similar("patient", topn=50)

[('child', 0.6941933035850525),
 ('inpatient', 0.6390001773834229),
 ('subject', 0.6166149377822876),
 ('hsct_recipient', 0.6060119867324829),
 ('outpatient', 0.6017776727676392),
 ('hct_recipient', 0.5964309573173523),
 ('person', 0.5931017398834229),
 ('admitted_icu', 0.580216646194458),
 ('woman', 0.5711398124694824),
 ('infant', 0.5692756175994873),
 ('icu', 0.5690579414367676),
 ('case', 0.5645661354064941),
 ('nursing_home_resident', 0.5591453313827515),
 ('survivor', 0.5534113645553589),
 ('hcws', 0.5507948398590088),
 ('outpatient_clinic', 0.5468251705169678),
 ('admission', 0.540906548500061),
 ('boy', 0.538764238357544),
 ('casepatients', 0.536649227142334),
 ('hcw', 0.5299692153930664),
 ('episode', 0.5295727252960205),
 ('febrile_neutropenia', 0.5291351079940796),
 ('participant', 0.5226424932479858),
 ('newonset', 0.5219123363494873),
 ('emergency_department_ed', 0.5217392444610596),
 ('pregnant_woman', 0.5201883912086487),
 ('hospitalized', 0.5180360078811646),
 ('girl', 

In [14]:
# Create the staging directory for the dataset upload. exist_ok=True keeps
# the cell idempotent: re-running it no longer reports "File exists".
os.makedirs("upload_dir", exist_ok=True)

In [15]:
import pickle

# Store only the word vectors (model.wv) as a pickle, next to the full model
# saved in the following cell.
# NOTE(review): consumers should only unpickle this file from a trusted
# source, since pickle.load can execute arbitrary code.
with open("upload_dir/word2vec.pickle", "wb") as f:
    pickle.dump(word_vectors, f)

In [16]:
# Save the full model in gensim's own format. As the upload log further down
# shows, this also writes companion files (word2vec.model.wv.vectors.npy and
# word2vec.model.trainables.syn1neg.npy) that must stay next to word2vec.model.
model.save("upload_dir/word2vec.model")

In [17]:
import kaggle_uploader

from kaggle_secrets import UserSecretsClient

# Fetch the API key from the Kaggle secrets store rather than hard-coding it.
user_secrets = UserSecretsClient()
api_secret = user_secrets.get_secret("kaggle api key")

# Start from an empty resource list (presumably so a re-run of this cell does
# not register the same files twice - confirm against kaggle_uploader).
kaggle_uploader.resources = []
kaggle_uploader.init_on_kaggle("donkeys", api_secret)
kaggle_uploader.base_path = "./upload_dir"
kaggle_uploader.title = "COVID Word2Vec"
kaggle_uploader.dataset_id = "covid-word2vec"
kaggle_uploader.user_id = "donkeys"
# Only these two files get descriptions; the log below shows the .npy
# companion files written by model.save() being uploaded from the directory too.
kaggle_uploader.add_resource("word2vec.pickle", "pickled word2vec for covid19 dataset")
kaggle_uploader.add_resource("word2vec.model", "gensim saved word2vec for covid19 dataset")
# create() is presumably for the first publication of the dataset; update()
# pushes a new version (the log shows it running `kaggle datasets version`).
#kaggle_uploader.create()
kaggle_uploader.update("new version")



/kaggle/working/upload_dir
/kaggle/working/upload_dir
running cmd:['kaggle', 'datasets', 'version', '-p', '/kaggle/working/upload_dir', '-m', '"new version"']
Starting upload for file word2vec.model
Upload successful: word2vec.model (16MB)
Starting upload for file word2vec.model.wv.vectors.npy
Upload successful: word2vec.model.wv.vectors.npy (261MB)
Starting upload for file word2vec.pickle
Upload successful: word2vec.pickle (537MB)
Starting upload for file word2vec.model.trainables.syn1neg.npy
Upload successful: word2vec.model.trainables.syn1neg.npy (261MB)
Dataset version is being created. Please check progress at https://www.kaggle.com/donkeys/covid-word2vec



{'title': 'COVID Word2Vec',
 'id': 'donkeys/covid-word2vec',
 'licenses': [{'name': 'CC0-1.0'}],
 'resources': [{'path': '/kaggle/working/upload_dir/word2vec.pickle',
   'description': 'pickled word2vec for covid19 dataset'},
  {'path': '/kaggle/working/upload_dir/word2vec.model',
   'description': 'gensim saved word2vec for covid19 dataset'}]}