In [1]:
from typing import Iterator, List
from gensim import corpora, models
import gensim

In [2]:
def get_tokens(infile: str) -> Iterator[List[str]]:
    '''Stream a text file as one whitespace-split token list per line.

    Parameters
        infile: path of a UTF-8 text file holding one document per line
    Yields
        the tokens of each line, in file order; leading/trailing whitespace
        (including the newline) is stripped before splitting, so a blank
        line yields an empty list
    '''
    with open(infile, 'r', encoding='utf-8') as handle:
        yield from (raw_line.strip().split() for raw_line in handle)

In [14]:
# String Constants
# Input text files. Both are read through get_tokens(), i.e. one document per
# line with tokens separated by whitespace.
# TOKENFILE: title/abstract tokens (going by the file name).
# CLASSFILE: IPC classification codes (going by the file name).
TOKENFILE = '../intermediate/titleabstract_tokens.txt'
CLASSFILE = '../intermediate/classifications_ipc.txt'

In [4]:
# Build the title/abstract dictionary from the token lists streamed out of TOKENFILE.
terms_dict = corpora.Dictionary(get_tokens(TOKENFILE))

In [5]:
# Save the title/abstract dictionary to disk.
terms_dict.save("../models/titleabstract.dict")

In [6]:
def tokens2sparsevecs(infile: str, dictname: gensim.corpora.dictionary.Dictionary):
    '''Lazily convert every document of a token file with dictname.doc2bow().

    Parameters
        infile: path handed to get_tokens(); one whitespace-tokenised document per line
        dictname: gensim Dictionary whose doc2bow() is applied to each document
    Returns
        a generator of doc2bow() results, one per line of infile. It can be
        consumed only once, so call this function again for every new pass.
    '''
    token_stream = get_tokens(infile)
    return (dictname.doc2bow(doc_tokens) for doc_tokens in token_stream)

In [7]:
# Write the doc2bow vectors of TOKENFILE to disk through MmCorpus (.mm file),
# passing terms_dict as the id-to-word mapping.
corpora.MmCorpus.serialize('../models/titleabstract_corpus.mm', 
                           tokens2sparsevecs(TOKENFILE, terms_dict), 
                           id2word=terms_dict)

In [8]:
# Build the TF-IDF model from the title/abstract doc2bow vectors.
# tokens2sparsevecs() is called again because the generator it returns is
# exhausted after a single pass.
tfidf_model = models.TfidfModel(corpus=tokens2sparsevecs(TOKENFILE, terms_dict), 
                                id2word=terms_dict, 
                                normalize=True)

In [9]:
# Save the title/abstract TF-IDF model to disk.
tfidf_model.save('../models/titleabstract.tfidf_model')

In [10]:
# Apply the title/abstract TF-IDF model to a fresh stream of doc2bow vectors.
# NOTE(review): the input is a single-pass generator, so tfidf_sv can presumably
# be iterated only once — confirm before reusing it.
tfidf_sv = tfidf_model[tokens2sparsevecs(TOKENFILE, terms_dict)]

In [11]:
# Write the TF-IDF-transformed title/abstract vectors (tfidf_sv) to disk
# through MmCorpus, passing terms_dict as the id-to-word mapping.
corpora.MmCorpus.serialize('../models/titleabstract_tfidf.mm', 
                           tfidf_sv, 
                           id2word=terms_dict)

**Convert Classification Codes into Vectors**

In [12]:
# Build a separate dictionary from the token lists streamed out of CLASSFILE.
cl_dict = corpora.Dictionary(get_tokens(CLASSFILE))

In [13]:
# Save the classification dictionary to disk.
cl_dict.save('../models/classifications.dict')

In [15]:
# Write the doc2bow vectors of CLASSFILE to disk through MmCorpus (.mm file),
# passing cl_dict as the id-to-word mapping.
corpora.MmCorpus.serialize('../models/classifications_corpus.mm', 
                           tokens2sparsevecs(CLASSFILE, cl_dict), 
                           id2word=cl_dict)

In [16]:
# Build a second TF-IDF model, this one from the classification doc2bow vectors
# and cl_dict. The vectors are re-streamed from CLASSFILE because the generator
# returned by tokens2sparsevecs() is exhausted after a single pass.
tfidf_model_cl = models.TfidfModel(corpus=tokens2sparsevecs(CLASSFILE, cl_dict), 
                                id2word=cl_dict, 
                                normalize=True)

In [17]:
# Save the classification TF-IDF model to disk.
tfidf_model_cl.save('../models/classifications.tfidf_model')

In [18]:
# Transform the classification vectors with the model built from them
# (tfidf_model_cl). tfidf_model was built from the title/abstract corpus with
# terms_dict, so its term ids and weights do not correspond to cl_dict ids.
tfidf_sv_cl = tfidf_model_cl[tokens2sparsevecs(CLASSFILE, cl_dict)]

In [19]:
# Write the TF-IDF-transformed classification vectors (tfidf_sv_cl) to disk
# through MmCorpus, passing cl_dict as the id-to-word mapping.
corpora.MmCorpus.serialize('../models/classifications_tfidf.mm', 
                           tfidf_sv_cl, 
                           id2word=cl_dict)