In [1]:
from typing import Iterator, List
import gensim
from gensim import corpora, models
import smart_open

In [22]:
# String Constants for file path names
TOKENFILE = 's3://pto-us-data/text-data/titleabstract_tokens.txt'
TERMDICTFILE = 's3://pto-us-data/models/titleabstract.dict'
TERMCORPUSFILE = 's3://pto-us-data/models/titleabstract_corpus.mm'
TERMTFIDFMODELFILE = 's3://pto-us-data/models/titleabstract.tfidf_model'
TERMTFIDFFILE = 's3://pto-us-data/models/titleabstract_tfidf.mm'
CLASSFILE = '../intermediate/classifications_ipc.txt'
CLASSDICTFILE = '../models/classifications.dict'
CLASSCORPUSFILE = '../models/classifications_corpus.mm'
CLASSTFIDFMODELFILE = '../models/classifications.tfidf_model'
CLASSTFIDFFILE = '../models/classifications_tfidf.mm'

In [3]:
def get_tokens(infile: str) -> Iterator[List[str]]:
    '''Yield tokens split on whitespace from each newline terminated string(doc) in file (with newline stripped)
    Parameters
        infile: full path string to file containing strings
    Returns
        each string with newline stripped
    '''
    with smart_open.smart_open(infile) as inf:
        for line in inf:
            yield line.decode('utf-8').strip().split()

In [4]:
def tokens2sparsevecs(infile: str, dictname: gensim.corpora.dictionary.Dictionary):
    return (dictname.doc2bow(tokens) for tokens in get_tokens(infile))

In [5]:
def save_corpus(filename: str, vecs, dictname: gensim.corpora.dictionary.Dictionary) -> None:
    '''Save a corpus of vectors in Matrix Market format
    Parameters
        filename: full path to file to save
        vecs: corpus to save
        dictname: gensim dictionary with id, word pairs
    '''
    corpora.MmCorpus.serialize(filename, vecs, id2word=dictname)

In [6]:
terms_dict = corpora.Dictionary(get_tokens(TOKENFILE))

In [13]:
terms_dict.save(TERMDICTFILE)

In [15]:
save_corpus(TERMCORPUSFILE, 
            tokens2sparsevecs(TOKENFILE, terms_dict), 
            terms_dict)

NotImplementedError: mode 'wb+' not implemented for S3

In [16]:
tfidf_model = models.TfidfModel(corpus=tokens2sparsevecs(TOKENFILE, terms_dict), 
                                id2word=terms_dict, 
                                normalize=True)

In [19]:
tfidf_model.save(TERMTFIDFMODELFILE)

In [20]:
tfidf_sv = tfidf_model[tokens2sparsevecs(TOKENFILE, terms_dict)]

In [23]:
save_corpus(TERMTFIDFFILE, 
            tfidf_sv, 
            terms_dict)

NotImplementedError: mode 'wb+' not implemented for S3

**Convert Classification Codes into Vectors**

In [13]:
cl_dict = corpora.Dictionary(get_tokens(CLASSFILE))

In [14]:
cl_dict.save(CLASSDICTFILE)

In [15]:
save_corpus(CLASSCORPUSFILE, 
            tokens2sparsevecs(CLASSFILE, cl_dict),
            cl_dict)

In [16]:
tfidf_model_cl = models.TfidfModel(corpus=tokens2sparsevecs(CLASSFILE, cl_dict), 
                                   id2word=cl_dict, 
                                   normalize=True)

In [17]:
tfidf_model_cl.save(CLASSTFIDFMODELFILE)

In [18]:
tfidf_sv_cl = tfidf_model[tokens2sparsevecs(CLASSFILE, cl_dict)]

In [19]:
save_corpus(CLASSTFIDFFILE,
            tfidf_sv_cl,
            cl_dict)