In [1]:
#Run this cell once 

import sys
import os
import codecs
import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

# Location of the project checkout. Its directory is appended to sys.path so
# that the project-local `src` package imported right below can be resolved.
PROJECT_ROOT = '/home/astra/work/projects/lamedoc_new/project/'
# normpath() strips the trailing slash and keeps pointing at the project
# directory itself. dirname() only did so because of the trailing slash; without
# it, dirname() would silently return the parent directory instead.
_project_dir = os.path.normpath(PROJECT_ROOT)
if _project_dir not in sys.path:
    sys.path.append(_project_dir)

from src.utils.files_paths import init_paths

# Load language model. Takes time.
# `nlp` is the shared spaCy pipeline: text_lemmatizer() below and the
# normalized-text cell both stream documents through nlp.pipe().
nlp = spacy.load('en')

# Helper functions for processing
def is_punct(token):
    """
    Tells whether a token counts as punctuation or whitespace.

    token: any object exposing `is_punct` and `is_space` attributes.
    Returns `token.is_punct` when it is truthy, otherwise `token.is_space`.
    """
    punct_flag = token.is_punct
    return punct_flag if punct_flag else token.is_space

def line_generator(filename):
    """
    Streams a UTF-8 encoded text file one line at a time.

    Every literal two-character sequence "backslash followed by n" found in a
    line is turned into a real newline character before the line is yielded.
    The line's own terminating newline is kept.
    """
    with codecs.open(filename, encoding='utf_8') as source:
        for raw_line in source:
            unescaped = raw_line.replace('\\n', '\n')
            yield unescaped

def text_lemmatizer(filename):
    """
    Generator over the lemmatized sentences of a text file.

    The lines produced by line_generator(filename) are streamed through
    nlp.pipe(). For each sentence of each parsed chunk, the lemmas of the
    tokens not rejected by is_punct() are joined with single spaces and yielded.
    """
    parsed_chunks = nlp.pipe(line_generator(filename), batch_size=10000, n_threads=4)
    for chunk in parsed_chunks:
        for sentence in chunk.sents:
            lemmas = (tok.lemma_ for tok in sentence if not is_punct(tok))
            yield ' '.join(lemmas)



In [2]:
# Define raw text file
from settings import settings
RAW_TEXT_FILE_NAME = 'sec_wiki_books_requests.txt'
raw_text_file = os.path.join(settings.RAW_DATA_PATH, RAW_TEXT_FILE_NAME)

# Define ngrams paths.
# init_paths() is called once and its result is indexed for every artefact,
# instead of re-invoking it for each key.
ngram_paths = init_paths(RAW_TEXT_FILE_NAME)
unigram_text_filepath = ngram_paths['unigram_text_filepath']
bigram_model_filepath = ngram_paths['bigram_model_filepath']
bigram_text_filepath = ngram_paths['bigram_text_filepath']
trigram_model_filepath = ngram_paths['trigram_model_filepath']
trigram_text_filepath = ngram_paths['trigram_text_filepath']
normalized_text_filepath = ngram_paths['normalized_text_filepath']

In [3]:
# Lemmatize the raw text file and store the result as unigram text,
# one sentence per line.
if True:  # constant guard: set to False to skip this step
    with codecs.open(unigram_text_filepath, 'w', encoding='utf_8') as unigram_out:
        for lemmatized_sentence in text_lemmatizer(raw_text_file):
            line = lemmatized_sentence + '\n'
            unigram_out.write(line)

In [4]:
# Wrap the unigram text file (one lemmatized sentence per line) in a gensim
# LineSentence; later cells pass it to Phrases and iterate over it.
unigram_text = LineSentence(unigram_text_filepath)

In [5]:
# Generating bigram model
# Constant guard: set to False to skip rebuilding the model.
if True:
    # Build a gensim Phrases model from the unigram sentences ...
    bigram_model = Phrases(unigram_text)
    # ... and persist it at bigram_model_filepath.
    bigram_model.save(bigram_model_filepath)

In [6]:
from gensim.models.phrases import Phraser
# Load bigram model.
# The file on disk was written by Phrases.save(), so it is read back with
# Phrases.load() and then frozen into a Phraser. Phraser.load() on that file
# only unpickles what was saved, i.e. it would return the full Phrases object
# rather than a Phraser.
bigram_model = Phraser(Phrases.load(bigram_model_filepath))

In [7]:
# Apply the bigram model to every unigram sentence and store the result,
# one sentence per line.
if True:  # constant guard: set to False to skip this step
    with codecs.open(bigram_text_filepath, 'w', encoding='utf_8') as bigram_out:
        for sentence_tokens in unigram_text:
            # tokens returned by the model are re-joined with single spaces
            phrased_tokens = bigram_model[sentence_tokens]
            bigram_out.write(' '.join(phrased_tokens) + '\n')



In [8]:
# Wrap the bigram text file (one phrase-merged sentence per line) in a gensim
# LineSentence; later cells pass it to Phrases and iterate over it.
bigram_text = LineSentence(bigram_text_filepath)

In [9]:
# Generating trigram.
# Constant guard: set to False to skip rebuilding the model.
if True:
    # Build a second gensim Phrases model, this time from the bigram sentences ...
    trigram_model = Phrases(bigram_text)
    # ... and persist it at trigram_model_filepath.
    trigram_model.save(trigram_model_filepath)

In [10]:
# Load trigram model.
# The file on disk was written by Phrases.save(), so it is read back with
# Phrases.load() and then frozen into a Phraser. Phraser.load() on that file
# only unpickles what was saved, i.e. it would return the full Phrases object
# rather than a Phraser.
trigram_model = Phraser(Phrases.load(trigram_model_filepath))

In [11]:
# Apply the trigram model to every bigram sentence and store the result,
# one sentence per line.
if True:  # constant guard: set to False to skip this step
    with codecs.open(trigram_text_filepath, 'w', encoding='utf_8') as trigram_out:
        for sentence_tokens in bigram_text:
            # tokens returned by the model are re-joined with single spaces
            phrased_tokens = trigram_model[sentence_tokens]
            trigram_out.write(' '.join(phrased_tokens) + '\n')



In [12]:
# Wrap the trigram text file (one phrase-merged sentence per line) in a gensim
# LineSentence.
trigram_text = LineSentence(trigram_text_filepath)

In [13]:
# Generating normalized text: every line of the raw file is lemmatized, run
# through the bigram and trigram models, stripped of stop words and written as
# one line of normalized_text_filepath.
if True:  # constant guard: set to False to skip this step
    # Loop-invariant: resolve the stop-word collection once, not per document.
    stop_words = spacy.en.language_data.STOP_WORDS

    with codecs.open(normalized_text_filepath, 'w', encoding='utf_8') as f:
        for parsed_text in nlp.pipe(line_generator(raw_text_file),
                                    batch_size=10000, n_threads=4):

            # Per-document names are used on purpose: the notebook-level
            # unigram_text / bigram_text / trigram_text hold the LineSentence
            # corpora and must not be overwritten here, otherwise re-running
            # the earlier model/text cells would operate on the wrong objects.

            # lemmatize the text, removing punctuation and whitespace
            doc_unigrams = [token.lemma_ for token in parsed_text
                            if not is_punct(token)]

            # apply the first-order and second-order phrase models
            doc_bigrams = bigram_model[doc_unigrams]
            doc_trigrams = trigram_model[doc_bigrams]

            # remove any remaining stopwords
            doc_terms = [term for term in doc_trigrams
                         if term not in stop_words]

            # write the transformed document as a line in the new file
            f.write(' '.join(doc_terms) + '\n')

