# Simple NLP procedures 

These procedures include:
1. Text files parsing.
2. Text files cleaning.
3. Text lemmatization.

In [1]:
import re
import os
import codecs
import spacy
import pandas as pd
import itertools as it

import settings

In [2]:
# Load the spaCy English pipeline (the 'en' model). Loading takes a while,
# so this is done once and the resulting `nlp` object is reused below.
nlp = spacy.load('en')

In [4]:
# Define the raw text file: its base name and its full path inside
# the directory given by settings.RAW_DATA_PATH.
RAW_TEXT_FILE_NAME = 'all_clean_one_line.txt'
raw_text_file = os.path.join(settings.RAW_DATA_PATH, RAW_TEXT_FILE_NAME)

In [None]:
# Load a sample of the raw text file: at most its first 500 lines.
with codecs.open(raw_text_file, 'r', encoding='utf-8') as f:
    sample_lines = list(it.islice(f, 500))

# Clean the sample: glue the lines together with spaces, squeeze every run
# of newlines / spaces / tabs into a single space, then remove each
# occurrence of a form feed character followed by '2'.
sample_test = ' '.join(sample_lines)
sample_test = re.sub('[\n \t]+', ' ', sample_test)
sample_test = re.sub('\x0c2', '', sample_test)

#print(sample_test)

In [None]:
# Parse the cleaned sample with the spaCy pipeline; the resulting
# `parsed_text` is what the following cells inspect.

parsed_text = nlp(sample_test)

In [None]:
# Sentence detection: print every sentence with its 1-based number.
# The cell is switched off; set the condition to True to run it.
if False:
    for position, sent in enumerate(parsed_text.sents, 1):
        print('<Sentence {}>:'.format(position))
        print(sent)
        print('')

In [None]:
# Entity detection: print every entity with its 1-based number and label.
# The cell is switched off; set the condition to True to run it.
if False:
    for position, ent in enumerate(parsed_text.ents, 1):
        header = '<Entity {}>:'.format(position)
        print(header, ent, '-', ent.label_)
        print('')

In [None]:
# POS tagging: per token, keep its original text (orth_) and its
# part-of-speech tag (pos_) in two parallel lists.
token_text = [tok.orth_ for tok in parsed_text]
token_pos = [tok.pos_ for tok in parsed_text]

In [None]:
# Show the tokens next to their POS tags as a two-column data frame
pd.DataFrame({'token': token_text, 'pos': token_pos},
             columns=['token', 'pos'])

In [None]:
# Normalization: per token, collect its lemma (lemma_) and shape (shape_)
token_lemma = [tok.lemma_ for tok in parsed_text]
token_shape = [tok.shape_ for tok in parsed_text]

# Table of original text, lemma and shape, one row per token
pd.DataFrame({'text': token_text, 'lemma': token_lemma, 'shape': token_shape},
             columns=['text', 'lemma', 'shape'])

In [None]:
# Token-level entity analysis: per token, its entity type (ent_type_)
# and its inside/outside/begin marker (ent_iob_)
token_entity_type = [tok.ent_type_ for tok in parsed_text]
token_entity_iob = [tok.ent_iob_ for tok in parsed_text]

# One row per token: text, entity type, IOB marker
pd.DataFrame({'token_text': token_text,
              'entity_type': token_entity_type,
              'inside_outside_begin': token_entity_iob},
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

In [None]:
# Common statistics: one tuple of lexical attributes per token
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_text]

# Convert to a pandas DataFrame
attributes_df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

# Render the flag columns as 'Yes' / '' so the table is easier to scan.
# Every column is replaced as a whole with Series.map: this avoids
# DataFrame.applymap (deprecated in recent pandas) and avoids writing
# strings into boolean columns through a .loc slice, which recent pandas
# reports as an incompatible-dtype assignment.
flag_columns = ['stop?', 'punctuation?', 'whitespace?', 'number?', 'out of vocab.?']
for flag_column in flag_columns:
    attributes_df[flag_column] = (attributes_df[flag_column]
                                  .map(lambda flag: 'Yes' if flag else ''))

# Show attributes
attributes_df

### High level text processing
Collocation and phrase detection
It is very important to split the text into sentences first: collocations and phrases do not cross sentence boundaries.

In [5]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

#import settings 

# Locations of the intermediate n-gram artefacts (texts and phrase models),
# all of them inside settings.NGRAMS_DATA_PATH.
(unigram_text_filepath,
 bigram_model_filepath,
 bigram_text_filepath,
 trigram_model_filepath,
 trigram_text_filepath) = (
    os.path.join(settings.NGRAMS_DATA_PATH, basename)
    for basename in ('unigram_text.txt',
                     'bigram_model',
                     'bigram_text.txt',
                     'trigram_model',
                     'trigram_text.txt')
)

# Location of the final normalized text, inside settings.NORMALIZED_DATA_PATH.
normalized_text_filepath = os.path.join(settings.NORMALIZED_DATA_PATH, 'normalized_text.txt')

In [6]:
# Helper functions for processing
def is_punct(token):
    """
    Tells whether the token should be treated as a separator:
    the result is truthy for punctuation tokens and for whitespace tokens
    """
    punctuation_flag = token.is_punct
    if punctuation_flag:
        return punctuation_flag
    # Not punctuation: the answer is whatever the whitespace flag says.
    return token.is_space

def line_generator(filename):
    """
    Lazily reads the UTF-8 file `filename` and yields its lines one by one,
    with every literal backslash-n sequence replaced by a real newline
    """
    escaped_newline, real_newline = '\\n', '\n'
    with codecs.open(filename, encoding='utf_8') as source:
        for raw_line in source:
            yield raw_line.replace(escaped_newline, real_newline)

def text_lemmatizer(filename):
    """
    Generator of lemmatized sentences for the text stored in `filename`.

    The lines of the file are fed through the spaCy pipeline; for every
    detected sentence one string is yielded: the lemmas of its tokens
    joined by single spaces, punctuation and whitespace tokens left out.
    """
    parsed_docs = nlp.pipe(line_generator(filename), batch_size=10000, n_threads=4)
    for doc in parsed_docs:
        for sentence in doc.sents:
            lemmas = [word.lemma_ for word in sentence if not is_punct(word)]
            yield ' '.join(lemmas)

In [None]:
# Lemmatize the raw text file sentence by sentence and store the result
# as the unigram text, one sentence per line.
if True:
    with codecs.open(unigram_text_filepath, 'w', encoding='utf_8') as unigram_file:
        for lemmatized_sentence in text_lemmatizer(raw_text_file):
            unigram_file.write(lemmatized_sentence)
            unigram_file.write('\n')

In [7]:
# Wrap the unigram text file in gensim's LineSentence so that it can be
# iterated sentence by sentence by the cells below.
unigram_text = LineSentence(unigram_text_filepath)

In [None]:
# Generating bigram model: build a gensim Phrases model from the unigram
# sentences and save it to bigram_model_filepath.
if True:
    bigram_model = Phrases(unigram_text)
    bigram_model.save(bigram_model_filepath)

In [8]:
# Load the bigram model from bigram_model_filepath
bigram_model = Phrases.load(bigram_model_filepath)

In [None]:
# Making bigram text: pass every unigram sentence through the bigram model
# and store the result, one sentence per line.
if True:
    with codecs.open(bigram_text_filepath, 'w', encoding='utf_8') as bigram_file:
        for sentence_tokens in unigram_text:
            phrased_tokens = bigram_model[sentence_tokens]
            bigram_file.write(' '.join(phrased_tokens) + '\n')

In [9]:
# Wrap the bigram text file in gensim's LineSentence so that it can be
# iterated sentence by sentence by the cells below.
bigram_text = LineSentence(bigram_text_filepath)

In [None]:
# Generating trigram model: build a gensim Phrases model from the bigram
# sentences and save it to trigram_model_filepath.
if True:
    trigram_model = Phrases(bigram_text)
    trigram_model.save(trigram_model_filepath)

In [10]:
# Load the trigram model from trigram_model_filepath
trigram_model = Phrases.load(trigram_model_filepath)

In [None]:
# Generating trigram text: pass every bigram sentence through the trigram
# model and store the result, one sentence per line.
if True:
    with codecs.open(trigram_text_filepath, 'w', encoding='utf_8') as trigram_file:
        for sentence_tokens in bigram_text:
            phrased_tokens = trigram_model[sentence_tokens]
            trigram_file.write(u' '.join(phrased_tokens) + '\n')

In [11]:
# Wrap the trigram text file in gensim's LineSentence so that it can be
# iterated sentence by sentence.
trigram_text = LineSentence(trigram_text_filepath)

In [None]:
# Generating normalized text: for every line of the raw text file write one
# line made of its lemmas, merged into phrases by the bigram and trigram
# models, with stopwords removed.
#
# The loop-local names are deliberately different from the notebook-level
# parsed_text / unigram_text / bigram_text / trigram_text, so running this
# cell does not overwrite those objects with per-line token lists.
if True:
    with codecs.open(normalized_text_filepath, 'w', encoding='utf_8') as f:
        for parsed_doc in nlp.pipe(line_generator(raw_text_file),
                                   batch_size=20000, n_threads=4):

            # lemmatize the text, removing punctuation and whitespace
            unigram_tokens = [token.lemma_ for token in parsed_doc
                              if not is_punct(token)]

            # apply the first-order and second-order phrase models
            bigram_tokens = bigram_model[unigram_tokens]
            trigram_tokens = trigram_model[bigram_tokens]

            # remove any remaining stopwords
            kept_terms = [term for term in trigram_tokens
                          if term not in spacy.en.STOPWORDS]

            # write the transformed text as a line in the new file
            f.write(' '.join(kept_terms) + '\n')