# Simple NLP procedures 

These procedures include:
1. Text files parsing.
2. Text files cleaning.
3. Text lemmatization.

In [20]:
import re
import os
import codecs
import spacy
import pandas as pd
import itertools as it

import settings

In [3]:
# Load the spaCy English language model into `nlp`. Loading takes a while.
nlp = spacy.load('en')

In [40]:
# Take at most the first 500 lines of the raw book as a small sample to experiment on
test_file = os.path.join(settings.DATA_PATH, 'raw_book1.txt')

with codecs.open(test_file, 'r', encoding='utf-8') as f:
    # islice stops after 500 lines; the lines are glued together with spaces
    sample_test = ' '.join(it.islice(f, 500))

# Collapse every run of newlines, spaces and tabs into a single space
sample_test = re.sub('[\n \t]+', ' ', sample_test)
# Remove each form feed character that is directly followed by the digit 2
# NOTE(review): presumably a page break plus page number - confirm against the raw file
sample_test = re.sub('\x0c2', '', sample_test)

#print(sample_test)

In [42]:
# Run the spaCy pipeline on the sample text; the cells below read their
# tokens, sentences and entities from the resulting `parsed_text`

parsed_text = nlp(sample_test)

In [43]:
# Print every detected sentence with a 1-based running number
# (switched off; flip the condition to True to see the output)
if False:
    for num, sentence in enumerate(parsed_text.sents, start=1):
        print('<Sentence {}>:'.format(num))
        print(sentence)
        print('')

In [45]:
# Print every detected named entity with a 1-based running number and its label
# (switched off; flip the condition to True to see the output)
if False:
    for num, entity in enumerate(parsed_text.ents, start=1):
        print('<Entity {}>:'.format(num), entity, '-', entity.label_)
        print('')

In [55]:
# POS tagging: collect the text (orth_) and the part-of-speech tag (pos_)
# of every token, as two parallel lists in document order
token_text = [token.orth_ for token in parsed_text]
token_pos = [token.pos_ for token in parsed_text]

In [65]:
# Show tokens and their POS tags side by side as a table
pd.DataFrame(
    data=list(zip(token_text, token_pos)),
    columns=['token', 'pos'],
)

Unnamed: 0,token,pos
0,Chapter,NOUN
1,1,NUM
2,Introduction,NOUN
3,to,ADP
4,Deal‐Making,VERB
5,Deal‐Making,VERB
6,in,ADP
7,Practice,PROPN
8,,SPACE
9,Looking,VERB


In [68]:
# Normalization: record the lemma and the orthographic shape of each token
token_lemma = [tok.lemma_ for tok in parsed_text]
token_shape = [tok.shape_ for tok in parsed_text]

# Tabulate text, lemma and shape for comparison
pd.DataFrame(
    data=list(zip(token_text, token_lemma, token_shape)),
    columns=['text', 'lemma', 'shape'],
)

Unnamed: 0,text,lemma,shape
0,Chapter,chapter,Xxxxx
1,1,1,d
2,Introduction,introduction,Xxxxx
3,to,to,xx
4,Deal‐Making,deal‐mak,Xxxx‐Xxxxx
5,Deal‐Making,deal‐mak,Xxxx‐Xxxxx
6,in,in,xx
7,Practice,practice,Xxxxx
8,,,
9,Looking,look,Xxxxx


In [67]:
# Token-level entity analysis: the entity type of each token and its
# inside/outside/begin marker
token_entity_type = [tok.ent_type_ for tok in parsed_text]
token_entity_iob = [tok.ent_iob_ for tok in parsed_text]

pd.DataFrame(
    data=list(zip(token_text, token_entity_type, token_entity_iob)),
    columns=['token_text', 'entity_type', 'inside_outside_begin'],
)

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,Chapter,LAW,B
1,1,LAW,I
2,Introduction,,O
3,to,,O
4,Deal‐Making,,O
5,Deal‐Making,,O
6,in,,O
7,Practice,,O
8,,,O
9,Looking,,O


In [71]:
# Common per-token statistics: text, log probability and a set of boolean flags
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_text]

# Names of the boolean flag columns, in the same order as the
# is_stop ... is_oov attributes collected above
flag_columns = ['stop?',
                'punctuation?',
                'whitespace?',
                'number?',
                'out of vocab.?']

# Convert to pandas' DataFrame
attributes_df = pd.DataFrame(token_attributes,
                             columns=['text', 'log_probability'] + flag_columns)

# Render the flags as 'Yes' / '' so the table is easier to scan.
# Each column is replaced as a whole instead of being written through
# `.loc[...] = ...applymap(...)`: `applymap` is deprecated in recent pandas,
# and writing strings into boolean columns in place relies on a dtype
# upcast that recent pandas deprecates as well.
for flag_column in flag_columns:
    attributes_df[flag_column] = ['Yes' if flag else ''
                                  for flag in attributes_df[flag_column]]

# Show attributes
attributes_df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,Chapter,-12.687860,,,,,
1,1,-7.639833,,,,Yes,
2,Introduction,-13.933037,,,,,
3,to,-3.856022,Yes,,,,
4,Deal‐Making,-19.502029,,,,,Yes
5,Deal‐Making,-19.502029,,,,,Yes
6,in,-4.619072,Yes,,,,
7,Practice,-12.490758,,,,,
8,,-17.022211,,,Yes,,
9,Looking,-10.617382,,,,,


### High level text processing
Collocation and phrase detection
It is very important to split the text into sentences first, because collocations and phrases do not cross sentence boundaries.

In [100]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

def _data_file(name):
    """Return the path of `name` inside settings.DATA_PATH."""
    return os.path.join(settings.DATA_PATH, name)


# Files produced by the phrase-detection steps below
unigram_text_filepath = _data_file('unigram_text.txt')
bigram_model_filepath = _data_file('bigram_model')
bigram_text_filepath = _data_file('bigram_text.txt')
trigram_model_filepath = _data_file('trigram_model')
trigram_text_filepath = _data_file('trigram_text.txt')

# Final output: the fully normalized text
normalized_text_filepath = _data_file('normalized_text.txt')

In [101]:
# Helper functions for processing
def is_punct(token):
    """
    Tell whether a token is punctuation or whitespace.

    Returns the token's `is_punct` flag when it is truthy,
    otherwise the token's `is_space` flag.
    """
    punct_flag = token.is_punct
    return punct_flag if punct_flag else token.is_space

def line_generator(filename):
    """
    Yield the lines of a UTF-8 text file one by one.

    Every literal backslash-n sequence inside a line is turned into
    a real newline character before the line is yielded.
    """
    with codecs.open(filename, encoding='utf_8') as handle:
        unescaped_lines = (raw.replace('\\n', '\n') for raw in handle)
        for line in unescaped_lines:
            yield line

def text_lemmatizer(filename):
    """
    Lemmatize a text file and yield it sentence by sentence.

    The lines of the file are streamed through the spaCy pipeline in batches.
    Each yielded string holds the lemmas of one sentence joined by single
    spaces, with punctuation and whitespace tokens left out.
    """
    docs = nlp.pipe(line_generator(filename), batch_size=10000, n_threads=4)
    for doc in docs:
        for sentence in doc.sents:
            lemmas = [tok.lemma_ for tok in sentence if not is_punct(tok)]
            yield ' '.join(lemmas)

In [102]:
# Lemmatize the raw file and store the result as unigram text,
# one sentence per line.
if True:
    unigram_sentences = text_lemmatizer(test_file)
    with codecs.open(unigram_text_filepath, 'w', encoding='utf_8') as f:
        for sentence in unigram_sentences:
            f.write(sentence + '\n')

In [103]:
# Wrap the unigram text file in a LineSentence so the cells below can
# iterate over its sentences
unigram_text = LineSentence(unigram_text_filepath)

In [104]:
# Build the bigram phrase model from the unigram sentences and save it to disk
if True:
    bigram_model = Phrases(unigram_text)
    bigram_model.save(bigram_model_filepath)

In [105]:
# Load the bigram model back from disk, so an already saved model can be
# reused without rebuilding it
bigram_model = Phrases.load(bigram_model_filepath)

In [106]:
# Apply the bigram model to every unigram sentence and store the result,
# one sentence per line.
if True:
    with codecs.open(bigram_text_filepath, 'w', encoding='utf_8') as f:
        for unigram_sentence in unigram_text:
            bigram_tokens = bigram_model[unigram_sentence]
            f.write(' '.join(bigram_tokens) + '\n')

In [107]:
# Wrap the bigram text file in a LineSentence so the cells below can
# iterate over its sentences
bigram_text = LineSentence(bigram_text_filepath)

In [108]:
# Build the trigram phrase model from the bigram sentences and save it to disk
if True:
    trigram_model = Phrases(bigram_text)
    trigram_model.save(trigram_model_filepath)

In [109]:
# Load the trigram model back from disk, so an already saved model can be
# reused without rebuilding it
trigram_model = Phrases.load(trigram_model_filepath)

In [110]:
# Apply the trigram model to every bigram sentence and store the result,
# one sentence per line.
if True:
    with codecs.open(trigram_text_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_text:
            trigram_tokens = trigram_model[bigram_sentence]
            f.write(u' '.join(trigram_tokens) + '\n')

In [111]:
# Wrap the trigram text file in a LineSentence so its sentences can be iterated
trigram_text = LineSentence(trigram_text_filepath)

In [113]:
# Generating normalized text: every line of the raw file is lemmatized,
# run through both phrase models, stripped of stop words and written out.
#
# The loop uses its own variable names on purpose. Reusing `parsed_text`,
# `unigram_text`, `bigram_text` and `trigram_text` here would overwrite the
# notebook-level objects of the same name (the parsed sample and the three
# sentence streams) and break earlier cells when they are re-run.
if True:
    # Look the stop-word collection up once instead of once per term
    stopwords = spacy.en.STOPWORDS

    with codecs.open(normalized_text_filepath, 'w', encoding='utf_8') as f:
        for parsed_line in nlp.pipe(line_generator(test_file),
                                    batch_size=10000, n_threads=4):

            # lemmatize the text, removing punctuation and whitespace
            line_unigrams = [token.lemma_ for token in parsed_line
                             if not is_punct(token)]

            # apply the first-order and second-order phrase models
            line_bigrams = bigram_model[line_unigrams]
            line_trigrams = trigram_model[line_bigrams]

            # remove any remaining stopwords
            normalized_terms = [term for term in line_trigrams
                                if term not in stopwords]

            # write the transformed line as a line in the new file
            f.write(u' '.join(normalized_terms) + '\n')