## All Libraries

In [10]:
import numpy as np 
import spacy as sp

## Load Spacy

In [13]:
nlp = sp.load('en_core_web_sm')

In [14]:
document = "An atom is the smallest unit of ordinary matter that forms a chemical element.[1] Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms"
print(type(document), document)

<class 'str'> An atom is the smallest unit of ordinary matter that forms a chemical element.[1] Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms


In [15]:
document = nlp(document)
print(type(document), document)

<class 'spacy.tokens.doc.Doc'> An atom is the smallest unit of ordinary matter that forms a chemical element.[1] Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms


In [16]:
for i in document:
    print(i)

An
atom
is
the
smallest
unit
of
ordinary
matter
that
forms
a
chemical
element.[1
]
Every
solid
,
liquid
,
gas
,
and
plasma
is
composed
of
neutral
or
ionized
atoms


In [17]:
for i in document.sents:
    print(i)

An atom is the smallest unit of ordinary matter that forms a chemical element.[1]
Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms


In [18]:
print(dir(document))

['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '_bulk_merge', '_context', '_get_array_attrs', '_realloc', '_vector', '_vector_norm', 'cats', 'char_span', 'copy', 'count_by', 'doc', 'ents', 'extend_tensor', 'from_array', 'from_bytes', 'from_dict', 'from_disk', 'from_docs', 'get_extension', 'get_lca_matrix', 'has_annotation', 'has_extension', 'has_unknown_spaces', 'has_vector', 'is_nered', 'is_parsed', 'is_sentenced', 'is_tagged', 'lang', 'lang_', 'mem', 'noun_chunks', 'noun_chunks_iterator', 'remove_extension', 'retokenize', 'sentiment', 'sents', 'set_ents', 'set_extension', 'similarity', 'spans', 'tensor', 'text', 'text_with_ws', 

In [19]:
for i in document.sents:
    print(i)

An atom is the smallest unit of ordinary matter that forms a chemical element.[1]
Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms


## Stop Words

In [20]:
stop_word = sp.lang.en.STOP_WORDS
print(list(stop_word)[:10])

spacy_stopwords = sp.lang.en.stop_words.STOP_WORDS
print(list(spacy_stopwords)[:10])

['all', 'thereafter', 'you', 'against', 'fifteen', 'enough', 'their', 'it', '’s', 'forty']
['all', 'thereafter', 'you', 'against', 'fifteen', 'enough', 'their', 'it', '’s', 'forty']


In [22]:
document = "An atom is the smallest unit of ordinary matter that forms a chemical element. Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms"
print(type(document))

document = nlp(document)
print(type(document))

<class 'str'>
<class 'spacy.tokens.doc.Doc'>


In [23]:
document_list = [token.text for token in document if not token.is_stop]
print(document_list)

['atom', 'smallest', 'unit', 'ordinary', 'matter', 'forms', 'chemical', 'element', '.', 'solid', ',', 'liquid', ',', 'gas', ',', 'plasma', 'composed', 'neutral', 'ionized', 'atoms']


## Regular Expression

## `1. re.findall(...)`

In [24]:
import re
text = " This is a string that contains some 131231 random 9449 number in 83782 it."

numbers = re.findall('\d+', text)
print(numbers)

['131231', '9449', '83782']


In [25]:
text = """
There are 3
Numbers in this (2 remaining)
text. Still 1 is missing.
"""

pattern = r'\d+'
re.findall(pattern, text, re.MULTILINE)

['3', '2', '1']

## `2. re.search(...)`

In [26]:
pattern = '\d+'
text = " This is a string that contains some 131231 random 9449 number in 83782 it."
numbers = re.findall(pattern, text)
print(numbers)

pattern = 'contaiN'
if re.search(pattern, text, re.IGNORECASE):
    print('Matched')
else:
    print('Not matched')

['131231', '9449', '83782']
Matched


In [27]:
text = " This is a string that contains some 131231 random 9449 number in 83782 it."
print(text.find('1'), text.find('1 '), text.find('it'), len(text))

pattern = "\d+"

match = re.search(pattern, text)
print(dir(match))

37 42 72 75
['__class__', '__copy__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'end', 'endpos', 'expand', 'group', 'groupdict', 'groups', 'lastgroup', 'lastindex', 'pos', 're', 'regs', 'span', 'start', 'string']


## `3. Examples`

In [30]:
text = "This is a string that contains some 131231 random 9449 number in 83782 it."

print('match.end         =', match.end())
print('match.endpos      =', match.endpos)
print('match.group       =', match.group())
print('match.groupdict   =', match.groupdict())
print('match.groups      =', match.groups())
print('match.lastgroup   =', match.lastgroup)
print('match.lastindex   =', match.lastindex)
print('match.pos         =', match.pos)
print('match.re          =', match.re)
print('match.regs        =', match.regs)
print('match.span        =', match.span())
print('match.start       =', match.start())
print('match.string      =', match.string)

match.end         = 43
match.endpos      = 75
match.group       = 131231
match.groupdict   = {}
match.groups      = ()
match.lastgroup   = None
match.lastindex   = None
match.pos         = 0
match.re          = re.compile('\\d+')
match.regs        = ((37, 43),)
match.span        = (37, 43)
match.start       = 37
match.string      =  This is a string that contains some 131231 random 9449 number in 83782 it.


In [31]:
start, end = match.span()
char = text[start:end]
print(char)

31231 


## `4. re.match(...)`

In [32]:
pattern = 'This'
text1  = 'this is an example'
text2  = 'and this is an example'

if re.match(pattern, text1, re.IGNORECASE):
    print('pattern is found at', match.start(), 'position')
else:
    print('pattern not found')

if re.match(pattern, text2):
    print('pattern is found at', match.start(), 'position')
else:
    print('pattern not found')

pattern is found at 37 position
pattern not found


## `5. re.split(...)`

In [33]:
text = "This is a sentence. This is another sentence. End"
print(text.split("."))

pattern = '\.'
re.split(pattern, text)

['This is a sentence', ' This is another sentence', ' End']


['This is a sentence', ' This is another sentence', ' End']

In [34]:
text = """This is a sentence \n
        -------------------- \n
        This is another sentence. \n
        ------------------------- \n
        End"""

pattern = "\s*-+\s*|\.*\s*\n\W*" 

re.split(pattern, text)

['This is a sentence', 'This is another sentence', 'End']

## How to Preprocess TEXT

In [2]:
def preprocess(path):
     with open(path, 'r') as f:
          lines = f.readlines()
          for i in range(len(lines)):
               lines[i] = lines[i].replace('.', '').replace(',', '').replace(':', '').replace(';', '').replace('?', '').replace('!', '').replace('-', '').replace('_', '').replace('\'', '').replace('\"', '').lower()
          with open('./data/article_processed.txt', 'w') as p:
               p.writelines(lines)
     return 

In [3]:
def get_sentences(path):
     f = open(path, 'r')
     lines = f.readlines()
     sentences = [line.split() for line in lines]
     f.close()
     return sentences 

In [4]:
def clean_sentences(sentences):
     i = 0 
     while i <  len(sentences):
          if sentences[i] == []:
               sentences.pop(i)
          else: 
               i += 1 
     return sentences 

In [5]:
sentences = get_sentences('data/article_processed.txt')
cleaned_sents = clean_sentences(sentences)

In [6]:
def get_dictionary(sentences):
    vocab = []
    for sentence in sentences: 
        for token in sentence:
            if token not in vocab:
                vocab.append(token)
    word2idx = {word : idx  for (idx, word) in enumerate(vocab)}
    idx2word = {idx  : word for (idx, word) in enumerate(vocab)}

    return word2idx, idx2word, len(vocab)

In [7]:
word2idx, idx2word, len_vocab = get_dictionary(sentences)

In [8]:
def get_pairs(sentences, word2idx, r):
    pairs = []
    for sentence in sentences: 
        tokens = [word2idx[word] for word in sentence]

        for center in range(len(tokens)):
            for context in range(-r, r+1):
                context_word = center + context

                if context_word < 0 or context_word >= len(tokens) or context_word == center:
                        continue 
                else: 
                        pairs.append( (tokens[center], tokens[context_word]) )
    return np.array(pairs)

In [9]:
def get_dataset():
    sentences = get_sentences('data/article_processed.txt')
    clean_sents = clean_sentences(sentences)
    word2idx, idx2word, len_vocab = get_dictionary(clean_sents)
    pairs = get_pairs(clean_sents, word2idx, 4)

    return pairs, len_vocab

print(get_dataset())

(array([[  0,   1],
       [  0,   2],
       [  0,   3],
       ...,
       [335, 333],
       [335, 334],
       [335,  97]]), 336)
