In [4]:
# Install the notebook's dependencies: Cython (needed by the %%cython cell below) and spaCy.
!pip install cython
!pip install spacy

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/24/de/ac14cd453c98656d6738a5669f96a4ac7f668493d5e6b78227ac933c5fd4/spacy-2.0.12.tar.gz (22.0MB)
[K    100% |████████████████████████████████| 22.0MB 1.5MB/s 
Collecting murmurhash<0.29,>=0.28 (from spacy)
  Downloading https://files.pythonhosted.org/packages/5e/31/c8c1ecafa44db30579c8c457ac7a0f819e8b1dbc3e58308394fff5ff9ba7/murmurhash-0.28.0.tar.gz
Collecting cymem<1.32,>=1.30 (from spacy)
  Downloading https://files.pythonhosted.org/packages/f8/9e/273fbea507de99166c11cd0cb3fde1ac01b5bc724d9a407a2f927ede91a1/cymem-1.31.2.tar.gz
Collecting preshed<2.0.0,>=1.0.0 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/be/fc/09684555ce0ee7086675e6be698e4efeb6d9b315fd5aa96bed347572282b/preshed-1.0.1.tar.gz (112kB)
[K    100% |████████████████████████████████| 122kB 25.7MB/s 
[?25hCollecting thinc<6.11.0,>=6.10.3 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/94/b1/47a88

helllo
## Cython

![alt text](https://i.ytimg.com/vi/mXuEoqK4bEc/maxresdefault.jpg "Logo Title Text 1")


- Python is one of the most convenient, richly outfitted, and useful programming languages.
- Execution speed? Not so much.
- Enter Cython. The Cython language is a superset of Python that compiles to C, yielding performance boosts that can range from a few percent to several orders of magnitude, depending on the task at hand. 
- For work that is bound by Python’s native object types, the speedups won’t be large. 
- But for numerical operations, or any operations not involving Python’s own internals, the gains can be massive. 
- This way, many of Python’s native limitations can be routed around or transcended entirely.

![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/e/e0/Cython_CPython_Ext_Module_Workflow.png/220px-Cython_CPython_Ext_Module_Workflow.png "Logo Title Text 1")

### With Cython, you can skirt many of Python’s native limitations or transcend them entirely—without having to give up Python’s ease and convenience. 

![alt text](https://image.slidesharecdn.com/lez1-171127212726/95/snw-introduction-to-pynq-platform-and-python-language-16-638.jpg?cb=1511818642 "Logo Title Text 1")

- Python code can make calls directly into C modules. 
- These can be either generic C libraries or libraries built specifically to work with Python. 
- Cython generates the second kind of module: C libraries that talk to Python’s internals, and that can be bundled with existing Python code.
- Cython looks like Python
- If we feed the Cython compiler a Python program, it will accept it as-is, but none of Cython’s native accelerations will come into play.
- If we decorate the Python code with type annotations in Cython’s special syntax, Cython will be able to substitute fast C equivalents for slow Python objects.
- A developer can begin with an existing Python application, and speed it up by making spot changes to the code, rather than rewriting the whole application from the ground up.

### Most of the code in a Python application doesn’t need to be performance-optimized, just a few critical pieces (the Pareto principle).




Let's try a simple example:
- We have a big set of rectangles stored as a list of Python objects.
- Let's count how many rectangles have an area larger than a threshold value.


In [0]:
#pseudo random number generator, dimensions of rectangle
from random import random

#OOP 0:) 
class Rectangle:
    """Rectangle described by its width `w` and height `h`."""

    def __init__(self, w, h):
        #store both dimensions as plain instance attributes
        self.w, self.h = w, h

    def area(self):
        """Return the area of the rectangle (width times height)."""
        width, height = self.w, self.h
        return width * height

#count the rectangles whose area is strictly greater than the threshold
def check_rectangles_py(rectangles, threshold):
    """Return how many items in `rectangles` satisfy `area() > threshold`.

    rectangles -- iterable of objects exposing an `area()` method
    threshold  -- value each area is compared against (strict `>`)
    """
    #the generator yields one 1 per rectangle above the threshold; sum() adds them up
    return sum(1 for rect in rectangles if rect.area() > threshold)

def main_rectangles_slow(n_rectangles=10000000, threshold=0.25):
    """Build random rectangles in pure Python and print how many exceed `threshold`.

    n_rectangles -- number of Rectangle objects to create (default: 10 million,
                    the size used for the timing shown in this notebook)
    threshold    -- area cut-off handed to check_rectangles_py (default: 0.25)
    """
    #each rectangle gets a random width and height drawn from [0.0, 1.0)
    rectangles = [Rectangle(random(), random()) for _ in range(n_rectangles)]
    #count the rectangles whose area is above the threshold and report the result
    n_out = check_rectangles_py(rectangles, threshold=threshold)
    print(n_out)

In [6]:
%%time
# Time the pure-Python version (10 million Rectangle objects):
main_rectangles_slow()

4032810
CPU times: user 16.9 s, sys: 1.6 s, total: 18.5 s
Wall time: 18.5 s


In [0]:
# Load the Cython IPython extension so that the %%cython cell magic is available
%load_ext Cython

In [0]:
%%cython
#memory management helper for Cython
from cymem.cymem cimport Pool
#good ol python
from random import random

#The cdef statement declares C-level variables, types, and functions;
#here it defines a plain C struct instead of a Python class
cdef struct Rectangle:
    #two C float fields: width and height
    float w
    float h

#"Rectangle*" is a pointer type: it holds the address of the first struct of a C array,
#so the rectangles are read in place instead of being copied into the function
cdef int check_rectangles_cy(Rectangle* rectangles, int n_rectangles, float threshold):
    cdef int count = 0
    cdef int idx
    # a raw pointer carries no length, so the element count is passed in separately
    for idx in range(n_rectangles):
        if rectangles[idx].w * rectangles[idx].h > threshold:
            count += 1
    return count

#Python manages memory automatically instead of leaving it to the developer,
#which means developers can freely create objects
#and Python's memory manager will reclaim any
#objects that are no longer referenced by the program.
#This bookkeeping adds runtime overhead (slower),
#so for a hot loop over plain data, manual memory management can be much faster
def main_rectangles_fast(int n_rectangles=10000000, float threshold=0.25):
    """Fill a C array with random rectangles and print how many exceed `threshold`.

    n_rectangles -- number of Rectangle structs to allocate (default: 10 million)
    threshold    -- area cut-off passed to check_rectangles_cy (default: 0.25)
    """
    #declare the loop index and the result as C ints explicitly
    #rather than relying on Cython's type inference
    cdef int i
    cdef int n_out
    #The Pool object will save memory addresses internally
    #then free them when the object is garbage collected
    cdef Pool mem = Pool()
    #one block holding n_rectangles structs, viewed through a Rectangle pointer
    cdef Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
    for i in range(n_rectangles):
        #random() returns a Python float; it is converted to a C float on assignment
        rectangles[i].w = random()
        rectangles[i].h = random()
    n_out = check_rectangles_cy(rectangles, n_rectangles, threshold)
    print(n_out)

In [20]:
%%time
# Time the Cython version on the same workload (10 million rectangles):
main_rectangles_fast()

4037437
CPU times: user 776 ms, sys: 26.7 ms, total: 803 ms
Wall time: 800 ms


![alt text](https://qph.fs.quoracdn.net/main-qimg-0a2f519557dead772f74a02aa267f431 "Logo Title Text 1")

- spaCy is a Python library that excels at large-scale information extraction tasks. 
- It's written from the ground up in carefully memory-managed Cython.
- Independent research has confirmed that spaCy is the fastest python NLP library in the world. 
- All strings in spaCy are stored in a single data structure called the StringStore.
- That's where they are indexed by 64-bit C-level hashes.
- StringStore implements a lookup between Python unicode strings and 64-bit hashes.
- e.g. `nlp.vocab.strings`, `doc.vocab.strings`, etc.
- Need fast NLP? SpaCy will use the C level hashes instead of strings


![alt text](https://spacy.io/assets/img/architecture.svg "Logo Title Text 1")

- If your application needs to process entire web dumps, spaCy is the library you want to be using.
- It is like Ruby on Rails for natural language processing.
- spaCy is the best way to prepare text for deep learning. 
- It interoperates seamlessly with TensorFlow, PyTorch, scikit-learn, Gensim and the rest of Python's awesome AI ecosystem. 
- With spaCy, you can easily construct linguistically sophisticated statistical models for a variety of NLP problems.

![alt text](https://nlpforhackers.io/wp-content/uploads/2018/03/spaCy.png "Logo Title Text 1")


In [12]:
# Set up spaCy: download the English model under the shortcut name "en",
# then load it into `nlp`, the pipeline object used by the cells below
import spacy.cli
spacy.cli.download("en")
nlp = spacy.load('en')



# Test Data
# Adjacent string literals are joined with nothing in between, so every piece
# that ends a sentence carries an explicit trailing space; without it the text
# would read "...to flying.The knack..." and "...and miss.In the beginning...".
multiSentence = (
    "There is an art, it says, or rather, a knack to flying. "
    "The knack lies in learning how to throw yourself at the ground and miss. "
    "In the beginning the Universe was created. This has made a lot of people "
    "very angry and been widely regarded as a bad move."
)
# run the loaded pipeline over the text; the result is reused by the next cell
parsedData = nlp(multiSentence)


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [20]:
# Let's look at the part of speech tags of the first sentence
# Take only the first sentence; if the text yields no sentences, fall back to
# an empty token list instead of leaving `sent` undefined.
first_span = next(iter(parsedData.sents), None)
sent = list(first_span) if first_span is not None else []

# print each token's text next to its coarse part-of-speech tag
for token in sent:
    print(token.orth_, token.pos_)

There ADV
is VERB
an DET
art NOUN
, PUNCT
it PRON
says VERB
, PUNCT
or CCONJ
rather ADV
, PUNCT
a DET
knack NOUN
to ADP
flying NOUN
. PUNCT


In [17]:

# Let's look at the dependencies of this example:
example = "The boy with the spotted dog quickly ran after the firetruck."
parsedEx = nlp(example)
# shown as: original token, dependency tag, head word, left dependents, right dependents
for token in parsedEx:
    left_children = [child.orth_ for child in token.lefts]
    right_children = [child.orth_ for child in token.rights]
    print(token.orth_, token.dep_, token.head.orth_, left_children, right_children)

The det boy [] []
boy nsubj ran ['The'] ['with']
with prep boy [] ['dog']
the det dog [] []
spotted amod dog [] []
dog pobj with ['the', 'spotted'] []
quickly advmod ran [] []
ran ROOT ran ['boy', 'quickly'] ['after', '.']
after prep ran [] ['firetruck']
the det firetruck [] []
firetruck pobj after ['the'] []
. punct ran [] []


In [32]:
# Let's look at the named entities of this example:
example = "Apple's stocks dropped dramatically after the death of Steve Jobs in October."
parsedEx = nlp(example)
# per-token view: a token outside any entity has an empty ent_type_ string
for token in parsedEx:
    print(token.orth_, token.ent_type_ or "(not an entity)")

print("-------------- entities only ---------------")
# if you just want the entities and nothing else, you can access the parsed example's "ents" property like this:
ents = list(parsedEx.ents)
for entity in ents:
    entity_words = ' '.join(t.orth_ for t in entity)
    print(entity.label, entity.label_, entity_words)

Apple ORG
's (not an entity)
stocks (not an entity)
dropped (not an entity)
dramatically (not an entity)
after (not an entity)
the (not an entity)
death (not an entity)
of (not an entity)
Steve PERSON
Jobs PERSON
in (not an entity)
October DATE
. (not an entity)
-------------- entities only ---------------
381 ORG Apple
378 PERSON Steve Jobs
388 DATE October
