In [1]:
!pip install cython
!pip install spacy


Collecting cython
[?25l  Downloading https://files.pythonhosted.org/packages/19/8e/32b280abb0947a96cdbb8329fb2014851a21fc1d099009f946ea8a8202c3/Cython-0.28.5-cp36-cp36m-manylinux1_x86_64.whl (3.4MB)
[K    100% |████████████████████████████████| 3.4MB 5.6MB/s 
[?25hInstalling collected packages: cython
Successfully installed cython-0.28.5
Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/24/de/ac14cd453c98656d6738a5669f96a4ac7f668493d5e6b78227ac933c5fd4/spacy-2.0.12.tar.gz (22.0MB)
[K    100% |████████████████████████████████| 22.0MB 1.5MB/s 
Collecting murmurhash<0.29,>=0.28 (from spacy)
  Downloading https://files.pythonhosted.org/packages/5e/31/c8c1ecafa44db30579c8c457ac7a0f819e8b1dbc3e58308394fff5ff9ba7/murmurhash-0.28.0.tar.gz
Collecting cymem<1.32,>=1.30 (from spacy)
  Downloading https://files.pythonhosted.org/packages/f8/9e/273fbea507de99166c11cd0cb3fde1ac01b5bc724d9a407a2f927ede91a1/cymem-1.31.2.tar.gz
Collecting preshed<2.0.0,>=1.0.0 (from spacy)

In [0]:
from random import random
class Rectangle:
  def __init__(self,w,h):
    self.w = w
    self.h = h
    
  def area(self):
    return self.w * self.h
  
def check_rectangle(rectangles, threshold):
  n_out = 0
  for rectangle in rectangles:
    if rectangle.area() > threshold:
      n_out += 1
    return n_out
  
def main_rectangle_slow():
  n_rectangles = 10000000
  rectangles = list(Rectangle(random(),random()) for i in range(n_rectangles))
  n_out = check_rectangle(rectangles, threshold = 0.25 )

In [7]:
%%time
main_rectangle_slow()

CPU times: user 13 s, sys: 1.55 s, total: 14.5 s
Wall time: 14.5 s


In [13]:
%load_ext cython


The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [0]:
%%cython
#memory management helper for Cython
from cymem.cymem cimport Pool
#good ol python
from random import random

#The cdef statement is used to declare C variables,types, and functions
cdef struct Rectangle:
    #C variables
    float w
    float h

#the "*" is the pointer operator, it gives value stored at particular address
#this saves memory and runs faster, since we don't have to duplicate the data
cdef int check_rectangles_cy(Rectangle* rectangles, int n_rectangles, float threshold):
    cdef int n_out = 0
    # C arrays contain no size information => we need to state it explicitly
    for rectangle in rectangles[:n_rectangles]:
        if rectangle.w * rectangle.h > threshold:
            n_out += 1
    return n_out

  #python uses garbage collection instead of manual memory management
  #which means developers can freely create objects
  #and Python's memory manager will periodically look for any
  # objects that are no longer referenced by their program
  #this overhead makes demands on the runtime environment (slower)
  # so manually memory management is better
def main_rectangles_fast():
    cdef int n_rectangles = 10000000
    cdef float threshold = 0.25
    #The Poool Object will save memory addresses internally
    #then free them when the object is garbage collected
    
    cdef Pool mem = Pool()
    cdef Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
    for i in range(n_rectangles):
        rectangles[i].w = random()
        rectangles[i].h = random()
    n_out = check_rectangles_cy(rectangles, n_rectangles, threshold)
    print(n_out)

In [24]:
%%time
main_rectangles_fast()

4034318
CPU times: user 735 ms, sys: 8.95 ms, total: 744 ms
Wall time: 744 ms


In [26]:
# Set up spaCy
import spacy.cli
spacy.cli.download("en")
nlp = spacy.load('en')



# Test Data
multiSentence = "There is an art, it says, or rather, a knack to flying." \
                 "The knack lies in learning how to throw yourself at the ground and miss." \
                 "In the beginning the Universe was created. This has made a lot of people "\
                 "very angry and been widely regarded as a bad move."
parsedData = nlp(multiSentence)


# Let's look at the part of speech tags of the first sentence
for span in parsedData.sents:
    sent = [parsedData[i] for i in range(span.start, span.end)]
    break

for token in sent:
    print(token.orth_, token.pos_)


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')

There ADV
is VERB
an DET
art NOUN
, PUNCT
it PRON
says VERB
, PUNCT
or CCONJ
rather ADV
, PUNCT
a DET
knack NOUN
to ADP
flying NOUN
. PUNCT
