In [2]:
%load_ext cython

In [3]:
import random
import string 
import numpy as np
from pprint import pprint

# Levenshtein distance 

In [3]:
# levenshtein distance 

def levenshtein_distance(str1, str2):
    m = len(str1)
    n = len(str2)

    # Create a 2D array (matrix) to store distances
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize the first row and column
    for i in range(m + 1):
        dp[i][0] = i  # Distance from empty string to str1[:i] is i deletions
    for j in range(n + 1):
        dp[0][j] = j  # Distance from empty string to str2[:j] is j insertions

    # Fill the rest of the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,      # Deletion
                dp[i][j - 1] + 1,      # Insertion
                dp[i - 1][j - 1] + cost # Substitution or Match
            )

    return dp[m][n]

In [5]:

N = 2_000
# Example usage:
string1 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N))
string2 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N))
_ = levenshtein_distance(string1, string2)


In [6]:
%timeit levenshtein_distance(string1, string2)

1.64 s ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%cython -a

def levenshtein_distance_c0(str1, str2):
    m = len(str1)
    n = len(str2)

    # Create a 2D array (matrix) to store distances
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize the first row and column
    for i in range(m + 1):
        dp[i][0] = i  # Distance from empty string to str1[:i] is i deletions
    for j in range(n + 1):
        dp[0][j] = j  # Distance from empty string to str2[:j] is j insertions

    # Fill the rest of the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,      # Deletion
                dp[i][j - 1] + 1,      # Insertion
                dp[i - 1][j - 1] + cost # Substitution or Match
            )

    return dp[m][n]

Content of stderr:

In [10]:
%timeit levenshtein_distance_c0(string1, string2)

1.2 s ± 4.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%cython -a

cimport cython

def levenshtein_distance_c1(str1, str2):
    cdef int m,n,i,j
    m = len(str1)
    n = len(str2)

    # Create a 2D array (matrix) to store distances
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize the first row and column
    for i in range(m + 1):
        dp[i][0] = i  # Distance from empty string to str1[:i] is i deletions
    for j in range(n + 1):
        dp[0][j] = j  # Distance from empty string to str2[:j] is j insertions

    # Fill the rest of the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,      # Deletion
                dp[i][j - 1] + 1,      # Insertion
                dp[i - 1][j - 1] + cost # Substitution or Match
            )

    return dp[m][n]

In [12]:
%timeit levenshtein_distance_c1(string1, string2)

812 ms ± 7.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%cython -a

cimport cython

@cython.boundscheck(False) # turns of bound-checking for entire function
@cython.wraparound(False)      # turns of negative index warpping for entire function
def levenshtein_distance_c2(str1, str2):
    cdef int m,n,i,j
    m = len(str1)
    n = len(str2)

    # Create a 2D array (matrix) to store distances
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize the first row and column
    for i in range(m + 1):
        dp[i][0] = i  # Distance from empty string to str1[:i] is i deletions
    for j in range(n + 1):
        dp[0][j] = j  # Distance from empty string to str2[:j] is j insertions

    # Fill the rest of the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,      # Deletion
                dp[i][j - 1] + 1,      # Insertion
                dp[i - 1][j - 1] + cost # Substitution or Match
            )

    return dp[m][n]

In [14]:
%timeit levenshtein_distance_c2(string1, string2)

784 ms ± 2.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%cython -a

cimport cython

@cython.boundscheck(False) # turns of bound-checking for entire function
@cython.wraparound(False)      # turns of negative index warpping for entire function
def levenshtein_distance_c3(str1 :str, str2 :str):
    cdef int m,n,i,j
    m = len(str1)
    n = len(str2)

    # Create a 2D array (matrix) to store distances
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize the first row and column
    for i in range(m + 1):
        dp[i][0] = i  # Distance from empty string to str1[:i] is i deletions
    for j in range(n + 1):
        dp[0][j] = j  # Distance from empty string to str2[:j] is j insertions

    # Fill the rest of the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,      # Deletion
                dp[i][j - 1] + 1,      # Insertion
                dp[i - 1][j - 1] + cost # Substitution or Match
            )

    return dp[m][n]

In [16]:
%timeit levenshtein_distance_c3(string1, string2)

512 ms ± 6.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
%%cython -a

cimport cython
import numpy as np
cimport numpy as cnp

@cython.boundscheck(False) # turns of bound-checking for entire function
@cython.wraparound(False)      # turns of negative index warpping for entire function
def levenshtein_distance_c4(str1 :str, str2 :str):
    cdef int m,n,i,j
    m ,n =  len(str1), len(str2)

    # Create a 2D array (matrix) to store distances
    # dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Create numpy array initialized with zeros 
    cdef cnp.ndarray[int, ndim=2] pre = np.zeros((m + 1, n + 1), dtype = np.int32)
    cdef int[:, :] dp = pre

    # Initialize the first row and column
    for i in range(m + 1):
        dp[i][0] = i  # Distance from empty string to str1[:i] is i deletions
    for j in range(n + 1):
        dp[0][j] = j  # Distance from empty string to str2[:j] is j insertions

    # Fill the rest of the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,      # Deletion
                dp[i][j - 1] + 1,      # Insertion
                dp[i - 1][j - 1] + cost # Substitution or Match
            )

    return dp[m][n]

In [27]:
%timeit levenshtein_distance_c4(string1, string2)

336 ms ± 538 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
%%cython -a

cimport cython
import numpy as np
cimport numpy as cnp

@cython.boundscheck(False) # turns of bound-checking for entire function
@cython.wraparound(False)    
def levenshtein_cython(str1: str, str2: str) :
    
    cdef int m,n,i,j
    m, n = len(str1), len(str2)
    
    
    cdef cnp.ndarray[int, ndim=2] pre = np.zeros((m + 1, n + 1), dtype = np.int32)
    cdef int[:, :] dp = pre
 
    
    for i in range(m + 1): 
        for j in range(n + 1):
    
            if   i ==    0: dp[i][j] = j        
            elif j ==    0: dp[i][j] = i
            else:
                dp[i][j] = min(dp[i - 1][j - 1] + (str1[i - 1] != str2[j - 1]),
                               dp[i- 1][j] + 1,
                               dp[i][j- 1] +1 )

Content of stderr:
In file included from /Users/nithinvarghese/.cache/ipython/cython/_cython_magic_a621a4f38d6bcd31d802ca5a0799867a41d4749fa4d2c936133011b6a9065bdd.c:1155:
In file included from /opt/homebrew/Caskroom/miniconda/base/lib/python3.12/site-packages/numpy/core/include/numpy/arrayobject.h:5:
In file included from /opt/homebrew/Caskroom/miniconda/base/lib/python3.12/site-packages/numpy/core/include/numpy/ndarrayobject.h:12:
In file included from /opt/homebrew/Caskroom/miniconda/base/lib/python3.12/site-packages/numpy/core/include/numpy/ndarraytypes.h:1929:
      |  ^

In [42]:
%timeit levenshtein_cython(string1, string2)

18.7 ms ± 236 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [44]:
%timeit levenshtein_distance(string1, string2)

1.65 s ± 8.33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
1.65/0.0018

916.6666666666666

# Sieve algorithm

In [65]:
def sieve(sieve_length):
    sieve_table = [True for x in range(sieve_length)]
    sieve_table[0] = False
    sieve_table[1] = False

    for i in range(2, int(sieve_length ** 0.5)-1):
        if not sieve_table[i]:
            continue
        for marker in range(i*i, sieve_length, i):
            sieve_table[marker] = False
    return [i for i , t in enumerate(sieve_table) if t]

In [66]:
print(list((i for i in sieve(1000))))

[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947, 953, 961, 967, 971, 977, 983, 991, 997]


In [67]:
%timeit sieve(1_000_000)

129 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [70]:
%%cython -a

def sieve_c0(sieve_length):
    sieve_table = [True for x in range(sieve_length)]
    sieve_table[0] = False
    sieve_table[1] = False

    for i in range(2, int(sieve_length ** 0.5)-1):
        if not sieve_table[i]:
            continue
        for marker in range(i*i, sieve_length, i):
            sieve_table[marker] = False
    return [i for i , t in enumerate(sieve_table) if t]
    

Content of stderr:

In [69]:
%timeit sieve_c0(1_000_000)

98.6 ms ± 759 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [74]:
%%cython -a

def sieve_c1(int sieve_length ):
    sieve_table = [True for x in range(sieve_length)]
    sieve_table[0] = False
    sieve_table[1] = False

    cdef int i, marker
    cdef int upper =  int(sieve_length ** 0.5) - 1
    for i in range(2, upper ):
        if not sieve_table[i]:
            continue
        for marker in range(i*i, sieve_length, i):
            sieve_table[marker] = False
    return [i for i , t in enumerate(sieve_table) if t]
    

Content of stderr:

In [76]:
%timeit sieve_c1(1_000_000)

55 ms ± 1.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Splitting things up 

In [78]:
%%cython -a

def sieve_c2(int sieve_length ):
    sieve_table = [True for x in range(sieve_length)]
    sieve_table[0] = False
    sieve_table[1] = False

    cdef int i, marker
    cdef int upper =  int(sieve_length ** 0.5) - 1
    for i in range(2, upper ):
        if not sieve_table[i]:
            continue
        for marker in range(i*i, sieve_length, i):
            sieve_table[marker] = False
    return sieve_table


def sieve_print_cy(table):
    cdef int i
    cdef list primes = []
    for i in range(len(table)):
        if table[i]:
            primes.append(i)
    return primes
    

Content of stderr:

In [79]:
%%timeit
table = sieve_c2(1_000_000)
prime_list = sieve_print_cy(table)


57.8 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## STL

In [80]:
%reload_ext Cython

In [85]:
%%cython
# distutils: language=c++

import cython
from libcpp.vector cimport vector

def do_stuff():
    cdef vector[int] totally_a_list
    totally_a_list.push_back(100)
    return totally_a_list[0]

Content of stderr:

In [86]:
do_stuff()

100

In [4]:
%%cython -a
# distutils: language=c++
from libcpp.vector cimport vector



def sieve_c3(int sieve_length ):
    cdef vector[int] sieve_table
    sieve_table[0] = False
    sieve_table[1] = False

    cdef int i, marker
    cdef int upper =  int(sieve_length ** 0.5) - 1
    for i in range(2, upper ):
        if not sieve_table[i]:
            continue
        for marker in range(i*i, sieve_length, i):
            sieve_table[marker] = False
    return sieve_table


def sieve_print_c1(table):
    cdef int i
    cdef list primes = []
    for i in range(len(table)):
        if table[i]:
            primes.append(i)
    return primes
    

In [None]:
%%timeit
table = sieve_c3(1_000)
prime_list = sieve_print_c1(table)


## Batting inner for loop

In [4]:
%%cython -a
# distutils: language=c++
from libcpp.vector cimport vector



def sieve_c3(int sieve_length ):
    cdef vector[int] sieve_table
    sieve_table[0] = 0
    sieve_table[1] = 0

    cdef int i, marker
    cdef int upper =  int(sieve_length ** 0.5) - 1
    for i in range(2, upper ):
        if not sieve_table[i]:
            continue
        marker = i * i
        while marker < sieve_length:
            sieve_table[marker] = 0
            marker += 1
    return sieve_table


def sieve_print_c1(table):
    cdef int i
    cdef list primes = []
    for i in range(len(table)):
        if table[i]:
            primes.append(i)
    return primes
    

Content of stderr:

In [None]:
%%timeit
table = sieve_c3(1_000)
prime_list = sieve_print_c1(table)

# Reference 
https://www.youtube.com/watch?v=y6bKDKFavPA