# String Manipulation Benchmarks

Comparing performance of changing the first letter of strings to 'A' using:
- Pure Python loop
- List comprehension
- NumPy char operations
- Numba JIT
- PyArrow compute

## Setup

In [49]:
import random
import string
import timeit
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
from numba import njit
from numba.typed import List as NumbaList

## Generate Test Data

In [2]:
def generate_strings(n, length=10):
    """Generate n random strings of given length."""
    return [
        ''.join(random.choices(string.ascii_lowercase, k=length))
        for _ in range(n)
    ]

# Test sizes
SIZES = [1_000, 10_000, 100_000, 1_000_000]

# Generate data for largest size (we'll slice for smaller tests)
random.seed(42)
all_strings = generate_strings(max(SIZES))
print(f"Generated {len(all_strings):,} strings")
print(f"Sample: {all_strings[:5]}")

Generated 1,000,000 strings
Sample: ['qahftrxcka', 'fnafqofpva', 'usieyiccwp', 'usnzjovqwp', 'sbfhcgchqj']


## Implementations

In [3]:
# Pure Python loop
def replace_first_python_loop(strings):
    result = []
    for s in strings:
        result.append('A' + s[1:])
    return result

# List comprehension
def replace_first_listcomp(strings):
    return ['A' + s[1:] for s in strings]

# Verify correctness
test = ['hello', 'world', 'test']
print("Python loop:", replace_first_python_loop(test))
print("List comp:  ", replace_first_listcomp(test))

Python loop: ['Aello', 'Aorld', 'Aest']
List comp:   ['Aello', 'Aorld', 'Aest']


In [4]:
# Numba - note: numba has limited string support
# We need to use typed lists and work with the strings carefully

@njit
def replace_first_numba(strings):
    """Numba JIT version - limited string support."""
    n = len(strings)
    result = NumbaList()
    for i in range(n):
        s = strings[i]
        result.append('A' + s[1:])
    return result

# Create typed list for numba
def to_numba_list(strings):
    typed_list = NumbaList()
    for s in strings:
        typed_list.append(s)
    return typed_list

# Warm up numba (first call compiles)
numba_test = to_numba_list(test)
result = replace_first_numba(numba_test)
print("Numba:      ", list(result))

Numba:       ['Aello', 'Aorld', 'Aest']


In [30]:
# PyArrow compute
def replace_first_arrow(arr):
    """PyArrow version using string slicing and concatenation."""
    # Get substring from position 1 onwards
    suffix = pc.utf8_slice_codeunits(arr, 1)
    # Prepend 'A' using binary_join with a scalar
    n = len(arr)
    prefix = pa.array(['A'] * n)
    return pc.binary_join_element_wise(prefix, suffix, '')

# Alternative: using replace_substring (regex)
def replace_first_arrow_regex(arr):
    """PyArrow version using regex replace."""
    return pc.replace_substring_regex(arr, pattern='^.', replacement='A')

# Test
arrow_test = pa.array(test)
print("Arrow slice:", replace_first_arrow(arrow_test).to_pylist())
print("Arrow regex:", replace_first_arrow_regex(arrow_test).to_pylist())

Arrow slice: ['Aello', 'Aorld', 'Aest']
Arrow regex: ['Aello', 'Aorld', 'Aest']


In [None]:
# NumPy string operations
def replace_first_numpy(arr):
    """NumPy version using char array slicing."""
    # Slice from position 1 onwards
    return np.strings.add('A', np.strings.slice(arr, 1, 100,1))

ASCII_A = ord("A")
# Alternative: numba bit-fiddling
@njit
def replace_first_numpy_fixed(arr):
    """NumPy with fixed-width U strings - modify in place."""
    # Convert to character array for direct manipulation
    result = arr.copy()
    # Access as bytes and modify first character
    char_view = result.view(np.uint8).reshape(len(result), -1)
    for i in range(char_view.shape[0]):
        char_view[i, 0] = ASCII_A
    return result

# Test
numpy_test = np.array(test, dtype='U10')
print("NumPy add:  ", replace_first_numpy(numpy_test).tolist())
print("NumPy fixed:", replace_first_numpy_fixed(numpy_test.astype("S10")).astype("U10").tolist())

NumPy add:   ['Aello', 'Aorld', 'Aest']
NumPy fixed: ['Aello', 'Aorld', 'Aest']


## Benchmarks

In [27]:
def benchmark(func, data, n_runs=5):
    """Run benchmark and return average time in milliseconds."""
    times = timeit.repeat(lambda: func(data), number=1, repeat=n_runs)
    return min(times) * 1000  # Return best time in ms

results = []

In [48]:
print(f"{'Size':>10} {'Py Loop':>10} {'ListComp':>10} {'NP add':>10} {'NP fixed':>10} {'Numba':>10} {'Arrow':>10} {'Arrow RE':>10}")
print("-" * 92)

for size in SIZES:
    strings = all_strings[:size]
    
    # Prepare data for each method
    numpy_strings = np.array(strings, dtype='U10')
    numpy_numba_strings = np.array(strings, dtype='S10')
    numba_strings = to_numba_list(strings)
    arrow_strings = pa.array(strings)
    
    # Run benchmarks
    t_loop = benchmark(replace_first_python_loop, strings)
    t_comp = benchmark(replace_first_listcomp, strings)
    t_numpy = benchmark(replace_first_numpy, numpy_strings)
    t_numpy_fixed = benchmark(replace_first_numpy_fixed, numpy_numba_strings)
    t_numba = benchmark(replace_first_numba, numba_strings)
    t_arrow = benchmark(replace_first_arrow, arrow_strings)
    t_arrow_re = benchmark(replace_first_arrow_regex, arrow_strings)
    
    results.append({
        'size': size,
        'python_loop': t_loop,
        'list_comp': t_comp,
        'numpy_add': t_numpy,
        'numpy_fixed': t_numpy_fixed,
        'numba': t_numba,
        'arrow_slice': t_arrow,
        'arrow_regex': t_arrow_re,
    })
    
    print(f"{size:>10,} {t_loop:>9.2f}ms {t_comp:>9.2f}ms {t_numpy:>9.2f}ms {t_numpy_fixed:>9.2f}ms {t_numba:>9.2f}ms {t_arrow:>9.2f}ms {t_arrow_re:>9.2f}ms")

      Size    Py Loop   ListComp     NP add   NP fixed      Numba      Arrow   Arrow RE
--------------------------------------------------------------------------------------------
     1,000      0.06ms      0.05ms      0.03ms      0.01ms      0.09ms      0.05ms      0.05ms
    10,000      0.54ms      0.50ms      0.19ms      0.01ms      0.75ms      0.36ms      0.35ms
   100,000      6.03ms      5.48ms      2.09ms      0.05ms      9.31ms      3.19ms      3.57ms
 1,000,000     64.20ms     58.86ms     18.21ms      0.41ms    113.45ms     33.54ms     36.38ms


## Analysis

In [47]:
import pandas as pd

df = pd.DataFrame(results)
df = df.set_index('size')

# Calculate speedup relative to Python loop
print("Speedup vs Python Loop:")
print("-" * 60)
for col in ['list_comp', 'numpy_add', 'numpy_fixed', 'numba', 'arrow_slice', 'arrow_regex']:
    speedup = df['python_loop'] / df[col]
    print(f"{col:15s}: {speedup.iloc[-1]:>6.2f}x (at {SIZES[-1]:,} strings)")

Speedup vs Python Loop:
------------------------------------------------------------
list_comp      :   1.10x (at 1,000,000 strings)
numpy_add      :   3.47x (at 1,000,000 strings)
numpy_fixed    : 155.15x (at 1,000,000 strings)
numba          :   0.57x (at 1,000,000 strings)
arrow_slice    :   1.90x (at 1,000,000 strings)
arrow_regex    :   1.80x (at 1,000,000 strings)


In [10]:
# Throughput (strings per millisecond)
print("\nThroughput at 1M strings (strings/ms):")
print("-" * 50)
last = results[-1]
size = last['size']
for method in ['python_loop', 'list_comp', 'numpy_add', 'numpy_fixed', 'numba', 'arrow_slice', 'arrow_regex']:
    throughput = size / last[method]
    print(f"{method:15s}: {throughput:>12,.0f} strings/ms")


Throughput at 1M strings (strings/ms):
--------------------------------------------------
python_loop    :       15,402 strings/ms
list_comp      :       16,600 strings/ms
numpy_add      :        4,681 strings/ms
numpy_fixed    :      892,425 strings/ms
numba          :        9,116 strings/ms
arrow_slice    :       29,176 strings/ms
arrow_regex    :       27,442 strings/ms


## Including Data Conversion Overhead

In [11]:
# Real-world scenario: include conversion time
def numba_with_conversion(strings):
    typed_list = to_numba_list(strings)
    return replace_first_numba(typed_list)

def numpy_with_conversion(strings):
    arr = np.array(strings, dtype='U10')
    return replace_first_numpy(arr)

def numpy_fixed_with_conversion(strings):
    arr = np.array(strings, dtype='U10')
    return replace_first_numpy_fixed(arr)

def arrow_with_conversion(strings):
    arr = pa.array(strings)
    return replace_first_arrow(arr)

def arrow_regex_with_conversion(strings):
    arr = pa.array(strings)
    return replace_first_arrow_regex(arr)

print("\nWith data conversion overhead (1M strings):")
print("-" * 60)

strings = all_strings[:1_000_000]

t_loop = benchmark(replace_first_python_loop, strings)
t_comp = benchmark(replace_first_listcomp, strings)
t_numpy_conv = benchmark(numpy_with_conversion, strings)
t_numpy_fixed_conv = benchmark(numpy_fixed_with_conversion, strings)
t_numba_conv = benchmark(numba_with_conversion, strings)
t_arrow_conv = benchmark(arrow_with_conversion, strings)
t_arrow_re_conv = benchmark(arrow_regex_with_conversion, strings)

print(f"Python Loop:        {t_loop:>8.2f}ms")
print(f"List Comprehension: {t_comp:>8.2f}ms")
print(f"NumPy add (w/conv): {t_numpy_conv:>8.2f}ms")
print(f"NumPy fixed (w/):   {t_numpy_fixed_conv:>8.2f}ms")
print(f"Numba (w/ conv):    {t_numba_conv:>8.2f}ms")
print(f"Arrow slice (w/):   {t_arrow_conv:>8.2f}ms")
print(f"Arrow regex (w/):   {t_arrow_re_conv:>8.2f}ms")


With data conversion overhead (1M strings):
------------------------------------------------------------
Python Loop:           63.60ms
List Comprehension:    58.03ms
NumPy add (w/conv):   251.41ms
NumPy fixed (w/):      44.14ms
Numba (w/ conv):     1510.22ms
Arrow slice (w/):      43.80ms
Arrow regex (w/):      45.58ms


## Summary

Key findings:

1. **NumPy fixed-width** is the **fastest** (54x speedup) when data is already in NumPy format. The `view('U1')` trick allows direct character manipulation without string reconstruction.

2. **NumPy char.add** is actually **slower** (0.30x) because `np.char` lacks slicing - falls back to Python comprehension.

3. **PyArrow** is solid at ~2x speedup for the operation itself.

4. **List comprehension** is fastest for pure Python, marginally better than explicit loop.

5. **Numba** has limited string support - typed list overhead makes it slower than Python.

**With conversion overhead** (starting from Python list):
- NumPy fixed (43.6ms) â‰ˆ Arrow slice (44.1ms) - both ~1.3x faster than Python
- Numba (1534ms) is extremely slow due to typed list construction

**Recommendation**: 
- If data is already in NumPy fixed-width strings: use `view('U1')` manipulation
- If data is already in Arrow format: use `pc.utf8_slice_codeunits`
- For one-off operations on Python lists: use list comprehension