In [108]:
import numpy as np

# Generate large arrays for benchmarking
prices = np.random.randint(100, 10000, size=1_000_000)
quantities = np.random.randint(1, 10, size=1_000_000)

print(f"Array size: {len(prices):,} elements")
print(f"Prices range: {prices.min()} - {prices.max()}")
print(f"Quantities range: {quantities.min()} - {quantities.max()}")


Array size: 1,000,000 elements
Prices range: 100 - 9999
Quantities range: 1 - 9


In [109]:
def loop_based_revenue(prices, quantities):
    """
    Calculate revenue using a Python loop.
    """
    revenue = []
    for i in range(len(prices)):
        revenue.append(prices[i] * quantities[i])
    return revenue


In [110]:
# Test with small array first
test_prices = prices[:3]
test_quantities = quantities[:3]
test_result = loop_based_revenue(test_prices, test_quantities)
print("Test result (first 5):", test_result[:5])


Test result (first 5): [np.int32(27152), np.int32(6092), np.int32(1516)]


In [111]:
# Note: %timeit is a Jupyter magic command
# If using a script, use time.time() instead

# In Jupyter Notebook:
# %timeit loop_based_revenue(prices, quantities)

# In Python script, use:
import time

start_time = time.time()
loop_result = loop_based_revenue(prices, quantities)
loop_time = time.time() - start_time

print(f"Loop-based calculation time: {loop_time:.4f} seconds")


Loop-based calculation time: 0.9860 seconds


In [112]:
def numpy_vectorized_revenue(prices, quantities):
    """
    Calculate revenue using NumPy vectorization.
    """
    return prices * quantities


In [113]:
test_result_numpy = numpy_vectorized_revenue(test_prices, test_quantities)
print("NumPy test result (first 5):", test_result_numpy[:5])


NumPy test result (first 5): [27152  6092  1516]


In [114]:
start_time = time.time()
numpy_result = numpy_vectorized_revenue(prices, quantities)
numpy_time = time.time() - start_time

print(f"NumPy vectorized calculation time: {numpy_time:.4f} seconds")


NumPy vectorized calculation time: 0.0050 seconds


In [115]:
# Compare results (first 1000 elements for speed)
comparison = np.allclose(loop_result[:1000], numpy_result[:1000])
print(f"Results match: {comparison}")


Results match: True


In [116]:
speedup = loop_time / numpy_time
print("=== PERFORMANCE COMPARISON ===")
print(f"Loop-based time: {loop_time:.4f} seconds")
print(f"NumPy vectorized time: {numpy_time:.4f} seconds")
print(f"Speedup factor: {speedup:.2f}x faster")


=== PERFORMANCE COMPARISON ===
Loop-based time: 0.9860 seconds
NumPy vectorized time: 0.0050 seconds
Speedup factor: 197.20x faster


In [117]:
improvement = ((loop_time - numpy_time) / loop_time) * 100
print(f"Performance improvement: {improvement:.1f}%")


Performance improvement: 99.5%


In [118]:
print("\n=== WHY NUMPY IS FASTER ===")
print("1. NumPy operations are implemented in C (compiled code)")
print("2. Vectorized operations process multiple elements simultaneously")
print("3. No Python loop overhead (interpreted code)")
print("4. Better memory access patterns")
print("5. Optimized for numerical computations")



=== WHY NUMPY IS FASTER ===
1. NumPy operations are implemented in C (compiled code)
2. Vectorized operations process multiple elements simultaneously
3. No Python loop overhead (interpreted code)
4. Better memory access patterns
5. Optimized for numerical computations


In [119]:
# Create performance summary
performance_summary = {
    "array_size": len(prices),
    "loop_time_seconds": loop_time,
    "numpy_time_seconds": numpy_time,
    "speedup_factor": speedup,
    "improvement_percent": improvement
}

print("\n=== PERFORMANCE SUMMARY ===")
for key, value in performance_summary.items():
    print(f"{key}: {value}")



=== PERFORMANCE SUMMARY ===
array_size: 1000000
loop_time_seconds: 0.9860000610351562
numpy_time_seconds: 0.004999876022338867
speedup_factor: 197.2049020075342
improvement_percent: 99.49291321370815


In [120]:
markdown_content = f"""# Performance Comparison: Loops vs NumPy

## Test Configuration
- Array size: {len(prices):,} elements
- Operation: Revenue calculation (price × quantity)

## Results

### Loop-Based Approach
- Execution time: {loop_time:.4f} seconds
- Method: Python for loop with list append

### NumPy Vectorized Approach
- Execution time: {numpy_time:.4f} seconds
- Method: Element-wise array multiplication

## Performance Analysis

### Speedup
- **Speedup factor:** {speedup:.2f}x faster
- **Performance improvement:** {improvement:.1f}%

### Why NumPy is Faster

1. **Compiled Code**: NumPy operations are implemented in C, which is much faster than interpreted Python code.

2. **Vectorization**: NumPy processes multiple elements simultaneously using CPU vectorization (SIMD instructions).

3. **No Loop Overhead**: Python loops have significant overhead for each iteration. NumPy eliminates this overhead.

4. **Memory Efficiency**: NumPy uses contiguous memory blocks, which improves cache performance.

5. **Optimized Algorithms**: NumPy uses highly optimized BLAS/LAPACK libraries for numerical operations.

## Conclusion

For large-scale data processing, NumPy vectorization provides significant performance benefits over Python loops. The speedup of {speedup:.2f}x demonstrates why vectorization is essential in data engineering workflows.

## Recommendations

- Always prefer NumPy vectorized operations over Python loops for numerical computations
- Use vectorization even for small arrays to maintain code consistency
- Consider NumPy when processing datasets with 10,000+ elements
"""

print(markdown_content)


# Performance Comparison: Loops vs NumPy

## Test Configuration
- Array size: 1,000,000 elements
- Operation: Revenue calculation (price × quantity)

## Results

### Loop-Based Approach
- Execution time: 0.9860 seconds
- Method: Python for loop with list append

### NumPy Vectorized Approach
- Execution time: 0.0050 seconds
- Method: Element-wise array multiplication

## Performance Analysis

### Speedup
- **Speedup factor:** 197.20x faster
- **Performance improvement:** 99.5%

### Why NumPy is Faster

1. **Compiled Code**: NumPy operations are implemented in C, which is much faster than interpreted Python code.

2. **Vectorization**: NumPy processes multiple elements simultaneously using CPU vectorization (SIMD instructions).

3. **No Loop Overhead**: Python loops have significant overhead for each iteration. NumPy eliminates this overhead.

4. **Memory Efficiency**: NumPy uses contiguous memory blocks, which improves cache performance.

5. **Optimized Algorithms**: NumPy uses highly 

In [121]:
with open("performance_comparison.md", "w") as f:
    f.write(markdown_content)

print("\nSaved: performance_comparison.md")



Saved: performance_comparison.md


In [122]:
def benchmark_loops_vs_numpy(array_size=1_000_000):
    """
    Benchmark loop-based vs NumPy vectorized revenue calculation.
    """
    # Generate test data
    prices = np.random.randint(100, 10000, size=array_size)
    quantities = np.random.randint(1, 10, size=array_size)
    
    # Loop-based approach
    start = time.time()
    loop_result = loop_based_revenue(prices, quantities)
    loop_time = time.time() - start
    
    # NumPy vectorized approach
    start = time.time()
    numpy_result = numpy_vectorized_revenue(prices, quantities)
    numpy_time = time.time() - start
    
    # Calculate metrics
    speedup = loop_time / numpy_time
    improvement = ((loop_time - numpy_time) / loop_time) * 100
    
    # Verify results match
    results_match = np.allclose(loop_result[:1000], numpy_result[:1000])
    
    print("=== BENCHMARK RESULTS ===")
    print(f"Array size: {array_size:,}")
    print(f"Loop time: {loop_time:.4f}s")
    print(f"NumPy time: {numpy_time:.4f}s")
    print(f"Speedup: {speedup:.2f}x")
    print(f"Results match: {results_match}")
    
    return {
        "loop_time": loop_time,
        "numpy_time": numpy_time,
        "speedup": speedup,
        "improvement": improvement
    }


In [123]:
results = benchmark_loops_vs_numpy(1_000_000)


=== BENCHMARK RESULTS ===
Array size: 1,000,000
Loop time: 0.5570s
NumPy time: 0.0010s
Speedup: 556.91x
Results match: True
