# Sparse Array Operations
- Arithmetic, Slicing, Matrix operations
- Real examples: Document similarity, Matrix computations

In [1]:
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
print('Sparse operations module loaded')

Sparse operations module loaded


## Arithmetic Operations

Sparse matrices support standard operations:
- **Addition/Subtraction**: A + B, A - B
- **Multiplication**: A * B (matrix), A.multiply(B) (element-wise)
- **Scalar**: A * scalar, A / scalar
- **Power**: A ** n

**Result**: Usually sparse if inputs sparse

In [2]:
# Create sparse matrices
A = sparse.csr_array([[1, 0, 2], [0, 3, 0], [4, 0, 5]])
B = sparse.csr_array([[2, 0, 0], [0, 1, 0], [0, 0, 3]])

print('Matrix A:')
print(A.toarray())
print('\nMatrix B:')
print(B.toarray())
print()

# Addition
C_add = A + B
print('A + B:')
print(C_add.toarray())
print(f'Non-zeros: {C_add.nnz}\n')

# Matrix multiplication
C_mult = A @ B  # or A.dot(B)
print('A @ B (matrix multiplication):')
print(C_mult.toarray())
print(f'Non-zeros: {C_mult.nnz}\n')

# Element-wise multiplication
C_elem = A.multiply(B)
print('A .* B (element-wise):')
print(C_elem.toarray())
print(f'Non-zeros: {C_elem.nnz}')

Matrix A:
[[1 0 2]
 [0 3 0]
 [4 0 5]]

Matrix B:
[[2 0 0]
 [0 1 0]
 [0 0 3]]

A + B:
[[3 0 2]
 [0 4 0]
 [4 0 8]]
Non-zeros: 5

A @ B (matrix multiplication):
[[ 2  0  6]
 [ 0  3  0]
 [ 8  0 15]]
Non-zeros: 5

A .* B (element-wise):
[[ 2  0  0]
 [ 0  3  0]
 [ 0  0 15]]
Non-zeros: 3


## Slicing and Indexing

**CSR**: Fast row slicing
**CSC**: Fast column slicing

**Operations**:
- `A[i, :]` - row i
- `A[:, j]` - column j
- `A[i, j]` - single element
- `A[rows, :]` - multiple rows

In [3]:
# Slicing
A = sparse.csr_array(np.random.rand(5, 5))
A[A < 0.7] = 0  # Sparsify
A.eliminate_zeros()

print('Matrix A:')
print(A.toarray())
print()

# Row slicing (fast for CSR)
row_2 = A[2, :]
print(f'Row 2: {row_2.toarray()}')
print(f'Type: {type(row_2).__name__}\n')

# Column slicing (convert to CSC for speed)
A_csc = A.tocsc()
col_3 = A_csc[:, 3]
print(f'Column 3: {col_3.toarray().T}')

# Single element
element = A[1, 2]
print(f'\nA[1, 2] = {element}')

Matrix A:
[[0.         0.         0.         0.         0.78198269]
 [0.78177985 0.         0.         0.84171056 0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]]

Row 2: [0. 0. 0. 0. 0.]
Type: coo_array

Column 3: [0.         0.84171056 0.         0.         0.        ]

A[1, 2] = 0.0


  A[A < 0.7] = 0  # Sparsify


## Matrix-Vector Multiplication

**Most common operation**: A @ x

**Complexity**: O(nnz) - only non-zero elements
**Dense would be**: O(n²)

**Critical for**: PageRank, power method, iterative solvers

In [4]:
# Large sparse matrix
n = 10000
density = 0.001
A = sparse.random(n, n, density=density, format='csr')
x = np.random.rand(n)

print(f'Matrix-vector multiplication')
print(f'  Matrix: {n}×{n}, density={density*100}%')
print(f'  Non-zeros: {A.nnz:,}\n')

import time

# Sparse
start = time.time()
y_sparse = A @ x
time_sparse = time.time() - start

print(f'Sparse A @ x: {time_sparse*1000:.2f} ms')

# Dense comparison (if memory allows)
if n <= 5000:
    A_dense = A.toarray()
    start = time.time()
    y_dense = A_dense @ x
    time_dense = time.time() - start
    print(f'Dense A @ x: {time_dense*1000:.2f} ms')
    print(f'Speedup: {time_dense/time_sparse:.1f}×')

Matrix-vector multiplication
  Matrix: 10000×10000, density=0.1%
  Non-zeros: 100,000

Sparse A @ x: 0.22 ms


## Real Example: TF-IDF Document Similarity

**Problem**: Find similar documents
**Method**: Cosine similarity of TF-IDF vectors

**Sparse advantage**: Most words not in most documents

In [5]:
# Simulate TF-IDF matrix
n_docs = 1000
n_terms = 5000
avg_terms_per_doc = 50

print('Document Similarity with TF-IDF')
print(f'  Documents: {n_docs:,}')
print(f'  Vocabulary: {n_terms:,}\n')

# Build document-term matrix
np.random.seed(42)
rows, cols, data = [], [], []

for doc in range(n_docs):
    n_terms_doc = np.random.poisson(avg_terms_per_doc)
    terms = np.random.choice(n_terms, size=min(n_terms_doc, n_terms), replace=False)
    tfidf_scores = np.random.rand(len(terms))  # Simplified TF-IDF
    
    rows.extend([doc] * len(terms))
    cols.extend(terms)
    data.extend(tfidf_scores)

tfidf = sparse.csr_array((data, (rows, cols)), shape=(n_docs, n_terms))

print(f'TF-IDF matrix:')
print(f'  Shape: {tfidf.shape}')
print(f'  Non-zeros: {tfidf.nnz:,}')
print(f'  Density: {tfidf.nnz/(n_docs*n_terms)*100:.3f}%\n')

# Normalize rows (for cosine similarity)
from scipy.sparse import linalg as sp_linalg
row_norms = np.sqrt(np.array(tfidf.multiply(tfidf).sum(axis=1)).flatten())
tfidf_normalized = tfidf.copy()
tfidf_normalized.data /= np.repeat(row_norms, np.diff(tfidf.indptr))

# Find documents similar to document 0
query_doc = tfidf_normalized[0]
similarities = tfidf_normalized @ query_doc.T
similarities = np.array(similarities.toarray()).flatten()

# Top 5 similar (excluding self)
top_5 = np.argsort(similarities)[::-1][1:6]

print(f'Top 5 documents similar to Doc 0:')
for i, doc_id in enumerate(top_5, 1):
    print(f'  {i}. Doc {doc_id}: similarity={similarities[doc_id]:.4f}')

Document Similarity with TF-IDF
  Documents: 1,000
  Vocabulary: 5,000

TF-IDF matrix:
  Shape: (1000, 5000)
  Non-zeros: 49,629
  Density: 0.993%

Top 5 documents similar to Doc 0:
  1. Doc 148: similarity=0.1149
  2. Doc 799: similarity=0.1098
  3. Doc 77: similarity=0.0993
  4. Doc 636: similarity=0.0860
  5. Doc 995: similarity=0.0788


## Aggregation Operations

**Row/column operations**:
- `sum(axis)`: Sum along axis
- `mean(axis)`: Average
- `max(axis)`, `min(axis)`: Extrema

**Result**: Dense array (usually small)

In [6]:
# User-item rating matrix
n_users = 1000
n_items = 500
rating_matrix = sparse.random(n_users, n_items, density=0.05, format='csr')
rating_matrix.data = np.random.randint(1, 6, size=rating_matrix.nnz)  # 1-5 stars

print('Rating Matrix Analysis')
print(f'  Users: {n_users:,}')
print(f'  Items: {n_items:,}')
print(f'  Ratings: {rating_matrix.nnz:,}\n')

# Item statistics
item_rating_counts = np.array((rating_matrix != 0).sum(axis=0)).flatten()
item_avg_ratings = np.array(rating_matrix.sum(axis=0)).flatten() / np.maximum(item_rating_counts, 1)

print(f'Item statistics:')
print(f'  Avg ratings per item: {item_rating_counts.mean():.1f}')
print(f'  Most rated item: {item_rating_counts.max()} ratings')
print(f'  Items with no ratings: {(item_rating_counts == 0).sum()}\n')

# User statistics
user_rating_counts = np.array((rating_matrix != 0).sum(axis=1)).flatten()
print(f'User statistics:')
print(f'  Avg ratings per user: {user_rating_counts.mean():.1f}')
print(f'  Most active user: {user_rating_counts.max()} ratings')

Rating Matrix Analysis
  Users: 1,000
  Items: 500
  Ratings: 25,000

Item statistics:
  Avg ratings per item: 50.0
  Most rated item: 75 ratings
  Items with no ratings: 0

User statistics:
  Avg ratings per user: 25.0
  Most active user: 48 ratings


## Summary

### Arithmetic Operations:
```python
# Standard operations
C = A + B  # Addition
C = A - B  # Subtraction
C = A @ B  # Matrix multiplication
C = A.multiply(B)  # Element-wise
C = A * scalar  # Scalar multiplication
```

### Slicing:
```python
# CSR: fast rows
row = A[i, :]
rows = A[[i, j, k], :]

# CSC: fast columns
A_csc = A.tocsc()
col = A_csc[:, j]
```

### Aggregation:
```python
row_sums = A.sum(axis=1)  # Sum each row
col_means = A.mean(axis=0)  # Average each column
total = A.sum()  # Total sum
max_val = A.max()  # Maximum element
```

### Performance Tips:
✓ **Use CSR for rows**: Fast row slicing, matrix-vector  
✓ **Use CSC for columns**: Fast column operations  
✓ **Avoid element access**: A[i,j] is slow, use slicing  
✓ **Batch operations**: Better than loops  
✓ **Keep sparse**: Operations preserve sparsity when possible  