### Sparse data

In [8]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sys

# 1. Sparse Arrays
# Build a SparseArray from a dense NumPy array and inspect its pieces:
# the dtype, the explicitly stored values, and the fill value.
print("--- 1. Sparse Arrays ---")
dense_array = np.array([0, 0, 1, 0, 2, 0, 0, 3])
sparse_array = pd.arrays.SparseArray(dense_array)  # no explicit fill_value passed
print(f"Sparse Array:\n {sparse_array!s}")
print(f"Sparse Array dtype: {sparse_array.dtype!s}")
print(f"Sparse Array sp_values: {sparse_array.sp_values!s}")
print(f"Sparse Array fill_value: {sparse_array.fill_value!s}")

# Same idea, but NaN is declared as the fill value.
sparse_array_nan = pd.arrays.SparseArray(
    [np.nan, 1, np.nan, 2],
    fill_value=np.nan,
)
print(f"\nSparse Array with NaN fill_value:\n {sparse_array_nan!s}")
print(f"Sparse Array fill_value: {sparse_array_nan.fill_value!s}")

# 2. Sparse Series
# Convert a dense Series to a sparse one and look at it through the
# `.sparse` accessor.
print("\n--- 2. Sparse Series ---")
dense_series = pd.Series([0, 0, 1, 0, 2, 0])
sparse_series = dense_series.astype(pd.SparseDtype(int, fill_value=0))
print(f"Sparse Series from dense:\n {sparse_series!s}")
print(f"Sparse Series dtype: {sparse_series.dtype!s}")
print(f"Sparse Series sparse_accessor.fill_value: {sparse_series.sparse.fill_value!s}")
print(f"Sparse Series sparse_accessor.sp_values:\n {sparse_series.sparse.sp_values!s}")

# Index labels of the entries that differ from the fill value.
non_fill_mask = sparse_series != sparse_series.sparse.fill_value
print(f"Positions of non-fill values:\n {sparse_series[non_fill_mask].index!s}")

# A sparse Series can also be requested directly via `dtype=`.
sparse_series_direct = pd.Series(
    [1, 0, 2, 0, 3],
    dtype=pd.SparseDtype(int, fill_value=0),
)
print(f"\nDirectly created Sparse Series:\n {sparse_series_direct!s}")

# 3. Sparse DataFrame
# A DataFrame is "sparse" when its columns carry a Sparse dtype.
print("\n--- 3. Sparse DataFrame ---")

# Built column by column from a SparseArray and a sparse Series.
sparse_columns = {
    'A': pd.arrays.SparseArray([0, 1, 0, 0, 2], fill_value=0),
    'B': pd.Series([0, 0, 3, 0, 0], dtype=pd.SparseDtype(int, fill_value=0)),
}
sparse_df = pd.DataFrame(sparse_columns)
print(f"Sparse DataFrame from sparse arrays/series:\n {sparse_df!s}")
print(f"Sparse DataFrame dtypes:\n {sparse_df.dtypes!s}")

# Built by casting every column of an existing dense DataFrame.
dense_df = pd.DataFrame(
    [[0, 1, 0], [0, 0, 2], [3, 0, 0]],
    columns=['X', 'Y', 'Z'],
)
sparse_df_from_dense = dense_df.astype(pd.SparseDtype(int, fill_value=0))
print(f"\nSparse DataFrame from dense DataFrame:\n {sparse_df_from_dense!s}")
print(f"Sparse DataFrame dtypes:\n {sparse_df_from_dense.dtypes!s}")

# 4. Creating Sparse Data Structures (More examples)
print("\n--- 4. Creating Sparse Data Structures (More examples) ---")

# SparseArray with the fill value spelled out.
sparse_array_explicit = pd.arrays.SparseArray([1, 0, 2, 0, 3], fill_value=0)
print(f"Explicit SparseArray:\n {sparse_array_explicit!s}")

# Float Series in which NaN is the fill value.
sparse_series_constructor = pd.Series(
    [10, np.nan, 20, np.nan],
    dtype=pd.SparseDtype(float, fill_value=np.nan),
)
print(f"\nSparse Series constructor:\n {sparse_series_constructor!s}")

# One sparse column next to one ordinary (dense) list-backed column.
sparse_df_dict = pd.DataFrame(
    {
        'col1': pd.Series([1, 0, 0, 2], dtype=pd.SparseDtype(int, fill_value=0)),
        'col2': [5, 0, 6, 0],  # mixed sparse and dense
    }
)
print(f"\nSparse DataFrame with mixed columns:\n {sparse_df_dict!s}")

# 5. Sparse Dtypes
# A SparseDtype pairs an element type with the value that is left unstored.
print("\n--- 5. Sparse Dtypes ---")
sparse_dtype_int = pd.SparseDtype(int, fill_value=0)
sparse_dtype_float_nan = pd.SparseDtype(float, fill_value=np.nan)

# Integer data, zeros treated as the fill value.
sparse_series_typed = pd.Series([1, 0, 2, 0], dtype=sparse_dtype_int)
print(f"Sparse Series with explicit int dtype:\n {sparse_series_typed!s}")
print(f"Sparse Series dtype: {sparse_series_typed.dtype!s}")

# Float data, NaN treated as the fill value.
sparse_series_typed_nan = pd.Series(
    [1.0, np.nan, 2.0, np.nan],
    dtype=sparse_dtype_float_nan,
)
print(f"\nSparse Series with explicit float NaN dtype:\n {sparse_series_typed_nan!s}")
print(f"Sparse Series dtype: {sparse_series_typed_nan.dtype!s}")

# 6. Operations on Sparse Data
# Arithmetic between sparse and dense Series, and between sparse DataFrames.
print("\n--- 6. Operations on Sparse Data ---")
s1 = pd.Series([1, 0, 2, 0], dtype=pd.SparseDtype(int, fill_value=0))
s2 = pd.Series([1, 2, 3, 4])
print(f"Sparse Series s1:\n {s1!s}")
print(f"Dense Series s2:\n {s2!s}")
series_sum = s1 + s2
print(f"s1 + s2:\n {series_sum!s}")
series_doubled = s1 * 2
print(f"s1 * 2:\n {series_doubled!s}")

# Two single-column sparse frames sharing the same column label.
sparse_df_op1 = pd.DataFrame(
    {'A': pd.Series([1, 0, 2], dtype=pd.SparseDtype(int, fill_value=0))}
)
sparse_df_op2 = pd.DataFrame(
    {'A': pd.Series([3, 0, 1], dtype=pd.SparseDtype(int, fill_value=0))}
)
print(f"\nSparse DataFrame op1:\n {sparse_df_op1!s}")
print(f"Sparse DataFrame op2:\n {sparse_df_op2!s}")
frame_sum = sparse_df_op1 + sparse_df_op2
print(f"sparse_df_op1 + sparse_df_op2:\n {frame_sum!s}")

# 7. Indexing and Selection
# Label lookup, boolean masking and list-of-labels selection on a sparse Series.
print("\n--- 7. Indexing and Selection ---")
sparse_s_index = pd.Series(
    [10, 0, 20, 0, 30],
    index=['a', 'b', 'c', 'd', 'e'],
    dtype=pd.SparseDtype(int, fill_value=0),
)
print(f"Sparse Series with index:\n {sparse_s_index!s}")

value_at_c = sparse_s_index['c']
print(f"sparse_s_index['c']: {value_at_c!s}")

nonzero_entries = sparse_s_index[sparse_s_index != 0]
print(f"sparse_s_index[sparse_s_index != 0]:\n {nonzero_entries!s}")

picked_labels = sparse_s_index[['a', 'e']]
print(f"sparse_s_index[['a', 'e']]:\n {picked_labels!s}")
# 8. Memory Usage (Comparison with sparse matrix from scipy)
# All sizes below are payload bytes, so the four numbers are comparable.
print("\n--- 8. Memory Usage Comparison ---")
sparse_matrix = csr_matrix([[1, 0, 0, 4], [0, 0, 3, 0], [0, 2, 0, 0]])
df_from_sparse = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=['A', 'B', 'C', 'D'])

# sys.getsizeof() on the CSR matrix would only count the Python wrapper
# object, not the arrays that actually hold the matrix. A CSR matrix keeps
# its content in three NumPy buffers (data, indices, indptr), so their
# combined nbytes is the real storage cost.
sparse_matrix_size = int(
    sparse_matrix.data.nbytes
    + sparse_matrix.indices.nbytes
    + sparse_matrix.indptr.nbytes
)
# memory_usage(deep=True) reports per-column (and index) bytes for a DataFrame.
sparse_dataframe_size = int(df_from_sparse.memory_usage(deep=True).sum())
print(f"Size of the sparse matrix: {sparse_matrix_size} bytes")
print(f"Size of the sparse dataframe: {sparse_dataframe_size} bytes")

# Dense equivalent of the same matrix.
dense_df_comparison = pd.DataFrame(sparse_matrix.toarray(), columns=['A', 'B', 'C', 'D'])
dense_dataframe_size = int(dense_df_comparison.memory_usage(deep=True).sum())
print(f"Size of the dense dataframe: {dense_dataframe_size} bytes")

# Dense frame converted back to sparse columns.
sparse_df_comparison = dense_df_comparison.astype(pd.SparseDtype(int, fill_value=0))
sparse_df_comparison_size = int(sparse_df_comparison.memory_usage(deep=True).sum())
print(f"Size of the sparse dataframe (from dense): {sparse_df_comparison_size} bytes")

# 9. Sparse Arrays (Revisited with direct creation)
# A SparseArray built straight from a Python list, with 0 as the fill value.
print("\n--- 9. Sparse Arrays (Revisited with direct creation) ---")
sparse_array_direct_creation = pd.arrays.SparseArray(
    [1, 0, 0, 2, 0],
    fill_value=0,
)
print(f"Directly created SparseArray:\n {sparse_array_direct_creation!s}")

# 10. When to Use (Illustrative Example)
# A 1000x1000 grid drawn so that 0 is chosen with probability 0.95,
# compared in dense and sparse form.
print("\n--- 10. When to Use (Illustrative Example) ---")
large_sparse_data = np.random.choice(
    [0, 1, 2, 3],
    size=(1000, 1000),
    p=[0.95, 0.02, 0.02, 0.01],
)
large_dense_df = pd.DataFrame(large_sparse_data)
large_sparse_df = large_dense_df.astype(pd.SparseDtype(int, fill_value=0))

large_dense_bytes = sys.getsizeof(large_dense_df)
large_sparse_bytes = sys.getsizeof(large_sparse_df)
print(f"Size of large dense DataFrame: {large_dense_bytes} bytes")
print(f"Size of large sparse DataFrame: {large_sparse_bytes} bytes")
print("Note the potential significant memory saving for highly sparse data.")

# 11. Limitations (Illustrative Example - Low Sparsity)
# Every value is drawn from 1..4, so no entry equals the fill value 0 and
# the sparse columns have nothing to leave out.
print("\n--- 11. Limitations (Illustrative Example - Low Sparsity) ---")
low_sparse_data = np.random.randint(1, 5, size=(100, 100))
low_dense_df = pd.DataFrame(low_sparse_data)
low_sparse_df = pd.DataFrame(low_sparse_data).astype(
    pd.SparseDtype(int, fill_value=0)
)

low_dense_bytes = sys.getsizeof(low_dense_df)
low_sparse_bytes = sys.getsizeof(low_sparse_df)
print(f"Size of low sparsity dense DataFrame: {low_dense_bytes} bytes")
print(f"Size of low sparsity sparse DataFrame: {low_sparse_bytes} bytes")
print("For low sparsity, the sparse representation might not save much or could even be larger.")


--- 1. Sparse Arrays ---
Sparse Array:
 [0, 0, 1, 0, 2, 0, 0, 3]
Fill: 0
IntIndex
Indices: array([2, 4, 7], dtype=int32)

Sparse Array dtype: Sparse[int64, 0]
Sparse Array sp_values: [1 2 3]
Sparse Array fill_value: 0

Sparse Array with NaN fill_value:
 [nan, 1.0, nan, 2.0]
Fill: nan
IntIndex
Indices: array([1, 3], dtype=int32)

Sparse Array fill_value: nan

--- 2. Sparse Series ---
Sparse Series from dense:
 0    0
1    0
2    1
3    0
4    2
5    0
dtype: Sparse[int64, 0]
Sparse Series dtype: Sparse[int64, 0]
Sparse Series sparse_accessor.fill_value: 0
Sparse Series sparse_accessor.sp_values:
 [1 2]
Positions of non-fill values:
 Index([2, 4], dtype='int64')

Directly created Sparse Series:
 0    1
1    0
2    2
3    0
4    3
dtype: Sparse[int64, 0]

--- 3. Sparse DataFrame ---
Sparse DataFrame from sparse arrays/series:
    A  B
0  0  0
1  1  0
2  0  3
3  0  0
4  2  0
Sparse DataFrame dtypes:
 A    Sparse[int64, 0]
B    Sparse[int64, 0]
dtype: object

Sparse DataFrame from dense Dat