### Pandas Performance

In [1]:
import pandas as pd
import numpy as np
import time
from line_profiler import LineProfiler

# 1. Vectorization
# 1.1. Built-in Pandas functions and operations
n = 10**6  # 1 million rows
df = pd.DataFrame({
    'A': np.random.rand(n),
    'B': np.random.rand(n)
})

start_time = time.time()
df['C'] = df['A'] + df['B']  # Vectorized addition
end_time = time.time()
print(f"\n1.1. Vectorized operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the vectorized operation will be printed.

# 2. Avoiding Loops
# 2.1. Pandas built-in functions and methods
start_time = time.time()
df['D'] = df['A'] * 2 # Vectorized approach, much faster
end_time = time.time()
print(f"\n2.1. Vectorized operation time: {end_time - start_time:.5f} seconds")

start_time = time.time()
df_loop = df.copy() # Create a copy to avoid modifying the original df
df_loop['D_loop'] = 0
for i in range(len(df_loop)): # This is slow, avoid!
    df_loop['D_loop'].iloc[i] = df_loop['A'].iloc[i] * 2  # Non-vectorized multiplication (using .iloc for safer assignment)
end_time = time.time()
print(f"\n2.1. Non-vectorized operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the non-vectorized operation will be printed, and it will be significantly longer than the vectorized operation.

# 3. Using NumPy for Performance
# 3.1. Converting to NumPy arrays
start_time = time.time()
array_A = df['A'].to_numpy()
array_B = df['B'].to_numpy()
df['E'] = array_A * array_B  # Using NumPy for element-wise multiplication
end_time = time.time()
print(f"\n3.1. NumPy operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the NumPy operation will be printed.

# 4. Memory Usage Optimization
# 4.1. Appropriate data types
print("\n4.1. Memory usage before optimization:\n", df.info(memory_usage='deep'))
# Expected Output: The memory usage before optimization will be printed.

df['A'] = df['A'].astype('float32')  # Change to float32
df['B'] = df['B'].astype('float32')  # Change to float32
df['C'] = df['C'].astype('float32')  # Change to float32
df['D'] = df['D'].astype('float32')  # Change to float32
df['E'] = df['E'].astype('float32')  # Change to float32


print("\n4.1. Memory usage after optimization:\n", df.info(memory_usage='deep'))
# Expected Output: The memory usage after optimization will be printed, showing a reduction in memory consumption.

# 5. Using query() for Filtering
# 5.1. query() method
start_time = time.time()
filtered_df = df.query('A > 0.5 and B < 0.5')
end_time = time.time()
print(f"\n5.1. Query operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the query operation will be printed.

# 6. Using apply() Efficiently
# 6.1. apply() method (demonstrating a scenario where it might be necessary)
def complex_operation(row):
    if row['A'] > 0.5 and row['B'] < 0.3:
        return row['A']**2 + np.log(row['B'] + 0.1)
    else:
        return row['A'] - row['B']

start_time = time.time()
df['F'] = df.apply(complex_operation, axis=1)  # Using apply for row-wise complex logic
end_time = time.time()
print(f"\n6.1. Apply operation time (complex logic): {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the apply operation will be printed.

# 7. Profiling and Benchmarking
# 7.1. line_profiler, memory_profiler (Example with line_profiler - install with: pip install line_profiler)
# To use line_profiler:
# 1. Save the code as a .py file (e.g., performance_test.py).
# 2. Decorate the function you want to profile with @profile (you'll need to install line_profiler: pip install line_profiler).
# 3. Run it from the command line using: kernprof -l performance_test.py
# 4. Then view the results with: python -m line_profiler performance_test.py.lprof

def example_function_to_profile(df_profile):
    df_profile['G'] = df_profile['A'] + df_profile['B']  # Example operation
    df_profile['H'] = df_profile['A'] * df_profile['B']
    return df_profile

if __name__ == "__main__":
    n_profile = 10**5
    df_profile = pd.DataFrame({
        'A': np.random.rand(n_profile),
        'B': np.random.rand(n_profile)
    })

    lp = LineProfiler()
    lp_wrapper = lp(example_function_to_profile)
    lp_wrapper(df_profile.copy()) # Pass a copy to avoid modifying the original df
    lp.print_stats()

    print("\n7.1. Profiling example (ran using line_profiler):\n") # Note on how to run it
# Expected Output: Profiling results will show the time taken for each line of code when run with line_profiler.


1.1. Vectorized operation time: 0.00240 seconds

2.1. Vectorized operation time: 0.00151 seconds


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_loop['D_loop'].iloc[i] = df_loop['A'].iloc[i] * 2  # Non-vectorized multiplication (using .iloc for safer assignment)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/


2.1. Non-vectorized operation time: 20.38242 seconds

3.1. NumPy operation time: 0.00232 seconds
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   A       1000000 non-null  float64
 1   B       1000000 non-null  float64
 2   C       1000000 non-null  float64
 3   D       1000000 non-null  float64
 4   E       1000000 non-null  float64
dtypes: float64(5)
memory usage: 38.1 MB

4.1. Memory usage before optimization:
 None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   A       1000000 non-null  float32
 1   B       1000000 non-null  float32
 2   C       1000000 non-null  float32
 3   D       1000000 non-null  float32
 4   E       1000000 non-null  float32
dtypes: float32(5)
memory usage: 19.1 MB

4.1. 

In [2]:
import pandas as pd
import numpy as np
import time

# 1. Vectorization
n = 10**6
df = pd.DataFrame({'A': np.random.rand(n), 'B': np.random.rand(n)})
start_time = time.time()
df['C'] = df['A'] + df['B']
end_time = time.time()
print(f"Vectorized addition time: {end_time - start_time:.5f} seconds")

# 2. Avoiding Loops
start_time = time.time()
df['D'] = df['A'] * 2
end_time = time.time()
print(f"Vectorized multiplication time: {end_time - start_time:.5f} seconds")

df_loop_slow = df.copy()
start_time = time.time()
df_loop_slow['D_loop_slow'] = 0.0  # Initialize with float dtype
for i in range(len(df_loop_slow)):
    df_loop_slow.loc[i, 'D_loop_slow'] = df_loop_slow.loc[i, 'A'] * 2
end_time = time.time()
print(f"Non-vectorized multiplication time: {end_time - start_time:.5f} seconds")

# 3. Using NumPy
array_A = df['A'].to_numpy()
array_B = df['B'].to_numpy()
start_time = time.time()
df['E'] = array_A * array_B
end_time = time.time()
print(f"NumPy operation time: {end_time - start_time:.5f} seconds")

# 4. Memory Usage Optimization
print("\nMemory usage before optimization:\n", df.info(memory_usage='deep'))
df['A'] = df['A'].astype('float32')
df['B'] = df['B'].astype('float32')
df['C'] = df['C'].astype('float32')
df['D'] = df['D'].astype('float32')
df['E'] = df['E'].astype('float32')
print("\nMemory usage after optimization:\n", df.info(memory_usage='deep'))

# 5. Using query()
start_time = time.time()
filtered_df = df.query('A > 0.5 and B < 0.5')
end_time = time.time()
print(f"Query operation time: {end_time - start_time:.5f} seconds")

# 6. Using apply() (example with complex logic)
def complex_operation(row):
    if row['A'] > 0.5 and row['B'] < 0.3:
        return row['A']**2 + np.log(row['B'] + 0.1)
    else:
        return row['A'] - row['B']
start_time = time.time()
df['F'] = df.apply(complex_operation, axis=1)
end_time = time.time()
print(f"Apply operation time (complex logic): {end_time - start_time:.5f} seconds")

Vectorized addition time: 0.00173 seconds
Vectorized multiplication time: 0.00119 seconds
Non-vectorized multiplication time: 42.66623 seconds
NumPy operation time: 0.00185 seconds
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   A       1000000 non-null  float64
 1   B       1000000 non-null  float64
 2   C       1000000 non-null  float64
 3   D       1000000 non-null  float64
 4   E       1000000 non-null  float64
dtypes: float64(5)
memory usage: 38.1 MB

Memory usage before optimization:
 None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   A       1000000 non-null  float32
 1   B       1000000 non-null  float32
 2   C       1000000 non-null  float32
 3   D       1000000 non-null  float32
 4   E  