In [3]:
import pandas as pd
import numpy as np
import time

# 1. Vectorization
# 1.1. Built-in Pandas functions and operations
n = 10**6  # 1 million rows
df = pd.DataFrame({
    'A': np.random.rand(n),
    'B': np.random.rand(n)
})

start_time = time.time()
df['C'] = df['A'] + df['B']  # Vectorized addition
end_time = time.time()
print(f"\n1.1. Vectorized operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the vectorized operation will be printed.

# 2. Avoiding Loops
# 2.1. Pandas built-in functions and methods
start_time = time.time()
# df['D'] = 0  # Initialize outside the loop if needed, but not necessary for just timing
# for i in range(len(df)): # This is slow, avoid!
#     df['D'][i] = df['A'][i] * 2  # Non-vectorized multiplication
df['D'] = df['A'] * 2 # Vectorized approach, much faster
end_time = time.time()
print(f"\n2.1. (Corrected) Vectorized operation time: {end_time - start_time:.5f} seconds")  # Corrected to reflect vectorized time

start_time = time.time()
df['D'] = 0  # Initialize outside the loop if needed, but not necessary for just timing
for i in range(len(df)): # This is slow, avoid!
    df['D'][i] = df['A'][i] * 2  # Non-vectorized multiplication
end_time = time.time()
print(f"\n2.1. Non-vectorized operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the non-vectorized operation will be printed, and it will be significantly longer than the vectorized operation.

# 3. Using NumPy for Performance
# 3.1. Converting to NumPy arrays
start_time = time.time()
array_A = df['A'].to_numpy()
array_B = df['B'].to_numpy()
df['E'] = array_A * array_B  # Using NumPy for element-wise multiplication
end_time = time.time()
print(f"\n3.1. NumPy operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the NumPy operation will be printed.

# 4. Memory Usage Optimization
# 4.1. Appropriate data types
print("\n4.1. Memory usage before optimization:\n", df.info(memory_usage='deep'))
# Expected Output: The memory usage before optimization will be printed.

df['A'] = df['A'].astype('float32')  # Change to float32
df['B'] = df['B'].astype('float32')  # Change to float32
df['C'] = df['C'].astype('float32')  # Change to float32
df['D'] = df['D'].astype('float32')  # Change to float32
df['E'] = df['E'].astype('float32')  # Change to float32


print("\n4.1. Memory usage after optimization:\n", df.info(memory_usage='deep'))
# Expected Output: The memory usage after optimization will be printed, showing a reduction in memory consumption.

# 5. Using query() for Filtering
# 5.1. query() method
start_time = time.time()
filtered_df = df.query('A > 0.5 and B < 0.5')
end_time = time.time()
print(f"\n5.1. Query operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the query operation will be printed.

# 6. Using apply() Efficiently
# 6.1. apply() method
start_time = time.time()
df['F'] = df['A'].apply(lambda x: x * 2)  # Using apply
end_time = time.time()
print(f"\n6.1. Apply operation time: {end_time - start_time:.5f} seconds")
# Expected Output: The time taken for the apply operation will be printed, and it will be longer than the vectorized operations.

# 7. Profiling and Benchmarking
# 7.1. line_profiler, memory_profiler (Example with line_profiler - install with: pip install line_profiler)
# To use line_profiler:
# 1. Decorate the function you want to profile with @profile (you'll need to enable the line_profiler extension in your environment).
# 2. Save the code as a .py file (e.g., performance_test.py).
# 3. Run it from the command line using: kernprof -l -v performance_test.py  (This creates a performance_test.lprof file)
# 4. Then view the results with: python -m line_profiler performance_test.py.lprof

# @profile  # Uncomment to profile. Make sure to install line_profiler
def example_function():
    df['G'] = df['A'] + df['B']  # Example operation
    df['H'] = df['A'] * df['B']

# example_function() # Uncomment to run the function for profiling

print("\n7.1. Profiling example (run from command line with line_profiler):\n") # Note on how to run it
# Expected Output: Profiling results will show the time taken for each line of code.


1.1. Vectorized operation time: 0.00326 seconds

2.1. (Corrected) Vectorized operation time: 0.00305 seconds


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['D'][i] = df['A'][i] * 2  # Non-vectorized multiplication
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df


2.1. Non-vectorized operation time: 5.70325 seconds

3.1. NumPy operation time: 0.00217 seconds
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   A       1000000 non-null  float64
 1   B       1000000 non-null  float64
 2   C       1000000 non-null  float64
 3   D       1000000 non-null  float64
 4   E       1000000 non-null  float64
dtypes: float64(5)
memory usage: 38.1 MB

4.1. Memory usage before optimization:
 None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   A       1000000 non-null  float32
 1   B       1000000 non-null  float32
 2   C       1000000 non-null  float32
 3   D       1000000 non-null  float32
 4   E       1000000 non-null  float32
dtypes: float32(5)
memory usage: 19.1 MB

4.1. M

In [8]:
import pandas as pd
import time
from multiprocessing import Pool

def process_row(row_tuple):
    index, row = row_tuple
    try:
        return row['A'] * 2
    except Exception as e:
        print(f"Error processing row {index}: {e}")
        return None  # Or some other sentinel value (e.g., np.nan)

data = {
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
}
df = pd.DataFrame(data)

start_time = time.time()
with Pool(processes=4) as pool:  # You can adjust the number of processes
    results = pool.map(process_row, df.iterrows())

end_time = time.time()

df['Processed'] = results  # Assign the results back to the DataFrame

print(f"\nMultiprocessing operation time: {end_time - start_time:.5f} seconds")
print(df)


# Example with a larger DataFrame to demonstrate performance improvement:
import numpy as np

n = 100000  # Example size; adjust as needed
df_large = pd.DataFrame({'A': np.random.rand(n), 'B': np.random.rand(n)})

start_time = time.time()
with Pool(processes=4) as pool:
    results_large = pool.map(process_row, df_large.iterrows())
end_time = time.time()

df_large['Processed'] = results_large
print(f"\nMultiprocessing operation time (large DataFrame): {end_time - start_time:.5f} seconds")


# Example of a vectorized approach (for comparison):
start_time_vec = time.time()
df_large['Processed_vec'] = df_large['A'] * 2  # Vectorized operation
end_time_vec = time.time()

print(f"\nVectorized operation time (large DataFrame): {end_time_vec - start_time_vec:.5f} seconds")

Process SpawnPoolWorker-26:
Process SpawnPoolWorker-29:
Process SpawnPoolWorker-28:
Process SpawnPoolWorker-27:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.12/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/anaconda3/lib/python3.12/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/anaconda3/lib/python3.12/mu

KeyboardInterrupt: 