In [3]:

from kazuri import kazuri
# Send a natural-language prompt to kazuri. According to this cell's recorded
# output, the generated code is extracted and inserted into a new cell below.
# NOTE(review): the generated code is unreviewed output — read and test it before relying on it.
kazuri("create a python function with parallelism that takes a dataframe and gets groupby sums for all columns on column 'msisdn' add prefix as '_sum'")

Code extracted and inserted into new cell.
Check cell below for generated code


In [None]:
import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def _groupby_sum_chunk(chunk: pd.DataFrame, group_col: str = 'msisdn',
                       prefix: str = '_sum') -> pd.DataFrame:
    """Return per-``group_col`` sums of ``chunk`` with ``prefix`` prepended to each column name.

    Defined at module level (not nested inside the caller) because
    ProcessPoolExecutor has to pickle the callable it sends to worker
    processes, and locally defined functions cannot be pickled.
    """
    return chunk.groupby(group_col).sum().add_prefix(prefix)


def parallel_groupby_sum(df: pd.DataFrame, num_processes: int = 4,
                         group_col: str = 'msisdn',
                         prefix: str = '_sum') -> pd.DataFrame:
    """
    Compute group-by sums of all columns of a DataFrame using worker processes.

    The frame is cut into contiguous row slices, each slice is summed per
    ``group_col`` in a separate process, and the partial sums are then summed
    again per key to produce the final totals.

    Args:
        df (pd.DataFrame): Input DataFrame; must be groupable by ``group_col``.
        num_processes (int): Maximum number of worker processes to use.
        group_col (str): Column to group by. Defaults to 'msisdn'.
        prefix (str): String prepended to every aggregated column name.
            With the default, a column ``value1`` becomes ``_sumvalue1``.

    Returns:
        pd.DataFrame: One row per distinct key (as the index), with the
        prefixed, summed columns.

    Raises:
        ValueError: If ``num_processes`` is smaller than 1.
        KeyError: If ``group_col`` cannot be resolved by ``DataFrame.groupby``.

    Note:
        Worker processes must be able to import the helper function. When this
        code lives in a notebook cell and the platform starts processes with
        "spawn" (e.g. Windows, macOS), move both functions into a ``.py``
        module and import them from there.
    """
    if num_processes < 1:
        raise ValueError("num_processes must be a positive integer")

    # functools.partial of a module-level function stays picklable.
    worker = partial(_groupby_sum_chunk, group_col=group_col, prefix=prefix)

    # Never create more chunks than rows: empty chunks only add overhead.
    n_rows = len(df)
    n_chunks = min(num_processes, n_rows)

    # Empty frame or a single chunk: a process pool would be pure overhead.
    if n_chunks <= 1:
        return worker(df)

    # Slice by position instead of np.array_split(df, ...), which relies on a
    # deprecated pandas code path when handed a DataFrame.
    bounds = np.linspace(0, n_rows, n_chunks + 1, dtype=int)
    chunks = [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    with ProcessPoolExecutor(max_workers=n_chunks) as executor:
        partial_sums = list(executor.map(worker, chunks))

    # A key may appear in several chunks, so the partial sums are summed again.
    return pd.concat(partial_sums).groupby(level=0).sum()

# Demo: run the aggregation on a small hand-built frame
if __name__ == "__main__":
    # Six rows spread over three distinct msisdn keys
    data = dict(
        msisdn=list('ABACBC'),
        value1=list(range(1, 7)),
        value2=list(range(10, 70, 10)),
    )
    df = pd.DataFrame(data)

    # Aggregate per msisdn and print the totals
    result = parallel_groupby_sum(df)
    print(result)