# Efficiency Examples in Data Engineering

This notebook demonstrates three efficiency techniques for data engineering workflows:

- Optimize data processing with Pandas
- Implement batch processing
- Use code profiling to find where execution time is spent

## Optimize Data Processing with Pandas

### Before Optimization

In [None]:
import pandas as pd

# Unoptimized baseline: visit the frame one row at a time and keep a running
# per-customer total in a plain dict.
df = pd.read_csv("sales_data.csv")

total_sales = {}
for row_label, record in df.iterrows():
    customer_id = record["customer_id"]
    sales = record["quantity"] * record["price"]
    if customer_id not in total_sales:
        # First row seen for this customer: start its running total.
        total_sales[customer_id] = sales
    else:
        total_sales[customer_id] += sales

# Convert the {customer_id: total} mapping into a two-column frame.
result = pd.DataFrame(
    list(total_sales.items()),
    columns=["customer_id", "total_sales"],
)

print(result)

### After Optimization

In [None]:
import pandas as pd

df = pd.read_csv("sales_data.csv")

# Vectorized replacement for the row-by-row loop: compute every line total in
# one column operation, then let groupby do the per-customer summation.
# - assign() returns a new frame, so `df` keeps only its original columns.
# - The summed column is named "total_sales" so the output has the same
#   columns as the unoptimized version.
# Note: groupby sorts the output by customer_id by default.
result = (
    df.assign(total_sales=df["quantity"] * df["price"])
    .groupby("customer_id", as_index=False)["total_sales"]
    .sum()
)

print(result)

## Implement Batch Processing

### Before Batch Processing

In [None]:
import pandas as pd

# Baseline: read the whole file into a single DataFrame in one call.
df = pd.read_csv(filepath_or_buffer="transactions.csv")

# Sum the complete "amount" column at once.
total_amount = df.loc[:, "amount"].sum()

print(f"Total amount (without batch processing): {total_amount}")

### Using Batch Processing

In [None]:
import pandas as pd

# Function to process data in batches
def process_data_in_batches(file_path, batch_size):
    """Sum the "amount" column of a CSV, reading it in fixed-size chunks.

    The file is consumed chunk by chunk rather than being loaded as one
    DataFrame; each chunk contributes its partial sum to a running total.

    Args:
        file_path: Path or file-like object passed straight to ``pd.read_csv``.
        batch_size: Number of rows per chunk (``chunksize`` of ``pd.read_csv``).

    Returns:
        The sum of all "amount" values; ``0`` if no chunk is read.

    Raises:
        KeyError: If a chunk has no "amount" column.
    """
    total_amount = 0
    # Using the chunked reader as a context manager closes the underlying file
    # even when a chunk fails midway (e.g. the "amount" column is missing).
    with pd.read_csv(file_path, chunksize=batch_size) as reader:
        for chunk in reader:
            total_amount += chunk["amount"].sum()
    return total_amount


# Batch processing run: the file is read in chunks of 3 rows each.
file_path, batch_size = "transactions.csv", 3
total_amount = process_data_in_batches(file_path=file_path, batch_size=batch_size)

print(f"Total amount (with batch processing): {total_amount}")

## Use Code Profiling

### Code Profiling Example 1

In [None]:
import pandas as pd
import cProfile
import pstats


def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)


def calculate_total_amount(df):
    """Return the sum of the "amount" column of ``df``."""
    amounts = df["amount"]
    return amounts.sum()


def main():
    """Load transactions.csv, total its "amount" column, and print the total."""
    total_amount = calculate_total_amount(load_data("transactions.csv"))
    print(f"Total amount: {total_amount}")


if __name__ == "__main__":
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        main()
    finally:
        # Always stop profiling, even if main() raises; otherwise the profiler
        # stays enabled in the kernel after this cell has failed.
        profiler.disable()

    # Report the collected statistics ordered by cumulative time.
    stats = pstats.Stats(profiler).sort_stats("cumtime")
    stats.print_stats()

### Code Profiling Example 2

In [None]:
import pandas as pd
import cProfile
import pstats


def process_data_in_batches(file_path, batch_size):
    """Sum the "amount" column of a CSV, reading it in fixed-size chunks.

    The file is consumed chunk by chunk rather than being loaded as one
    DataFrame; each chunk contributes its partial sum to a running total.

    Args:
        file_path: Path or file-like object passed straight to ``pd.read_csv``.
        batch_size: Number of rows per chunk (``chunksize`` of ``pd.read_csv``).

    Returns:
        The sum of all "amount" values; ``0`` if no chunk is read.

    Raises:
        KeyError: If a chunk has no "amount" column.
    """
    total_amount = 0
    # Using the chunked reader as a context manager closes the underlying file
    # even when a chunk fails midway (e.g. the "amount" column is missing).
    with pd.read_csv(file_path, chunksize=batch_size) as reader:
        for chunk in reader:
            total_amount += chunk["amount"].sum()
    return total_amount


def main():
    """Sum transactions.csv in batches of 3 rows and print the total."""
    total_amount = process_data_in_batches(file_path="transactions.csv", batch_size=3)
    print(f"Total amount: {total_amount}")


if __name__ == "__main__":
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        main()
    finally:
        # Always stop profiling, even if main() raises; otherwise the profiler
        # stays enabled in the kernel after this cell has failed.
        profiler.disable()

    # Report the collected statistics ordered by cumulative time.
    stats = pstats.Stats(profiler).sort_stats("cumtime")
    stats.print_stats()