# Performance Optimization

In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" regenerates the
# exact same demo frame (and therefore the same outputs) on every run.
SEED = 42
np.random.seed(SEED)

# Two columns of 1,000 random integers each, drawn from [1, 100).
# The dict is evaluated in order, so 'a' is drawn before 'b'.
df = pd.DataFrame({
    'a': np.random.randint(1, 100, 1_000),
    'b': np.random.randint(1, 100, 1_000)
})

df.head()

Unnamed: 0,a,b
0,57,68
1,30,25
2,91,98
3,78,6
4,99,89


## Vectorization

In [3]:
# Column-wise (vectorized) addition: the whole 'a' and 'b' columns are added
# in one operation, with no Python-level iteration over rows.
df['sum_vec'] = df['a'].add(df['b'])
df.head()

Unnamed: 0,a,b,sum_vec
0,57,68,125
1,30,25,55
2,91,98,189
3,78,6,84
4,99,89,188


## Avoiding loops

In [4]:
# Row-wise alternative to an explicit for-loop: the lambda is invoked once per
# row (axis='columns' is the named alias of axis=1) and its results are
# collected into a new column. This still runs Python code per row, so it is
# slower than the vectorized `sum_vec` computation above.
df['sum_apply'] = df.apply(
    lambda row: row['a'] + row['b'],
    axis='columns',
)
df.head()

Unnamed: 0,a,b,sum_vec,sum_apply
0,57,68,125,125
1,30,25,55,55
2,91,98,189,189
3,78,6,84,84
4,99,89,188,188


## Efficient data types

In [5]:
# Show the dtype currently assigned to every column of df.
df.dtypes

a            int32
b            int32
sum_vec      int32
sum_apply    int32
dtype: object

In [6]:
# Re-store the two input columns as 16-bit integers; the derived sum columns
# are left untouched.
for col in ('a', 'b'):
    df[col] = df[col].astype('int16')

# Display the dtypes again to confirm the narrower storage for 'a' and 'b'.
df.dtypes

a            int16
b            int16
sum_vec      int32
sum_apply    int32
dtype: object

## Chunk processing

In [7]:
# Read the CSV in 50-row pieces instead of loading it in one go.
# With `chunksize` set, read_csv returns an iterable reader; using it as a
# context manager guarantees the underlying file/HTTP handle is closed even
# if the loop raises part-way through.
with pd.read_csv(
    'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv',
    chunksize=50
) as chunks:
    total = 0
    for chunk in chunks:
        # Accumulate the per-chunk sum of the 'total_bill' column.
        total += chunk['total_bill'].sum()

total

np.float64(4827.77)

## Memory optimization

In [8]:
# Report the memory footprint, in bytes, of the index and of each column.
# deep=True asks pandas to also introspect object-dtype contents, if any.
df.memory_usage(deep=True)

Index         132
a            2000
b            2000
sum_vec      4000
sum_apply    4000
dtype: int64

In [9]:
# Let pandas choose the smallest integer dtype able to hold each column's
# values, then store the downcast column back into df.
for name in ['a', 'b']:
    df[name] = pd.to_numeric(df[name], downcast='integer')

# Re-check the per-column memory footprint after downcasting.
df.memory_usage(deep=True)

Index         132
a            1000
b            1000
sum_vec      4000
sum_apply    4000
dtype: int64

## Copy vs view

In [10]:
# Select columns 'a' and 'b' with a list of labels and bind the result to
# df_slice, without an explicit .copy().
# NOTE(review): list-based column selection typically yields a new DataFrame
# object rather than a true view of df, and the exact copy/view behaviour
# depends on the pandas version and copy-on-write setting — confirm before
# relying on writes to df_slice propagating (or not propagating) to df.
df_slice = df[['a', 'b']]
df_slice.head()

Unnamed: 0,a,b
0,57,68
1,30,25
2,91,98
3,78,6
4,99,89


In [11]:
# Same two-column selection, followed by an explicit .copy() so that df_copy
# is requested as its own copy of the selected data (DataFrame.copy defaults
# to a deep copy).
df_copy = df[['a', 'b']].copy()
df_copy.head()

Unnamed: 0,a,b
0,57,68
1,30,25
2,91,98
3,78,6
4,99,89
