# Exercise 05 : Pandas optimizations

In [32]:
import pandas as pd
import gc

## read the fines.csv that you saved in the previous exercise

In [33]:
# Load the fines dataset saved by the previous exercise.
fines_csv = '../data/fines.csv'
df = pd.read_csv(fines_csv)

## iterations: in all the following subtasks, you need to calculate fines/refund*year for<br>each row and create a new column with the calculated data and measure the time<br>using the magic command %%timeit in the cell

- loop: write a function that iterates through the dataframe using for i in<br>range(0, len(df)), iloc and append() to a list, assign the result of the function to a new column in the dataframe

In [34]:
%%timeit
new_col = []
for i in range(len(df)):
    new_col.append(df.iloc[i]['Fines'] / (df.iloc[i]['Refund'] * df.iloc[i]['Year']))
df['calculate_column'] = new_col

293 ms ± 3.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
# Display the new column to check the values produced by the loop cell.
df['calculate_column']

0     0.80
1     3.26
2     1.06
3     0.50
4     2.83
      ... 
925   0.25
926   3.77
927   0.25
928   2.17
929   0.50
Name: calculate_column, Length: 930, dtype: float64

- do it using iterrows()

In [36]:
%%time
new_col = []
for i, row in df.iterrows():
    new_col.append(row['Fines'] / (row['Refund'] * row['Year']))
df['calculate_column'] = new_col

CPU times: user 47.1 ms, sys: 1.31 ms, total: 48.4 ms
Wall time: 47.8 ms


In [37]:
# Display the column again to check the iterrows() result.
df['calculate_column']

0     0.80
1     3.26
2     1.06
3     0.50
4     2.83
      ... 
925   0.25
926   3.77
927   0.25
928   2.17
929   0.50
Name: calculate_column, Length: 930, dtype: float64

- do it using apply() and lambda function

In [38]:
%%time
df['calculate_column'] = df.apply(lambda row: row['Fines'] / (row['Refund'] * row['Year']), axis=1)

CPU times: user 15.1 ms, sys: 1.12 ms, total: 16.2 ms
Wall time: 15.9 ms


In [39]:
# Display the column again to check the apply() result.
df['calculate_column']

0     0.80
1     3.26
2     1.06
3     0.50
4     2.83
      ... 
925   0.25
926   3.77
927   0.25
928   2.17
929   0.50
Name: calculate_column, Length: 930, dtype: float64

- do it using Series objects from the dataframe

In [40]:
%%time
df['calculate_column'] = df['Fines'] / (df['Refund'] * df['Year'])

CPU times: user 840 µs, sys: 536 µs, total: 1.38 ms
Wall time: 10 ms


In [41]:
# Display the column again to check the Series-arithmetic result.
df['calculate_column']

0     0.80
1     3.26
2     1.06
3     0.50
4     2.83
      ... 
925   0.25
926   3.77
927   0.25
928   2.17
929   0.50
Name: calculate_column, Length: 930, dtype: float64

- do it as in the previous subtask but with the method .values

In [42]:
%%time
df['calculate_column'] = df['Fines'].values / (df['Refund'].values * df['Year'].values)

CPU times: user 770 µs, sys: 359 µs, total: 1.13 ms
Wall time: 851 µs


In [43]:
# Display the column again to check the .values result.
df['calculate_column']

0     0.80
1     3.26
2     1.06
3     0.50
4     2.83
      ... 
925   0.25
926   3.77
927   0.25
928   2.17
929   0.50
Name: calculate_column, Length: 930, dtype: float64

## indexing: measure the time using the magic command %%timeit in the cell

- get a row for a specific CarNumber, for example, `'O136HO197RUS'`

In [44]:
%%time
df.loc[df['CarNumber'] == 'O136HO197RUS']

CPU times: user 808 µs, sys: 693 µs, total: 1.5 ms
Wall time: 2.22 ms


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,calculate_column
715,O136HO197RUS,2,7800.0,Toyota,Corolla,1999,1.95
902,O136HO197RUS,2,7800.0,Toyota,Corolla,1991,1.96


- set the index in your dataframe with CarNumber

In [45]:
%%time
df = df.set_index('CarNumber')

CPU times: user 844 µs, sys: 714 µs, total: 1.56 ms
Wall time: 3.23 ms


- again, get a row for the same CarNumber

In [46]:
%%time
# Same lookup as before, now by label on the CarNumber index set in the
# previous cell.
df.loc['O136HO197RUS']

CPU times: user 909 µs, sys: 543 µs, total: 1.45 ms
Wall time: 3.34 ms


Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calculate_column
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
O136HO197RUS,2,7800.0,Toyota,Corolla,1999,1.95
O136HO197RUS,2,7800.0,Toyota,Corolla,1991,1.96


## downcasting:

- run `df.info(memory_usage='deep')`, pay attention to the Dtype and the memory usage

In [47]:
%%time
# Baseline report of dtypes and memory usage before any optimisation.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to T171CC96RUS
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Refund            930 non-null    int64  
 1   Fines             930 non-null    float64
 2   Make              930 non-null    object 
 3   Model             919 non-null    object 
 4   Year              930 non-null    int64  
 5   calculate_column  930 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 236.0 KB
CPU times: user 6.49 ms, sys: 4.41 ms, total: 10.9 ms
Wall time: 15.2 ms


- make a copy() of your initial dataframe into another dataframe optimized

In [48]:
# Work on an independent copy so the initial dataframe stays untouched;
# deep=True is the default and is spelled out here for clarity.
df_copy = df.copy(deep=True)
df_copy

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calculate_column
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.00,Ford,Focus,1989,0.80
E432XX77RUS,1,6500.00,Toyota,Camry,1995,3.26
7184TT36RUS,1,2100.00,Ford,Focus,1984,1.06
X582HE161RUS,2,2000.00,Ford,Focus,2015,0.50
92918M178RUS,1,5700.00,Ford,Focus,2014,2.83
...,...,...,...,...,...,...
C584EY154RUS,1,500.00,Ford,Focus,2014,0.25
T395KX197RUS,2,15200.00,Ford,Focus,2016,3.77
8441XX154RUS,1,500.00,Ford,Focus,1994,0.25
C590EY154RUS,2,8594.60,Ford,Focus,1982,2.17


- downcast from float64 to float32 for all the columns

In [49]:
# Downcast every float64 column that is present rather than a hard-coded list,
# so the cell keeps matching the task ("all the columns") if columns change.
# Re-running is harmless: with no float64 columns left the mapping is empty.
float64_columns = df_copy.select_dtypes(include='float64').columns
df_copy = df_copy.astype({column: 'float32' for column in float64_columns})
df_copy

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calculate_column
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.00,Ford,Focus,1989,0.80
E432XX77RUS,1,6500.00,Toyota,Camry,1995,3.26
7184TT36RUS,1,2100.00,Ford,Focus,1984,1.06
X582HE161RUS,2,2000.00,Ford,Focus,2015,0.50
92918M178RUS,1,5700.00,Ford,Focus,2014,2.83
...,...,...,...,...,...,...
C584EY154RUS,1,500.00,Ford,Focus,2014,0.25
T395KX197RUS,2,15200.00,Ford,Focus,2016,3.77
8441XX154RUS,1,500.00,Ford,Focus,1994,0.25
C590EY154RUS,2,8594.60,Ford,Focus,1982,2.17


-  downcast from int64 to the smallest numerical dtype possible

In [50]:
# Downcast every int64 column that is present rather than a hard-coded list.
# pd.to_numeric(..., downcast='integer') picks the smallest integer dtype
# able to hold the column's values.
for column in df_copy.select_dtypes(include='int64').columns:
    df_copy[column] = pd.to_numeric(df_copy[column], downcast='integer')

- run `info(memory_usage='deep')` for your new dataframe, pay attention to the<br>Dtype and the memory usage

In [51]:
# Report dtypes and memory usage after the numeric downcasts.
df_copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to T171CC96RUS
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Refund            930 non-null    int8   
 1   Fines             930 non-null    float32
 2   Make              930 non-null    object 
 3   Model             919 non-null    object 
 4   Year              930 non-null    int16  
 5   calculate_column  930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(2)
memory usage: 216.9 KB


## categories:

- change the object type columns to the type category

In [52]:
# Move CarNumber from the index back into a regular column so it can be
# converted to a category like the other text columns.
# Guarded so that re-running the cell does not push the fresh RangeIndex
# into a spurious 'index' column.
if df_copy.index.name == 'CarNumber':
    df_copy = df_copy.reset_index()

In [53]:
# Convert the text columns to the category dtype in a single astype call.
# (The former bare `df_copy.reset_index()` here discarded its result and had
# no effect, so it was dropped.)
category_columns = ['CarNumber', 'Make', 'Model']
df_copy = df_copy.astype({column: 'category' for column in category_columns})

- This time, check the memory usage: it should be roughly 2-3 times lower<br>than for the initial dataframe

In [54]:
# Report dtypes and memory usage after the category conversion.
df_copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   CarNumber         930 non-null    category
 1   Refund            930 non-null    int8    
 2   Fines             930 non-null    float32 
 3   Make              930 non-null    category
 4   Model             919 non-null    category
 5   Year              930 non-null    int16   
 6   calculate_column  930 non-null    float32 
dtypes: category(3), float32(2), int16(1), int8(1)
memory usage: 67.2 KB


## memory clean

- using %reset_selective and the library gc clean the memory of your initial<br>dataframe only

In [55]:
# The initial dataframe is still defined at this point, before the reset.
df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calculate_column
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.00,Ford,Focus,1989,0.80
E432XX77RUS,1,6500.00,Toyota,Camry,1995,3.26
7184TT36RUS,1,2100.00,Ford,Focus,1984,1.06
X582HE161RUS,2,2000.00,Ford,Focus,2015,0.50
92918M178RUS,1,5700.00,Ford,Focus,2014,2.83
...,...,...,...,...,...,...
C584EY154RUS,1,500.00,Ford,Focus,2014,0.25
T395KX197RUS,2,15200.00,Ford,Focus,2016,3.77
8441XX154RUS,1,500.00,Ford,Focus,1994,0.25
C590EY154RUS,2,8594.60,Ford,Focus,1982,2.17


In [56]:
%reset_selective -f df

In [57]:
# Evaluate the name again: the NameError shown below confirms it was removed.
df

NameError: name 'df' is not defined