# Exercise 05 : Pandas optimizations

In [1]:
import pandas as pd
import gc

## read the fines.csv that you saved in the previous exercise

In [2]:
# Load the fines dataset that the previous exercise saved to disk.
fines_path = '../data/fines.csv'
df = pd.read_csv(fines_path)

## iterations: in all the following subtasks, you need to calculate fines/refund*year for<br>each row and create a new column with the calculated data and measure the time<br>using the magic command %%timeit in the cell

- loop: write a function that iterates through the dataframe using `for i in range(0, len(df))`, `iloc` and `append()` to a list, then assign the result of the function to a new column in the dataframe

In [3]:
%%timeit
new_col = []
for i in range(len(df)):
    new_col.append(df.iloc[i]['Fines'] / (df.iloc[i]['Refund'] * df.iloc[i]['Year']))
df['calculate_column'] = new_col

344 ms ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
# Inspect the column produced by the positional loop.
df.loc[:, 'calculate_column']

0      0.804424
1      3.258145
2      1.058468
3      0.496278
4      2.830189
         ...   
925    0.248262
926    3.769841
927    0.250752
928    2.168163
929    0.495786
Name: calculate_column, Length: 930, dtype: float64

- do it using iterrows()

In [5]:
%%time
new_col = []
for i, row in df.iterrows():
    new_col.append(row['Fines'] / (row['Refund'] * row['Year']))
df['calculate_column'] = new_col

CPU times: user 51 ms, sys: 1.86 ms, total: 52.9 ms
Wall time: 58 ms


In [6]:
# Inspect the column produced by the iterrows() pass.
df.loc[:, 'calculate_column']

0      0.804424
1      3.258145
2      1.058468
3      0.496278
4      2.830189
         ...   
925    0.248262
926    3.769841
927    0.250752
928    2.168163
929    0.495786
Name: calculate_column, Length: 930, dtype: float64

- do it using apply() and lambda function

In [7]:
%%time
df['calculate_column'] = df.apply(lambda row: row['Fines'] / (row['Refund'] * row['Year']), axis=1)

CPU times: user 16.3 ms, sys: 951 µs, total: 17.3 ms
Wall time: 17.6 ms


In [8]:
# Inspect the column produced by apply().
df.loc[:, 'calculate_column']

0      0.804424
1      3.258145
2      1.058468
3      0.496278
4      2.830189
         ...   
925    0.248262
926    3.769841
927    0.250752
928    2.168163
929    0.495786
Name: calculate_column, Length: 930, dtype: float64

- do it using Series objects from the dataframe

In [9]:
%%time
df['calculate_column'] = df['Fines'] / (df['Refund'] * df['Year'])

CPU times: user 1.67 ms, sys: 746 µs, total: 2.41 ms
Wall time: 5.15 ms


In [10]:
# Inspect the column produced by the Series arithmetic.
df.loc[:, 'calculate_column']

0      0.804424
1      3.258145
2      1.058468
3      0.496278
4      2.830189
         ...   
925    0.248262
926    3.769841
927    0.250752
928    2.168163
929    0.495786
Name: calculate_column, Length: 930, dtype: float64

- do it as in the previous subtask but with the method .values

In [11]:
%%time
df['calculate_column'] = df['Fines'].values / (df['Refund'].values * df['Year'].values)

CPU times: user 1.11 ms, sys: 231 µs, total: 1.34 ms
Wall time: 1.18 ms


In [12]:
# Inspect the column produced by the .values arithmetic.
df.loc[:, 'calculate_column']

0      0.804424
1      3.258145
2      1.058468
3      0.496278
4      2.830189
         ...   
925    0.248262
926    3.769841
927    0.250752
928    2.168163
929    0.495786
Name: calculate_column, Length: 930, dtype: float64

## indexing: measure the time using the magic command %%timeit in the cell

- get a row for a specific CarNumber, for example, `'O136HO197RUS'`

In [13]:
%%time
df.loc[df['CarNumber'] == 'O136HO197RUS']

CPU times: user 736 µs, sys: 53 µs, total: 789 µs
Wall time: 776 µs


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,calculate_column
715,O136HO197RUS,2,7800.0,Toyota,Corolla,1999,1.950975
902,O136HO197RUS,2,7800.0,Toyota,Corolla,1991,1.958815


- set the index in your dataframe with CarNumber

In [14]:
%%time
df = df.set_index('CarNumber')

CPU times: user 1.19 ms, sys: 392 µs, total: 1.58 ms
Wall time: 1.28 ms


- again, get a row for the same CarNumber

In [15]:
%%time
df.loc['O136HO197RUS']

CPU times: user 995 µs, sys: 206 µs, total: 1.2 ms
Wall time: 1.77 ms


Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calculate_column
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
O136HO197RUS,2,7800.0,Toyota,Corolla,1999,1.950975
O136HO197RUS,2,7800.0,Toyota,Corolla,1991,1.958815


## downcasting:

- run `df.info(memory_usage='deep')`, pay attention to the Dtype and the memory usage

In [16]:
%%time
# Baseline report for the initial dataframe: column dtypes plus memory usage.
# memory_usage='deep' asks pandas to introspect object columns as well.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to T171CC96RUS
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Refund            930 non-null    int64  
 1   Fines             930 non-null    float64
 2   Make              930 non-null    object 
 3   Model             919 non-null    object 
 4   Year              930 non-null    int64  
 5   calculate_column  930 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 236.0 KB
CPU times: user 6.09 ms, sys: 653 µs, total: 6.74 ms
Wall time: 6.85 ms


- make a copy() of your initial dataframe into another dataframe optimized

In [17]:
# Work on an independent deep copy so the dtype changes below never touch
# the initial dataframe.
df_copy = df.copy(deep=True)
df_copy

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calculate_column
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989,0.804424
E432XX77RUS,1,6500.0,Toyota,Camry,1995,3.258145
7184TT36RUS,1,2100.0,Ford,Focus,1984,1.058468
X582HE161RUS,2,2000.0,Ford,Focus,2015,0.496278
92918M178RUS,1,5700.0,Ford,Focus,2014,2.830189
...,...,...,...,...,...,...
C584EY154RUS,1,500.0,Ford,Focus,2014,0.248262
T395KX197RUS,2,15200.0,Ford,Focus,2016,3.769841
8441XX154RUS,1,500.0,Ford,Focus,1994,0.250752
C590EY154RUS,2,8594.6,Ford,Focus,1982,2.168163


- downcast from float64 to float32 for all the columns

In [18]:
# Downcast every float64 column to float32. The columns are discovered from
# the dtypes rather than listed by hand, so the cell covers "all the columns"
# even if the frame gains another float column later.
float64_columns = df_copy.select_dtypes(include='float64').columns
# astype with a {column: dtype} mapping casts only the listed columns and
# returns a new frame; re-running the cell is a harmless no-op.
df_copy = df_copy.astype({column: 'float32' for column in float64_columns})
df_copy

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calculate_column
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus,1989,0.804424
E432XX77RUS,1,6500.000000,Toyota,Camry,1995,3.258145
7184TT36RUS,1,2100.000000,Ford,Focus,1984,1.058468
X582HE161RUS,2,2000.000000,Ford,Focus,2015,0.496278
92918M178RUS,1,5700.000000,Ford,Focus,2014,2.830189
...,...,...,...,...,...,...
C584EY154RUS,1,500.000000,Ford,Focus,2014,0.248262
T395KX197RUS,2,15200.000000,Ford,Focus,2016,3.769841
8441XX154RUS,1,500.000000,Ford,Focus,1994,0.250752
C590EY154RUS,2,8594.599609,Ford,Focus,1982,2.168164


-  downcast from int64 to the smallest numerical dtype possible

In [19]:
# Let pandas choose the narrowest signed integer dtype able to hold each
# column's values.
for int_column in ('Refund', 'Year'):
    df_copy[int_column] = pd.to_numeric(df_copy[int_column], downcast='integer')

- run `info(memory_usage='deep')` for your new dataframe, pay attention to the Dtype and the memory usage

In [20]:
# Same report for the copy after the numeric downcasts, for comparison with
# the initial dataframe's dtypes and memory usage.
df_copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to T171CC96RUS
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Refund            930 non-null    int8   
 1   Fines             930 non-null    float32
 2   Make              930 non-null    object 
 3   Model             919 non-null    object 
 4   Year              930 non-null    int16  
 5   calculate_column  930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(2)
memory usage: 216.9 KB


## categories:

- change the object type columns to the type category

In [21]:
# Move CarNumber from the index back into a regular column so it can be
# converted together with the other string columns.
# The guard keeps the cell safe to re-run: resetting an already-reset frame
# would add a spurious `index` column.
if df_copy.index.name == 'CarNumber':
    df_copy = df_copy.reset_index()

In [22]:
# Convert the string columns to the category dtype in a single cast.
# No reset_index() call is needed here: without assignment its result would
# be discarded, and the index has already been reset by the preceding cell.
df_copy = df_copy.astype({
    'CarNumber': 'category',
    'Make': 'category',
    'Model': 'category',
})

- This time, check the memory usage; it should be roughly 2-3 times lower than that of the initial dataframe

In [23]:
# Final report for the optimized copy after the category conversion.
df_copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   CarNumber         930 non-null    category
 1   Refund            930 non-null    int8    
 2   Fines             930 non-null    float32 
 3   Make              930 non-null    category
 4   Model             919 non-null    category
 5   Year              930 non-null    int16   
 6   calculate_column  930 non-null    float32 
dtypes: category(3), float32(2), int16(1), int8(1)
memory usage: 67.2 KB


## memory clean

- using `%reset_selective` and the `gc` library, clean the memory of your initial dataframe only

In [24]:
# Display the initial dataframe while the name `df` is still bound.
df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calculate_column
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989,0.804424
E432XX77RUS,1,6500.0,Toyota,Camry,1995,3.258145
7184TT36RUS,1,2100.0,Ford,Focus,1984,1.058468
X582HE161RUS,2,2000.0,Ford,Focus,2015,0.496278
92918M178RUS,1,5700.0,Ford,Focus,2014,2.830189
...,...,...,...,...,...,...
C584EY154RUS,1,500.0,Ford,Focus,2014,0.248262
T395KX197RUS,2,15200.0,Ford,Focus,2016,3.769841
8441XX154RUS,1,500.0,Ford,Focus,1994,0.250752
C590EY154RUS,2,8594.6,Ford,Focus,1982,2.168163


In [25]:
%reset_selective -f df

In [26]:
# `df` was removed by %reset_selective in the previous cell, so evaluating it
# here is expected to raise NameError.
df

NameError: name 'df' is not defined