# Exercise 05. Pandas optimizations

#### - Turn-in directory: `ex05/`.
#### - Files to turn in: `optimizations.ipynb`.
#### - Allowed functions: `import pandas as pd`, `import gc`.

In [1]:
import pandas as pd
import gc

####
## 1. Read the `fines.csv` file that you saved in the previous exercise:

In [2]:
df = pd.read_csv('../data/fines.csv')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.000000,Ford,Focus,1989
1,E432XX77RUS,1,6500.000000,Toyota,Camry,1995
2,7184TT36RUS,1,2100.000000,Ford,Focus,1984
3,X582HE161RUS,2,2000.000000,Ford,Focus,2015
4,92918M178RUS,1,5700.000000,Ford,Focus,2014
...,...,...,...,...,...,...
925,,1,500.000000,Ford,Focus,2021
926,,2,15200.000000,Ford,Focus,2020
927,,1,500.000000,Ford,Focus,2024
928,,2,8594.586466,Ford,Focus,2020


####
## 2. Iterations: in all the following subtasks, you need to calculate `fines/refund*year` for each row. Create a new column with the calculated data. Measure the time using the magic command `%%timeit` in the cell.
   - Write a function that loops through the dataframe using `for i in range(0, len(df))`, `iloc`, and `append()` to a list. Assign the result of the function to a new column in the dataframe.
   - Do it using `iterrows()`.
   - Do it using `apply()` and a lambda function.
   - Do it using `Series` objects from the dataframe.
   - Do it as in the previous subtask, but use the method `.values`.

In [3]:
def calc_loop():
  calculated_values = []
  for i in range(0, len(df)):
    current_row = df.iloc[i]
    value = current_row['Fines'] / current_row['Refund'] * current_row['Year']
    calculated_values.append(value)
  return calculated_values

In [4]:
%%timeit
calc_loop()

75.2 ms ± 3.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
def iterrows_loop():
  calculated_values = []
  for _, current_row in df.iterrows():
    value = current_row['Fines'] / current_row['Refund'] * current_row['Year']
    calculated_values.append(value)
  return calculated_values

In [6]:
%%timeit
iterrows_loop()

63.6 ms ± 4.57 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
def apply_loop():
    return df.apply(lambda x: x['Fines'] / x['Refund'] * x['Year'], axis = 1)

In [8]:
%%timeit
apply_loop()

11.7 ms ± 312 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
def whole_columns_multiplication():
    return df['Fines'] / df['Refund'] * df['Year']

In [10]:
%%timeit
whole_columns_multiplication()

180 µs ± 4.07 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [11]:
def raw_arrays_multiplication():
    return df['Fines'].values / df['Refund'].values * df['Year'].values

In [12]:
%%timeit
raw_arrays_multiplication()

23.5 µs ± 69.8 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [13]:
calc_loop()

[3182400.0,
 12967500.0,
 4166400.0,
 2015000.0,
 11479800.0,
 5970000.0,
 17086037.894736815,
 2217600.0,
 8273800.0,
 17189172.9323308,
 5000000.0,
 3984000.0,
 8530127.067669159,
 146305000.0,
 403400.0,
 8585991.879699234,
 14895000.0,
 8654748.571428558,
 17060254.135338318,
 12375200.0,
 8659045.86466164,
 19869300.0,
 8650451.278195474,
 8560208.120300738,
 3796200.0,
 8573099.999999987,
 7348200.0,
 3000000.0,
 17240740.45112779,
 63553200.0,
 10541700.0,
 3193600.0,
 11880000.0,
 18262000.0,
 3970000.0,
 4433000.0,
 1587200.0,
 17120416.240601476,
 4824000.0,
 1005000.0,
 2788800.0,
 8677400.0,
 12330750.0,
 1006000.0,
 3610800.0,
 1408400.0,
 8616072.932330813,
 1995000.0,
 3964000.0,
 1987000.0,
 1006500.0,
 2982000.0,
 2019000.0,
 5025000.0,
 8611775.63909773,
 20852000.0,
 8514000.0,
 14335750.0,
 5220800.0,
 16550200.0,
 12719700.0,
 1985000.0,
 1998000.0,
 1615200.0,
 1006000.0,
 8525829.774436077,
 26182000.0,
 21930800.0,
 2295400.0,
 1007500.0,
 102615800.0,
 11886000

In [14]:
iterrows_loop()

[3182400.0,
 12967500.0,
 4166400.0,
 2015000.0,
 11479800.0,
 5970000.0,
 17086037.894736815,
 2217600.0,
 8273800.0,
 17189172.9323308,
 5000000.0,
 3984000.0,
 8530127.067669159,
 146305000.0,
 403400.0,
 8585991.879699234,
 14895000.0,
 8654748.571428558,
 17060254.135338318,
 12375200.0,
 8659045.86466164,
 19869300.0,
 8650451.278195474,
 8560208.120300738,
 3796200.0,
 8573099.999999987,
 7348200.0,
 3000000.0,
 17240740.45112779,
 63553200.0,
 10541700.0,
 3193600.0,
 11880000.0,
 18262000.0,
 3970000.0,
 4433000.0,
 1587200.0,
 17120416.240601476,
 4824000.0,
 1005000.0,
 2788800.0,
 8677400.0,
 12330750.0,
 1006000.0,
 3610800.0,
 1408400.0,
 8616072.932330813,
 1995000.0,
 3964000.0,
 1987000.0,
 1006500.0,
 2982000.0,
 2019000.0,
 5025000.0,
 8611775.63909773,
 20852000.0,
 8514000.0,
 14335750.0,
 5220800.0,
 16550200.0,
 12719700.0,
 1985000.0,
 1998000.0,
 1615200.0,
 1006000.0,
 8525829.774436077,
 26182000.0,
 21930800.0,
 2295400.0,
 1007500.0,
 102615800.0,
 11886000

In [15]:
apply_loop()

0      3.182400e+06
1      1.296750e+07
2      4.166400e+06
3      2.015000e+06
4      1.147980e+07
           ...     
925    1.010500e+06
926    1.535200e+07
927    1.012000e+06
928    8.680532e+06
929    3.232000e+06
Length: 930, dtype: float64

In [16]:
whole_columns_multiplication()

0      3.182400e+06
1      1.296750e+07
2      4.166400e+06
3      2.015000e+06
4      1.147980e+07
           ...     
925    1.010500e+06
926    1.535200e+07
927    1.012000e+06
928    8.680532e+06
929    3.232000e+06
Length: 930, dtype: float64

In [17]:
raw_arrays_multiplication()

array([3.18240000e+06, 1.29675000e+07, 4.16640000e+06, 2.01500000e+06,
       1.14798000e+07, 5.97000000e+06, 1.70860379e+07, 2.21760000e+06,
       8.27380000e+06, 1.71891729e+07, 5.00000000e+06, 3.98400000e+06,
       8.53012707e+06, 1.46305000e+08, 4.03400000e+05, 8.58599188e+06,
       1.48950000e+07, 8.65474857e+06, 1.70602541e+07, 1.23752000e+07,
       8.65904586e+06, 1.98693000e+07, 8.65045128e+06, 8.56020812e+06,
       3.79620000e+06, 8.57310000e+06, 7.34820000e+06, 3.00000000e+06,
       1.72407405e+07, 6.35532000e+07, 1.05417000e+07, 3.19360000e+06,
       1.18800000e+07, 1.82620000e+07, 3.97000000e+06, 4.43300000e+06,
       1.58720000e+06, 1.71204162e+07, 4.82400000e+06, 1.00500000e+06,
       2.78880000e+06, 8.67740000e+06, 1.23307500e+07, 1.00600000e+06,
       3.61080000e+06, 1.40840000e+06, 8.61607293e+06, 1.99500000e+06,
       3.96400000e+06, 1.98700000e+06, 1.00650000e+06, 2.98200000e+06,
       2.01900000e+06, 5.02500000e+06, 8.61177564e+06, 2.08520000e+07,
      

In [18]:
if(
    calc_loop() == iterrows_loop()
    and (pd.Series(calc_loop()) == apply_loop()).all()
    and (pd.Series(calc_loop()) == whole_columns_multiplication()).all()
    and (pd.Series(calc_loop()).values == (raw_arrays_multiplication())).all()
):
    df['Calculations'] = raw_arrays_multiplication()
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculations
0,Y163O8161RUS,2,3200.000000,Ford,Focus,1989,3.182400e+06
1,E432XX77RUS,1,6500.000000,Toyota,Camry,1995,1.296750e+07
2,7184TT36RUS,1,2100.000000,Ford,Focus,1984,4.166400e+06
3,X582HE161RUS,2,2000.000000,Ford,Focus,2015,2.015000e+06
4,92918M178RUS,1,5700.000000,Ford,Focus,2014,1.147980e+07
...,...,...,...,...,...,...,...
925,,1,500.000000,Ford,Focus,2021,1.010500e+06
926,,2,15200.000000,Ford,Focus,2020,1.535200e+07
927,,1,500.000000,Ford,Focus,2024,1.012000e+06
928,,2,8594.586466,Ford,Focus,2020,8.680532e+06


####
## 3. Indexing: measure the time using the magic command `%%timeit` in the cell.
   - Get a row for a specific `CarNumber`, for example, "O136HO197RUS."
   - Set the index in your dataframe with `CarNumber`.
   - Again, get a row for the same `CarNumber`.

In [19]:
%%timeit
rows_for_car_number = df[df['CarNumber'] == 'O136HO197RUS'].iloc[0]

429 µs ± 3.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [20]:
rows_for_car_number = df[df['CarNumber'] == 'O136HO197RUS'].iloc[0]
rows_for_car_number

CarNumber       O136HO197RUS
Refund                     2
Fines                 7800.0
Make                  Toyota
Model                Corolla
Year                    1999
Calculations       7796100.0
Name: 715, dtype: object

In [21]:
df = df.set_index('CarNumber')
df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculations
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus,1989,3.182400e+06
E432XX77RUS,1,6500.000000,Toyota,Camry,1995,1.296750e+07
7184TT36RUS,1,2100.000000,Ford,Focus,1984,4.166400e+06
X582HE161RUS,2,2000.000000,Ford,Focus,2015,2.015000e+06
92918M178RUS,1,5700.000000,Ford,Focus,2014,1.147980e+07
...,...,...,...,...,...,...
,1,500.000000,Ford,Focus,2021,1.010500e+06
,2,15200.000000,Ford,Focus,2020,1.535200e+07
,1,500.000000,Ford,Focus,2024,1.012000e+06
,2,8594.586466,Ford,Focus,2020,8.680532e+06


In [22]:
%%timeit
rows_for_car_number = df.loc['O136HO197RUS'].iloc[0]

97.3 µs ± 462 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [23]:
rows_for_car_number = df.loc['O136HO197RUS'].iloc[0]
rows_for_car_number

2

####
## 4. Downcasting:
   - Run `df.info(memory_usage='deep')`, and pay attention to the Dtype and memory usage.
   - Make a `copy()` of your initial dataframe into another dataframe, `optimized_df`.
   - Downcast from `float64` to `float32` for all columns.
   - Downcast from `int64` to the smallest numerical Dtype possible.
   - Run `info(memory_usage='deep')` for your new dataframe. Pay attention to the Dtype and memory usage.

In [24]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to nan
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Refund        930 non-null    int64  
 1   Fines         930 non-null    float64
 2   Make          930 non-null    object 
 3   Model         919 non-null    object 
 4   Year          930 non-null    int64  
 5   Calculations  930 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 235.8 KB


In [25]:
optimized_df = df.copy()
optimized_df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculations
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus,1989,3.182400e+06
E432XX77RUS,1,6500.000000,Toyota,Camry,1995,1.296750e+07
7184TT36RUS,1,2100.000000,Ford,Focus,1984,4.166400e+06
X582HE161RUS,2,2000.000000,Ford,Focus,2015,2.015000e+06
92918M178RUS,1,5700.000000,Ford,Focus,2014,1.147980e+07
...,...,...,...,...,...,...
,1,500.000000,Ford,Focus,2021,1.010500e+06
,2,15200.000000,Ford,Focus,2020,1.535200e+07
,1,500.000000,Ford,Focus,2024,1.012000e+06
,2,8594.586466,Ford,Focus,2020,8.680532e+06


In [26]:
# optimized_df[['Fines', 'Calculations']] = optimized_df[['Fines', 'Calculations']].apply(pd.to_numeric, downcast='float')
optimized_df['Fines'] = pd.to_numeric(optimized_df['Fines'], downcast='float')
optimized_df['Calculations'] = optimized_df['Calculations'].astype('float32')

In [27]:
# optimized_df[['Refund', 'Year']] = optimized_df[['Refund', 'Year']].apply(pd.to_numeric, downcast='integer')
optimized_df['Refund'] = pd.to_numeric(optimized_df['Refund'], downcast='unsigned')
optimized_df['Year'] = pd.to_numeric(optimized_df['Year'], downcast='unsigned')

In [28]:
optimized_df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculations
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus,1989,3182400.0
E432XX77RUS,1,6500.000000,Toyota,Camry,1995,12967500.0
7184TT36RUS,1,2100.000000,Ford,Focus,1984,4166400.0
X582HE161RUS,2,2000.000000,Ford,Focus,2015,2015000.0
92918M178RUS,1,5700.000000,Ford,Focus,2014,11479800.0
...,...,...,...,...,...,...
,1,500.000000,Ford,Focus,2021,1010500.0
,2,15200.000000,Ford,Focus,2020,15352000.0
,1,500.000000,Ford,Focus,2024,1012000.0
,2,8594.586914,Ford,Focus,2020,8680532.0


In [29]:
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to nan
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Refund        930 non-null    uint8  
 1   Fines         930 non-null    float32
 2   Make          930 non-null    object 
 3   Model         919 non-null    object 
 4   Year          930 non-null    uint16 
 5   Calculations  930 non-null    float32
dtypes: float32(2), object(2), uint16(1), uint8(1)
memory usage: 216.8 KB


####
## 5. Categories:
   - Change the `object` type columns to `category`.
   - This time, check the memory usage. It will probably decrease by 2–3 times compared to the initial dataframe.

In [30]:
optimized_df['Make'] = optimized_df['Make'].astype('category')
optimized_df['Model'] = optimized_df['Model'].astype('category')

In [31]:
optimized_df

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Calculations
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus,1989,3182400.0
E432XX77RUS,1,6500.000000,Toyota,Camry,1995,12967500.0
7184TT36RUS,1,2100.000000,Ford,Focus,1984,4166400.0
X582HE161RUS,2,2000.000000,Ford,Focus,2015,2015000.0
92918M178RUS,1,5700.000000,Ford,Focus,2014,11479800.0
...,...,...,...,...,...,...
,1,500.000000,Ford,Focus,2021,1010500.0
,2,15200.000000,Ford,Focus,2020,15352000.0
,1,500.000000,Ford,Focus,2024,1012000.0
,2,8594.586914,Ford,Focus,2020,8680532.0


In [32]:
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to nan
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Refund        930 non-null    uint8   
 1   Fines         930 non-null    float32 
 2   Make          930 non-null    category
 3   Model         919 non-null    category
 4   Year          930 non-null    uint16  
 5   Calculations  930 non-null    float32 
dtypes: category(2), float32(2), uint16(1), uint8(1)
memory usage: 108.0 KB


####
## 6. Memory clean:
   - Using the library `gc` and the command `%reset_selective`, clean the memory of your initial dataframe only.

In [36]:
%reset_selective -f df

In [37]:
gc.collect()

0

In [39]:
df

NameError: name 'df' is not defined