In [1]:
import pandas as pd
import gc

## read the fines.csv

In [2]:
# Load the fines dataset (path is relative to the notebook's working directory)
# and display the resulting dataframe.
df = pd.read_csv('../data/fines.csv')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,2018
1,E432XX77RUS,1,6500.0,Toyota,Camry,2008
2,7184TT36RUS,1,2100.0,Ford,Focus,1994
3,X582HE161RUS,2,2000.0,Ford,Focus,1987
4,92918M178RUS,1,5700.0,Ford,Focus,2000
...,...,...,...,...,...,...
925,S21RUS,1,1500.0,Honda,Civic,2020
926,S22RUS,1,2300.0,Nissan,Altima,2018
927,S23RUS,2,4000.0,Chevrolet,Malibu,2019
928,S24RUS,1,3500.0,Hyundai,Elantra,2021


## iterations
In all the following subtasks, calculate `Fines / Refund * Year` for each row,
store the result in a new column, and measure the execution time.

### loop
write a function that iterates through the dataframe using `for i in range(0, len(df))`, `iloc` and `append()` to a list, assign the result of the function to a new column in the dataframe

In [3]:
def loop(df):
    """Compute ``Fines / Refund * Year`` row by row using positional indexing.

    This is the index-based variant of the timing comparison: every row is
    fetched with ``iloc`` inside a ``for`` loop over ``range(0, len(df))`` and
    the computed value is appended to a plain Python list.

    Args:
        df: DataFrame containing ``Fines``, ``Refund`` and ``Year`` columns.

    Returns:
        A list with one computed value per row, in row order (empty for an
        empty dataframe).
    """
    calculated = []
    for i in range(0, len(df)):
        record = df.iloc[i]
        # Divide first, then multiply - same evaluation order as Fines / Refund * Year.
        per_refund = record['Fines'] / record['Refund']
        calculated.append(per_refund * record['Year'])
    return calculated

In [4]:
%%timeit
# Time the index-based loop together with the column assignment.
df['Calculated'] = loop(df)

55.7 ms ± 2.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Run the loop once more outside %%timeit and display the dataframe with the new column.
df['Calculated'] = loop(df)
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculated
0,Y163O8161RUS,2,3200.0,Ford,Focus,2018,3228800.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,2008,13052000.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1994,4187400.0
3,X582HE161RUS,2,2000.0,Ford,Focus,1987,1987000.0
4,92918M178RUS,1,5700.0,Ford,Focus,2000,11400000.0
...,...,...,...,...,...,...,...
925,S21RUS,1,1500.0,Honda,Civic,2020,3030000.0
926,S22RUS,1,2300.0,Nissan,Altima,2018,4641400.0
927,S23RUS,2,4000.0,Chevrolet,Malibu,2019,4038000.0
928,S24RUS,1,3500.0,Hyundai,Elantra,2021,7073500.0


### do it using `iterrows()`

In [6]:
%%timeit
# Time the iterrows() variant: one Series per row, index label ignored.
df['Calculated'] = [row['Fines'] / row['Refund'] * row['Year'] for _, row in df.iterrows()]

51.9 ms ± 5.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# Recompute with iterrows() outside %%timeit, then display the dataframe.
df['Calculated'] = [
    record['Fines'] / record['Refund'] * record['Year']
    for _, record in df.iterrows()
]
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculated
0,Y163O8161RUS,2,3200.0,Ford,Focus,2018,3228800.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,2008,13052000.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1994,4187400.0
3,X582HE161RUS,2,2000.0,Ford,Focus,1987,1987000.0
4,92918M178RUS,1,5700.0,Ford,Focus,2000,11400000.0
...,...,...,...,...,...,...,...
925,S21RUS,1,1500.0,Honda,Civic,2020,3030000.0
926,S22RUS,1,2300.0,Nissan,Altima,2018,4641400.0
927,S23RUS,2,4000.0,Chevrolet,Malibu,2019,4038000.0
928,S24RUS,1,3500.0,Hyundai,Elantra,2021,7073500.0


### do it using `apply()` and lambda function

In [8]:
%%timeit
# Time the row-wise apply(): axis=1 passes each row to the lambda as a Series.
df['Calculated'] = df.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

9.25 ms ± 547 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# Recompute with a row-wise apply() outside %%timeit, then display the dataframe.
df['Calculated'] = df.apply(
    lambda record: record['Fines'] / record['Refund'] * record['Year'],
    axis=1,
)
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculated
0,Y163O8161RUS,2,3200.0,Ford,Focus,2018,3228800.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,2008,13052000.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1994,4187400.0
3,X582HE161RUS,2,2000.0,Ford,Focus,1987,1987000.0
4,92918M178RUS,1,5700.0,Ford,Focus,2000,11400000.0
...,...,...,...,...,...,...,...
925,S21RUS,1,1500.0,Honda,Civic,2020,3030000.0
926,S22RUS,1,2300.0,Nissan,Altima,2018,4641400.0
927,S23RUS,2,4000.0,Chevrolet,Malibu,2019,4038000.0
928,S24RUS,1,3500.0,Hyundai,Elantra,2021,7073500.0


### do it using `Series` objects from the dataframe

In [10]:
%%timeit
# Time the vectorized variant: arithmetic on whole Series, no Python-level loop.
df['Calculated'] = df['Fines'] / df['Refund'] * df['Year']

259 μs ± 18.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
# Recompute with whole-column Series arithmetic outside %%timeit, then display.
# The parentheses only spell out the left-to-right order: divide, then multiply.
df['Calculated'] = (df['Fines'] / df['Refund']) * df['Year']
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculated
0,Y163O8161RUS,2,3200.0,Ford,Focus,2018,3228800.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,2008,13052000.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1994,4187400.0
3,X582HE161RUS,2,2000.0,Ford,Focus,1987,1987000.0
4,92918M178RUS,1,5700.0,Ford,Focus,2000,11400000.0
...,...,...,...,...,...,...,...
925,S21RUS,1,1500.0,Honda,Civic,2020,3030000.0
926,S22RUS,1,2300.0,Nissan,Altima,2018,4641400.0
927,S23RUS,2,4000.0,Chevrolet,Malibu,2019,4038000.0
928,S24RUS,1,3500.0,Hyundai,Elantra,2021,7073500.0


### do it as in the previous subtask but with the method `.values`

In [12]:
%%timeit
# Time the same arithmetic on the arrays returned by .values instead of on Series.
df['Calculated'] = df['Fines'].values / df['Refund'].values * df['Year'].values

116 μs ± 1.7 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [13]:
# Recompute on the underlying arrays (.values) outside %%timeit, then display.
# The parentheses only spell out the left-to-right order: divide, then multiply.
df['Calculated'] = (df['Fines'].values / df['Refund'].values) * df['Year'].values
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculated
0,Y163O8161RUS,2,3200.0,Ford,Focus,2018,3228800.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,2008,13052000.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1994,4187400.0
3,X582HE161RUS,2,2000.0,Ford,Focus,1987,1987000.0
4,92918M178RUS,1,5700.0,Ford,Focus,2000,11400000.0
...,...,...,...,...,...,...,...
925,S21RUS,1,1500.0,Honda,Civic,2020,3030000.0
926,S22RUS,1,2300.0,Nissan,Altima,2018,4641400.0
927,S23RUS,2,4000.0,Chevrolet,Malibu,2019,4038000.0
928,S24RUS,1,3500.0,Hyundai,Elantra,2021,7073500.0


## indexing
measure the time using the magic command `%%timeit` in the cell

### get a row for a specific `CarNumber`, for example, `'O136HO197RUS'`

In [14]:
%%timeit
# Boolean-mask lookup: every CarNumber is compared with the target value.
row = df[df['CarNumber'] == 'O136HO197RUS']

338 μs ± 7.93 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [15]:
# Same boolean-mask lookup as in the timed cell, repeated here so the row can be displayed.
row = df[df['CarNumber'] == 'O136HO197RUS']
row

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculated
715,O136HO197RUS,2,7800.0,Toyota,Corolla,1990,7761000.0


### set the index in your dataframe with `CarNumber`

In [16]:
# Build a second dataframe indexed by CarNumber; df itself keeps its RangeIndex.
df_i = df.set_index('CarNumber')

### again, get a row for the same `CarNumber`

In [17]:
%%timeit
# Label-based lookup through the CarNumber index of df_i.
row = df_i.loc['O136HO197RUS']

83.6 μs ± 12.1 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [18]:
# Repeat the index lookup and display it; the single match comes back as a Series.
row = df_i.loc['O136HO197RUS']
row

Refund                2
Fines            7800.0
Make             Toyota
Model           Corolla
Year               1990
Calculated    7761000.0
Name: O136HO197RUS, dtype: object

## downcasting

### run `df.info(memory_usage='deep')`, pay attention to the `Dtype` and the memory usage

In [19]:
# Baseline dtypes and memory; 'deep' also measures the contents of object columns.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    int64  
 2   Fines       930 non-null    float64
 3   Make        930 non-null    object 
 4   Model       918 non-null    object 
 5   Year        930 non-null    int64  
 6   Calculated  930 non-null    float64
dtypes: float64(2), int64(2), object(3)
memory usage: 203.8 KB


### make a `copy()` of your initial dataframe into another dataframe `optimized`

In [20]:
# Work on a copy so that df keeps its original dtypes for comparison.
optimized = df.copy()

### downcast from `float64` to `float32` for all the columns

In [21]:
# Pick every float64 column and replace it with a float32 version.
cols = optimized.select_dtypes(include='float64').columns
optimized[cols] = optimized[cols].astype('float32')

### downcast from `int64` to the smallest numerical dtype possible

In [22]:
# Pick every int64 column and let pandas choose the smallest integer dtype
# that can hold each column's values.
cols = optimized.select_dtypes(include='int64').columns
optimized[cols] = optimized[cols].apply(
    lambda column: pd.to_numeric(column, downcast='integer')
)

### run `info(memory_usage='deep')` for your new dataframe, pay attention to the Dtype and the memory usage

In [23]:
# Dtypes and deep memory usage after the numeric downcasts.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    int8   
 2   Fines       930 non-null    float32
 3   Make        930 non-null    object 
 4   Model       918 non-null    object 
 5   Year        930 non-null    int16  
 6   Calculated  930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(3)
memory usage: 184.7 KB


## categories

### change the `object` type columns to the type `category`

In [24]:
# Convert every object column to the category dtype.
cols = optimized.select_dtypes(include='object').columns
optimized[cols] = optimized[cols].astype('category')

### Check the memory usage again; it should be roughly 2–3 times lower than for the initial dataframe

In [25]:
# Dtypes and deep memory usage after converting object columns to category.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   CarNumber   930 non-null    category
 1   Refund      930 non-null    int8    
 2   Fines       930 non-null    float32 
 3   Make        930 non-null    category
 4   Model       918 non-null    category
 5   Year        930 non-null    int16   
 6   Calculated  930 non-null    float32 
dtypes: category(3), float32(2), int16(1), int8(1)
memory usage: 68.3 KB


## memory clean

In [26]:
%reset_selective -f df

In [27]:
# Force a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

0