In [18]:
import pandas as pd
import gc

### read the fines.csv that you saved in the previous exercise

In [19]:
fines = pd.read_csv('../ex04/fines.csv', engine = 'python',)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
925,A123BC456RUS,2.5,2200.0,Toyota,Corolla,2010
926,B234CD567RUS,0.5,1800.0,Honda,Civic,2015
927,O630MX750RUS,3.0,45000.0,Ford,Focus,2018
928,D456EF789RUS,1.5,3000.0,Chevrolet,Malibu,2020


### iterations: in all the following subtasks, you need to calculate fines/refund*year for each row and create a new column with the calculated data and measure the time using the magic command %%timeit in the cell

1) loop: write a function that iterates through the dataframe using for i in range(0, len(df)), iloc and append() to a list, assign the result of the function to a new column in the dataframe

In [20]:
%%timeit
def calculate_using_loop(df):
    result = []
    for i in range(len(df)):
        value = df.iloc[i]['Fines'] / df.iloc[i]['Refund'] * df.iloc[i]['Year']
        result.append(value)
    return result

result_loop = calculate_using_loop(fines)
fines['Calculated_Loop'] = result_loop


135 ms ± 7.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


2) do it using iterrows()

In [21]:
%%timeit
def calculate_using_iterrows(df):
    result = []
    for index, row in df.iterrows():
        value = row['Fines'] / row['Refund'] * row['Year']
        result.append(value)
    return result

result_iterrows = calculate_using_iterrows(fines)
fines['Calculated_Iterrows'] = result_iterrows

46.6 ms ± 2.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


3) do it using apply() and lambda function

In [22]:
%%timeit
fines['Calculated_Apply'] = fines.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

9.09 ms ± 226 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


4) do it using Series objects from the dataframe

In [23]:
%%timeit
fines['Calculated_Series'] = fines['Fines'] / fines['Refund'] * fines['Year']

244 μs ± 19 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


5) do it as in the previous subtask but with the method .values

In [24]:
%%timeit
result_values = fines['Fines'].values / fines['Refund'].values * fines['Year'].values
fines['Calculated_Values'] = result_values

108 μs ± 2.2 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### indexing: measure the time using the magic command %%timeit in the cell

1) get a row for a specific CarNumber, for example, ’O136HO197RUS’

In [25]:
car_number = 'O630MX750RUS'

In [26]:
%%timeit
row = fines[fines['CarNumber'] == car_number]

445 μs ± 57.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


2) set the index in your dataframe with CarNumber

In [27]:
fines.reset_index(inplace=True)
fines.set_index('CarNumber', inplace=True)

3) again, get a row for the same CarNumber

In [28]:
%%timeit
row_indexed = fines.loc[car_number]

83.2 μs ± 2.97 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### downcasting:

1) run df.info(memory_usage=’deep’), pay attention to the Dtype and the memory usage

In [29]:
fines.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to E567FG890RUS
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                930 non-null    int64  
 1   Refund               930 non-null    float64
 2   Fines                930 non-null    float64
 3   Make                 930 non-null    object 
 4   Model                919 non-null    object 
 5   Year                 930 non-null    int64  
 6   Calculated_Loop      930 non-null    float64
 7   Calculated_Iterrows  930 non-null    float64
 8   Calculated_Apply     930 non-null    float64
 9   Calculated_Series    930 non-null    float64
 10  Calculated_Values    930 non-null    float64
dtypes: float64(7), int64(2), object(2)
memory usage: 250.6 KB


2) make a copy() of your initial dataframe into another dataframe optimized

In [30]:
fines_optimized = fines.copy()
fines_optimized.head()

Unnamed: 0_level_0,index,Refund,Fines,Make,Model,Year,Calculated_Loop,Calculated_Iterrows,Calculated_Apply,Calculated_Series,Calculated_Values
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Y163O8161RUS,0,2.0,3200.0,Ford,Focus,1989,3182400.0,3182400.0,3182400.0,3182400.0,3182400.0
E432XX77RUS,1,1.0,6500.0,Toyota,Camry,1995,12967500.0,12967500.0,12967500.0,12967500.0,12967500.0
7184TT36RUS,2,1.0,2100.0,Ford,Focus,1984,4166400.0,4166400.0,4166400.0,4166400.0,4166400.0
X582HE161RUS,3,2.0,2000.0,Ford,Focus,2015,2015000.0,2015000.0,2015000.0,2015000.0,2015000.0
92918M178RUS,4,1.0,5700.0,Ford,Focus,2014,11479800.0,11479800.0,11479800.0,11479800.0,11479800.0


3)
- downcast from float64 to float32 for all the columns
- downcast from int64 to the smallest numerical dtype possible
- run df.info(memory_usage=’deep’) for new dataframe, pay attention to the Dtype and the memory usage

In [31]:
for col in fines_optimized.select_dtypes(include='float64').columns:
    fines_optimized[col] = pd.to_numeric(fines_optimized[col], downcast='float')
    
for col in fines_optimized.select_dtypes(include='integer').columns:
    fines_optimized[col] = pd.to_numeric(fines_optimized[col], downcast='unsigned')
fines_optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to E567FG890RUS
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                930 non-null    uint16 
 1   Refund               930 non-null    float32
 2   Fines                930 non-null    float32
 3   Make                 930 non-null    object 
 4   Model                919 non-null    object 
 5   Year                 930 non-null    uint16 
 6   Calculated_Loop      930 non-null    float64
 7   Calculated_Iterrows  930 non-null    float64
 8   Calculated_Apply     930 non-null    float64
 9   Calculated_Series    930 non-null    float64
 10  Calculated_Values    930 non-null    float64
dtypes: float32(2), float64(5), object(2), uint16(2)
memory usage: 232.5 KB


### categories
1) change the object type columns to type category
2) check the memory usage, it probably has a decrease of 2-3 times compared to the initial df

In [32]:
for col in fines_optimized.select_dtypes(include='object').columns:
    fines_optimized[col] = fines_optimized[col].astype('category')
fines_optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to E567FG890RUS
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   index                930 non-null    uint16  
 1   Refund               930 non-null    float32 
 2   Fines                930 non-null    float32 
 3   Make                 930 non-null    category
 4   Model                919 non-null    category
 5   Year                 930 non-null    uint16  
 6   Calculated_Loop      930 non-null    float64 
 7   Calculated_Iterrows  930 non-null    float64 
 8   Calculated_Apply     930 non-null    float64 
 9   Calculated_Series    930 non-null    float64 
 10  Calculated_Values    930 non-null    float64 
dtypes: category(2), float32(2), float64(5), uint16(2)
memory usage: 138.3 KB


### memory clean
1) using %reset_selective and the library gc clean the memory of your initial dataframe only

In [33]:
%reset_selective -f fines
gc.collect()

510

In [34]:
fines

NameError: name 'fines' is not defined