In [1]:
import pandas as pd
import numpy as np

In [2]:
# Restrict the load to the seven columns this walkthrough inspects.
FUEL_COLUMNS = [
    'vehicle_id',
    'year',
    'make',
    'model',
    'class',
    'fuel_type',
    'combined_mpg_ft1',
]

# low_memory=False: parse the file in a single pass so each column gets one
# consistently inferred dtype instead of being processed chunk by chunk.
fuel = pd.read_csv(
    '../../data_sets/fuel.csv',
    usecols=FUEL_COLUMNS,
    low_memory=False,
)
fuel.head()

Unnamed: 0,vehicle_id,year,make,model,class,fuel_type,combined_mpg_ft1
0,26587,1984,Alfa Romeo,GT V6 2.5,Minicompact Cars,Regular,20
1,27705,1984,Alfa Romeo,GT V6 2.5,Minicompact Cars,Regular,20
2,26561,1984,Alfa Romeo,Spider Veloce 2000,Two Seaters,Regular,21
3,27681,1984,Alfa Romeo,Spider Veloce 2000,Two Seaters,Regular,21
4,27550,1984,AM General,DJ Po Vehicle 2WD,Special Purpose Vehicle 2WD,Regular,17


### info about all columns

In [3]:
# Column dtypes and non-null counts. Without memory_usage='deep' the memory
# figure is only a shallow estimate for object columns (shown with a trailing '+').
fuel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38113 entries, 0 to 38112
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   vehicle_id        38113 non-null  int64 
 1   year              38113 non-null  int64 
 2   make              38113 non-null  object
 3   model             38113 non-null  object
 4   class             38113 non-null  object
 5   fuel_type         38113 non-null  object
 6   combined_mpg_ft1  38113 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 2.0+ MB


### precise info about memory usage

In [4]:
# memory_usage='deep' inspects the Python objects held in object columns,
# so the reported size is the real footprint rather than the shallow estimate.
fuel.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38113 entries, 0 to 38112
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   vehicle_id        38113 non-null  int64 
 1   year              38113 non-null  int64 
 2   make              38113 non-null  object
 3   model             38113 non-null  object
 4   class             38113 non-null  object
 5   fuel_type         38113 non-null  object
 6   combined_mpg_ft1  38113 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 10.7 MB


### convert column to 'int32' type

In [11]:
# Narrow 'year' from int64 to int32 (4 bytes per value instead of 8).
fuel = fuel.astype({'year': 'int32'})

In [12]:
# Re-measure the deep memory footprint after changing the dtype of 'year'.
fuel.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38113 entries, 0 to 38112
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   vehicle_id        38113 non-null  int64 
 1   year              38113 non-null  int32 
 2   make              38113 non-null  object
 3   model             38113 non-null  object
 4   class             38113 non-null  object
 5   fuel_type         38113 non-null  object
 6   combined_mpg_ft1  38113 non-null  int64 
dtypes: int32(1), int64(2), object(4)
memory usage: 10.6 MB


### Checking how many times each value in the column is repeated

In [14]:
# Frequency of each distinct 'make' value, most common first; show the top five.
make_counts = fuel['make'].value_counts()
make_counts.head()

Chevrolet    3810
Ford         3155
Dodge        2531
GMC          2398
Toyota       1937
Name: make, dtype: int64

### convert column to 'category' type

In [16]:
# Store 'make' as a categorical: each distinct string is kept once and the rows
# hold small integer codes. Then re-measure the deep memory footprint.
fuel = fuel.astype({'make': 'category'})
fuel.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38113 entries, 0 to 38112
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   vehicle_id        38113 non-null  int64   
 1   year              38113 non-null  int32   
 2   make              38113 non-null  category
 3   model             38113 non-null  object  
 4   class             38113 non-null  object  
 5   fuel_type         38113 non-null  object  
 6   combined_mpg_ft1  38113 non-null  int64   
dtypes: category(1), int32(1), int64(2), object(3)
memory usage: 8.4 MB


In [17]:
# Frequency of each distinct 'model' value, most common first; show the top five.
model_counts = fuel['model'].value_counts()
model_counts.head()

F150 Pickup 2WD    209
Truck 2WD          187
F150 Pickup 4WD    187
Mustang            184
Jetta              172
Name: model, dtype: int64

In [19]:
# Store 'model' as a categorical (distinct strings kept once, rows hold codes),
# then re-measure the deep memory footprint.
fuel = fuel.astype({'model': 'category'})
fuel.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38113 entries, 0 to 38112
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   vehicle_id        38113 non-null  int64   
 1   year              38113 non-null  int32   
 2   make              38113 non-null  category
 3   model             38113 non-null  category
 4   class             38113 non-null  object  
 5   fuel_type         38113 non-null  object  
 6   combined_mpg_ft1  38113 non-null  int64   
dtypes: category(2), int32(1), int64(2), object(2)
memory usage: 6.3 MB


In [20]:
# Frequency of each distinct 'class' value, most common first; show the top five.
class_counts = fuel['class'].value_counts()
class_counts.head()

Compact Cars                   5508
Subcompact Cars                4872
Midsize Cars                   4395
Standard Pickup Trucks         2354
Sport Utility Vehicle - 4WD    2082
Name: class, dtype: int64

In [21]:
# Store 'class' as a categorical (distinct strings kept once, rows hold codes),
# then re-measure the deep memory footprint.
fuel = fuel.astype({'class': 'category'})
fuel.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38113 entries, 0 to 38112
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   vehicle_id        38113 non-null  int64   
 1   year              38113 non-null  int32   
 2   make              38113 non-null  category
 3   model             38113 non-null  category
 4   class             38113 non-null  category
 5   fuel_type         38113 non-null  object  
 6   combined_mpg_ft1  38113 non-null  int64   
dtypes: category(3), int32(1), int64(2), object(1)
memory usage: 3.6 MB


In [22]:
# Frequency of each distinct 'fuel_type' value, most common first; show the top five.
fuel_type_counts = fuel['fuel_type'].value_counts()
fuel_type_counts.head()

Regular            25258
Premium            10133
Gasoline or E85     1223
Diesel              1014
Electricity          133
Name: fuel_type, dtype: int64

In [23]:
# Store 'fuel_type' as a categorical (distinct strings kept once, rows hold codes),
# then re-measure the deep memory footprint.
fuel = fuel.astype({'fuel_type': 'category'})
fuel.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38113 entries, 0 to 38112
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   vehicle_id        38113 non-null  int64   
 1   year              38113 non-null  int32   
 2   make              38113 non-null  category
 3   model             38113 non-null  category
 4   class             38113 non-null  category
 5   fuel_type         38113 non-null  category
 6   combined_mpg_ft1  38113 non-null  int64   
dtypes: category(4), int32(1), int64(2)
memory usage: 1.3 MB


In [24]:
# Cast 'combined_mpg_ft1' from int64 to float (float64).
# NOTE(review): float64 takes the same 8 bytes per value as int64, so this cast
# does not reduce memory — confirm whether a narrower dtype was intended here.
fuel = fuel.astype({'combined_mpg_ft1': 'float'})

In [25]:
# Re-measure the deep memory footprint after the 'combined_mpg_ft1' conversion.
fuel.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38113 entries, 0 to 38112
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   vehicle_id        38113 non-null  int64   
 1   year              38113 non-null  int32   
 2   make              38113 non-null  category
 3   model             38113 non-null  category
 4   class             38113 non-null  category
 5   fuel_type         38113 non-null  category
 6   combined_mpg_ft1  38113 non-null  float64 
dtypes: category(4), float64(1), int32(1), int64(1)
memory usage: 1.3 MB


### Comment: The initial size was 10.7 MB; after optimization it is 1.3 MB.
**When a column has a lot of repeated values, it is worth converting it to the 'category' type. It saves a lot of memory.**