In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def get_dataset(size):
    """Create fake dataset"""
    data = pd.DataFrame()
    data['model'] = np.random.choice(
    ['m1', 'm2', 'm3', 'm4'], size)
    data['fuel'] = np.random.choice(['petrol', 'diesel', 'gas'], size)
    data['production_date'] = np.random.randint(1990, 2024, size)
    data['transmission'] = np.random.choice(
    ['mechanical', 'automatic', 'robotic'], size)
    data['engine_power'] = np.random.randint(129, 609, size)
    data['price'] = np.random.uniform(60000., 12460000, size)
    data['count'] = np.random.randint(1, 30, size)
    return data

**Изучаем эффективное выделение памяти в Pandas**
1. int8: -128 to 127
2. int16: -32768 to 32767
3. int32: -2147483648 to 2147483647
4. int64: -9223372036854775808 to 9223372036854775807

In [4]:
SIZE = 1_000_000
df = get_dataset(SIZE)

In [5]:
df_start = df.copy()

In [6]:
df.head()

Unnamed: 0,model,fuel,production_date,transmission,engine_power,price,count
0,m4,gas,2004,robotic,603,3191154.0,12
1,m3,petrol,2022,mechanical,160,11532650.0,7
2,m3,diesel,1995,mechanical,303,11705550.0,5
3,m1,petrol,1992,automatic,482,10540920.0,11
4,m1,petrol,2011,robotic,537,2813183.0,14


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   model            1000000 non-null  object 
 1   fuel             1000000 non-null  object 
 2   production_date  1000000 non-null  int64  
 3   transmission     1000000 non-null  object 
 4   engine_power     1000000 non-null  int64  
 5   price            1000000 non-null  float64
 6   count            1000000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 53.4+ MB


In [8]:
df.memory_usage().sum()

np.int64(56000132)

**production_date**

In [9]:
print(df['production_date'].min(), df['production_date'].max())

1990 2023


In [10]:
print(df['engine_power'].min(), df['engine_power'].max())

129 608


In [11]:
df['production_date'] = df['production_date'].astype(np.int16)
df['engine_power'] = df['engine_power'].astype(np.int16)

In [12]:
df.memory_usage().sum()

np.int64(44000132)

**count**

In [13]:
print(df['count'].min(), df['count'].max())

1 29


In [14]:
df['count'] = df['count'].astype(np.int8)

**category**

In [15]:
cat_rols = df.select_dtypes(include='object').columns
df[cat_rols] = df[cat_rols].astype('category')

In [16]:
df.memory_usage().sum()

np.int64(16000600)

**result**

In [17]:
df_end = df.copy()

In [20]:
df_end.memory_usage().sum()

np.int64(16000600)

In [19]:
df_start.memory_usage().sum()

np.int64(56000132)

In [18]:
print(f"Memory usage decreased to {df_end.memory_usage().sum() / df_start.memory_usage().sum() * 100:.2f}%")

Memory usage decreased to 28.57%
