# 数据压缩

In [94]:
import pandas as pd
titanic = pd.read_csv("titanic_train.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [95]:
titanic.shape

(891, 12)

In [96]:
titanic.info(memory_usage='deep')   # 数据的信息

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 315.0 KB


In [97]:
titanic.select_dtypes(include='object') # 根据数据类型筛选DataFrame的列

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S
...,...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,,S
887,"Graham, Miss. Margaret Edith",female,112053,B42,S
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,,S
889,"Behr, Mr. Karl Howell",male,111369,C148,C


In [98]:
titanic.select_dtypes(include='object').memory_usage(deep=True)

Index         132
Name        74813
Sex         54979
Ticket      56802
Cabin       34344
Embarked    51626
dtype: int64

In [99]:
for dtype in ['float64', 'int64','object']:
    selected_dtype = titanic.select_dtypes(include=[dtype])
    mean_usage_B = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_MB = mean_usage_B/(1024**2)
    print(f'平均内存占用:{dtype}  {mean_usage_MB} MB')


平均内存占用:float64  0.004573822021484375 MB
平均内存占用:int64  0.0056858062744140625 MB
平均内存占用:object  0.04334386189778646 MB


In [100]:
import numpy as np
int_types = ['uint8','int8','int16','int32','int64']
for it in int_types:
    print(np.iinfo((it)))

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------



In [101]:
titanic.select_dtypes(include='int64')

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch
0,1,0,3,1,0
1,2,1,1,1,0
2,3,1,3,0,0
3,4,1,1,1,0
4,5,0,3,0,0
...,...,...,...,...,...
886,887,0,2,0,0
887,888,1,1,0,0
888,889,0,3,1,2
889,890,1,1,0,0


### 自定义mem_usage函数

In [102]:
def mem_usage(pandas_obj):    # isinstance()用来判断一个对象是否是一个已知的类型
    if isinstance(pandas_obj,pd.DataFrame):
        usage_B = pandas_obj.memory_usage(deep=True).sum()
    else:
        usage_B = pandas_obj.memory_usage(deep=True)
    usage_MB =  usage_B/(1024**2)
    return '{:03.4f} MB'.format(usage_MB)
titanic_int = titanic.select_dtypes(include='int64')  # 取出所有int64类型的数据
'''
pd.to_numeric用于将 DataFrame 中的列转换为数值类型（如整数或浮点数）它会尝试将列中的值转换为数值，如果无法转换，则保留为 NaN。
downcast 是 pd.to_numeric 函数的参数，用于指定在转换后的数据中使用尽可能小的整数类型。
'unsigned' 表示使用无符号整数类型。无符号整数只能表示非负数，因此可以存储更大的正数。
'''
converted_int = titanic_int.apply(pd.to_numeric,downcast='unsigned') # 将int64数据压缩
print(mem_usage(titanic_int))
print(mem_usage(converted_int))
converted_int.select_dtypes(include='int')

0.0341 MB
0.0052 MB


0
1
2
3
4
...
886
887
888
889
890


In [103]:
titanic_float = titanic.select_dtypes(include='float64')  # 取出所有float64类型的数据
converted_float = titanic_float.apply(pd.to_numeric,downcast='float') # 将float64数据压缩
print(mem_usage(titanic_float))
print(mem_usage(converted_float))

0.0137 MB
0.0069 MB


In [104]:
# 打印压缩后的内存占用
optimized_titanic = titanic.copy()
optimized_titanic[converted_int.columns] = converted_int
optimized_titanic[converted_float.columns] = converted_float

print(mem_usage(titanic))
print(mem_usage(optimized_titanic))

0.3076 MB
0.2720 MB


In [105]:
titanic_obj = titanic.select_dtypes(include='object').copy()
titanic_obj.describe()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [106]:
# object类型中有很多相同的字符串  比如Sex列只有两种不同的字符串
dow = titanic_obj.Sex
dow_cat = dow.astype('category')  #转变为category类型
dow_cat.head()

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: category
Categories (2, object): ['female', 'male']

In [107]:
dow_cat.head(10).cat.codes  # 类型转变后的编码格式

0    1
1    0
2    0
3    0
4    1
5    1
6    1
7    1
8    0
9    0
dtype: int8

In [108]:
print(mem_usage(dow))
print(mem_usage(dow_cat))

0.0526 MB
0.0012 MB


In [109]:
converted_obj = pd.DataFrame()  
for col in titanic_obj.columns:  # titanic_obj为数据中object类型的部分   从中取出每一列
    num_unique_values = len(titanic_obj[col].unique())  # 这一列中不同字符串的个数
    num_total_values = len(titanic_obj[col])            # 这一列全部值的个数
    if num_unique_values/num_total_values < 0.5:        # 如果这一列不同的字符串个数相对较少
        converted_obj.loc[:,col] = titanic_obj[col].astype('category')  # 将这一列的数据类型转变为categories进行压缩
    else:
        converted_obj.loc[:,col] = titanic_obj[col]     # 否则保持不变       

In [110]:
print(mem_usage(titanic_obj))
print(mem_usage(converted_obj))

0.2601 MB
0.1420 MB
