In [1]:
import numpy as np
import pandas as pd

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

## user表

In [3]:
train_user = reduce_mem_usage(pd.read_csv('./train/train_user.csv'))
test_user = reduce_mem_usage(pd.read_csv('./test2/test_user.csv'))

Memory usage of dataframe is 635152.00 MB
Memory usage after optimization is: 256580.00 MB
Decreased by 59.6%
Memory usage of dataframe is 58128.00 MB
Memory usage after optimization is: 47978.00 MB
Decreased by 17.5%


In [4]:
train_user.to_hdf('./train/train_user.h5', 'df')
test_user.to_hdf('./test2/test_user.h5', 'df')

## voc表

In [5]:
train_voc = reduce_mem_usage(pd.read_csv('./train/train_voc.csv'))
test_voc = reduce_mem_usage(pd.read_csv('./test2/test_voc.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


Memory usage of dataframe is 320987648.00 MB
Memory usage after optimization is: 255787058.00 MB
Decreased by 20.3%
Memory usage of dataframe is 9477696.00 MB
Memory usage after optimization is: 7552565.00 MB
Decreased by 20.3%


In [6]:
train_voc.to_hdf('./train/train_voc.h5', 'df')
test_voc.to_hdf('./test2/test_voc.h5', 'df')

## sms表

In [7]:
train_sms = reduce_mem_usage(pd.read_csv('./train/train_sms.csv'))
test_sms = reduce_mem_usage(pd.read_csv('./test2/test_sms.csv'))

Memory usage of dataframe is 219152416.00 MB
Memory usage after optimization is: 171212853.00 MB
Decreased by 21.9%
Memory usage of dataframe is 4308128.00 MB
Memory usage after optimization is: 3365753.00 MB
Decreased by 21.9%


In [8]:
train_sms.to_hdf('./train/train_sms.h5', 'df')
test_sms.to_hdf('./test2/test_sms.h5', 'df')

## app表

In [9]:
train_app = reduce_mem_usage(pd.read_csv('./train/train_app.csv'))
test_app = reduce_mem_usage(pd.read_csv('./test2/test_app.csv'))

Memory usage of dataframe is 105075392.00 MB
Memory usage after optimization is: 91940984.00 MB
Decreased by 12.5%
Memory usage of dataframe is 3183008.00 MB
Memory usage after optimization is: 2387288.00 MB
Decreased by 25.0%


In [10]:
train_app.to_hdf('./train/train_app.h5', 'df')
test_app.to_hdf('./test2/test_app.h5', 'df')