In [1]:
import numpy as np
import pandas as pd
import os
import gc

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
# dtype_ = {'sid': str,
#           'package': str,
#           'version': str,
#           'android_id': str,
#           'media_id': str,
#           'carrier': str,
#           'os': str,
#           'osv': str,
#           'lan': str}

In [4]:
train = reduce_mem_usage(pd.read_csv('train.csv', index_col=0))
test1 = reduce_mem_usage(pd.read_csv('test1.csv', index_col=0))

Memory usage of dataframe is 84000000.00 MB
Memory usage after optimization is: 42717672.00 MB
Decreased by 49.1%
Memory usage of dataframe is 24000000.00 MB
Memory usage after optimization is: 14881432.00 MB
Decreased by 38.0%


In [5]:
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46016.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,1559893000000.0,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,1559994000000.0,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,1559837000000.0,0,1392806005,2,628911675,696
3,68983,1004,46016.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,1560042000000.0,0,3562553457,3,1283809327,753
4,288999,1076,46016.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,1559867000000.0,5,2364522023,4,1510695983,582


In [6]:
test1.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,317625,1181,46016.0,2196.0,2.0,1080.0,CN,639,2.0,Android,8.1.0,188,1440682,1559872000000.0,7,1672223856,57,3872258917,658
1,435108,944,46016.0,2280.0,3.0,1080.0,zh-CN,704,6.0,Android,8.1.0,221,1606824,1559739000000.0,3,3767901757,23,129322164,943
2,0,1106,46016.0,0.0,0.0,0.0,,39,2.0,android,5.1,1562,1774642,1559614000000.0,0,454638703,30,4226678391,411
3,451504,761,46016.0,1344.0,0.0,720.0,,54,2.0,android,7.1.1,9,1742535,1559668000000.0,0,1507622951,65,3355419572,848
4,0,1001,46016.0,665.0,0.0,320.0,zh-CN,29,5.0,Android,8.1.0,4,1689686,1559694000000.0,0,4116351093,148,2644467751,411


In [7]:
for i in train.columns:
    print(i)
    print(train[i].dtype)
    print('*' * 20)

android_id
int32
********************
apptype
int16
********************
carrier
float16
********************
dev_height
float16
********************
dev_ppi
float16
********************
dev_width
float16
********************
label
int8
********************
lan
category
********************
media_id
int16
********************
ntt
float16
********************
os
category
********************
osv
category
********************
package
int16
********************
sid
int32
********************
timestamp
float32
********************
version
category
********************
fea_hash
category
********************
location
int16
********************
fea1_hash
int64
********************
cus_type
int16
********************
