In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import re

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
# dtype_ = {'sid': str,
#           'package': str,
#           'version': str,
#           'android_id': str,
#           'media_id': str,
#           'apptype': str,
#           'location': str,
#           'cus_type': str,
#           'carrier': str,
#           'os': str,
#           'osv': str,
#           'lan': str}

dtype_ = {'carrier': str}

In [4]:
train = reduce_mem_usage(pd.read_csv('train.csv', dtype=dtype_, index_col=0))
test1 = reduce_mem_usage(pd.read_csv('test1.csv', dtype=dtype_,index_col=0))

Memory usage of dataframe is 84000000.00 MB
Memory usage after optimization is: 47500000.00 MB
Decreased by 43.5%
Memory usage of dataframe is 24000000.00 MB
Memory usage after optimization is: 14100000.00 MB
Decreased by 41.2%


In [5]:
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46000.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,1559893000000.0,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,1559994000000.0,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,1559837000000.0,0,1392806005,2,628911675,696
3,68983,1004,46000.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,1560042000000.0,0,3562553457,3,1283809327,753
4,288999,1076,46000.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,1559867000000.0,5,2364522023,4,1510695983,582


In [6]:
test1.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,317625,1181,46000.0,2196.0,2.0,1080.0,CN,639,2.0,Android,8.1.0,188,1440682,1559872000000.0,7,1672223856,57,3872258917,658
1,435108,944,46003.0,2280.0,3.0,1080.0,zh-CN,704,6.0,Android,8.1.0,221,1606824,1559739000000.0,3,3767901757,23,129322164,943
2,0,1106,46000.0,0.0,0.0,0.0,,39,2.0,android,5.1,1562,1774642,1559614000000.0,0,454638703,30,4226678391,411
3,451504,761,46000.0,1344.0,0.0,720.0,,54,2.0,android,7.1.1,9,1742535,1559668000000.0,0,1507622951,65,3355419572,848
4,0,1001,46000.0,665.0,0.0,320.0,zh-CN,29,5.0,Android,8.1.0,4,1689686,1559694000000.0,0,4116351093,148,2644467751,411


In [7]:
def timestamp_to_str(timestamp):
    return datetime.fromtimestamp(timestamp / 1000).strftime('%Y-%m-%d %H:%M:%S')

In [8]:
train['timestamp'] = train['timestamp'].apply(timestamp_to_str)
test1['timestamp'] = test1['timestamp'].apply(timestamp_to_str)

In [9]:
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46000.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,2019-06-07 15:32:01,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,2019-06-08 19:40:40,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,2019-06-06 23:59:13,0,1392806005,2,628911675,696
3,68983,1004,46000.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,2019-06-09 09:00:12,0,3562553457,3,1283809327,753
4,288999,1076,46000.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,2019-06-07 08:28:13,5,2364522023,4,1510695983,582


In [10]:
test1.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,317625,1181,46000.0,2196.0,2.0,1080.0,CN,639,2.0,Android,8.1.0,188,1440682,2019-06-07 09:42:30,7,1672223856,57,3872258917,658
1,435108,944,46003.0,2280.0,3.0,1080.0,zh-CN,704,6.0,Android,8.1.0,221,1606824,2019-06-05 20:53:56,3,3767901757,23,129322164,943
2,0,1106,46000.0,0.0,0.0,0.0,,39,2.0,android,5.1,1562,1774642,2019-06-04 10:07:42,0,454638703,30,4226678391,411
3,451504,761,46000.0,1344.0,0.0,720.0,,54,2.0,android,7.1.1,9,1742535,2019-06-05 01:03:22,0,1507622951,65,3355419572,848
4,0,1001,46000.0,665.0,0.0,320.0,zh-CN,29,5.0,Android,8.1.0,4,1689686,2019-06-05 08:15:54,0,4116351093,148,2644467751,411


In [11]:
# ID
id_feat = ['sid', 'android_id']
# 类别变量
cat_feat = ['package', 'version', 'media_id', 'apptype', 'location', 'cus_type',
            'ntt', 'carrier', 'osv', 'lan']
# 数值变量
numeric_feat = ['fea_hash', 'fea1_hash', 'dev_height', 'dev_width', 'dev_ppi']
# 时间变量
time_feat = ['timestamp']

In [12]:
def analysis(df, col):
    print(i)
    print('unique: \n', df[col].unique())
    print('value_counts: \n', df[col].value_counts())
    print('nunique: \n', df[col].nunique())
    print('-' * 20)
    print('\n')

In [13]:
for i in cat_feat:
    if i != 'package':
        analysis(train, i)
        print('-' * 10)
        analysis(test1, i)
        print('*' * 20)

version
unique: 
 ['8' '4' '0' '5' '9' '7' '3' '1' '6' '11' '2' 'v1' 'V3' 'GA3' '10'
 'P_Final_6' '15' 'V6' ' 2' 'GA2' 'V2' '50']
value_counts: 
 0            292156
5             56691
8             38348
4             24262
3             23857
7             22801
2             12750
1              9184
11             8867
6              8527
V3              806
9               765
v1              611
10              126
P_Final_6        92
V6               88
GA3              35
GA2              10
15                9
V2                8
 2                6
50                1
Name: version, dtype: int64
nunique: 
 22
--------------------


----------
version
unique: 
 ['7' '3' '0' '5' '1' '11' '2' '4' '6' '8' 'v1' 'V3' 'V6' '9' 'GA3' '10'
 '15' 'P_Final_6' 'GA2' 'V2' '50' ' 2' '20']
value_counts: 
 0            88113
5            16768
8            11410
4             7273
3             7160
7             6831
2             3733
1             2722
11            2711
6             24

 2.0    318597
6.0    116548
5.0     35021
0.0     20617
3.0      4463
4.0      4446
7.0       306
1.0         2
Name: ntt, dtype: int64
nunique: 
 8
--------------------


----------
ntt
unique: 
 [2. 6. 5. 0. 4. 3. 7.]
value_counts: 
 2.0    95538
6.0    34875
5.0    10521
0.0     6182
3.0     1466
4.0     1346
7.0       72
Name: ntt, dtype: int64
nunique: 
 7
--------------------


********************
carrier
unique: 
 ['46000.0' '0.0' '46003.0' '-1.0' '46001.0']
value_counts: 
 46000.0    359409
46001.0     43390
0.0         40652
46003.0     32294
-1.0        24255
Name: carrier, dtype: int64
nunique: 
 5
--------------------


----------
carrier
unique: 
 ['46000.0' '46003.0' '-1.0' '0.0' '46001.0']
value_counts: 
 46000.0    107691
46001.0     13083
0.0         12160
46003.0      9707
-1.0         7359
Name: carrier, dtype: int64
nunique: 
 5
--------------------


********************
osv
unique: 
 ['9' '8.1' '8.1.0' '8.0.0' '5.1' '9.0.0' '7.1.1' '5.1.1' '7.8.7' '4.4.4'
 'nan'

value_counts: 
 zh-CN          244474
nan            183280
zh              30574
cn              20685
zh_CN           11602
Zh-CN            8122
zh-cn             642
ZH                185
CN                168
tw                119
en                 89
zh_CN_#Hans        26
ko                 11
zh-TW               7
en-US               3
zh-HK               3
TW                  2
en-GB               2
ja                  2
it                  2
mi                  1
zh-MO               1
Name: lan, dtype: int64
nunique: 
 22
--------------------


----------
lan
unique: 
 ['CN' 'zh-CN' 'nan' 'cn' 'Zh-CN' 'zh_CN' 'zh' 'ZH' 'zh-cn' 'zh_CN_#Hans'
 'en' 'zh-TW' 'ko' 'tw' 'en-GB' 'TW' 'en-US' 'zh-HK' 'en_US' 'zh-US'
 'in_ID']
value_counts: 
 zh-CN          73046
nan            55547
zh              9094
cn              6136
zh_CN           3391
Zh-CN           2430
zh-cn            165
ZH                60
CN                46
tw                28
en                26
zh_CN_#Hans    

### os 删除

In [14]:
# os列取值唯一，删除
train.drop('os', axis=1, inplace=True)
test1.drop('os', axis=1, inplace=True)

### ntt, dev_height, dev_ppi, dev_width

In [15]:
train['ntt'].replace(0., np.nan, inplace=True)
test1['ntt'].replace(0., np.nan, inplace=True)

train['dev_height'].replace(0., np.nan, inplace=True)
test1['dev_height'].replace(0., np.nan, inplace=True)

train['dev_ppi'].replace(0., np.nan, inplace=True)
test1['dev_ppi'].replace(0., np.nan, inplace=True)

train['dev_width'].replace(0., np.nan, inplace=True)
test1['dev_width'].replace(0., np.nan, inplace=True)

### carrier 运营商

In [16]:
# 后续做独热编码
carrier_map = {'46000.0': 1,
               '0.0': np.nan,
               '46003.0': 2,
               '-1.0': np.nan,
               '46001.0': 3}

train['carrier'] = train['carrier'].map(carrier_map)
test1['carrier'] = test1['carrier'].map(carrier_map)

### lan 语言

In [17]:
lan_map = {'nan': np.nan,
           'zh-CN': 1,
           'zh': 1,
           'cn': 1,
           'zh-cn': 1,
           'zh_CN': 1,
           'Zh-CN': 1,
           'Zh-CN': 1,
           'ZH': 1,
           'en': 3,
           'CN': 1,
           'en-GB': 3,
           'tw': 2,
           'TW': 2,
           'zh-TW': 2,
           'zh_CN_#Hans': 10,
           'zh-HK': 4,
           'en-US': 5,
           'en-US': 5,
           'en_US': 5,
           'ko': 6,  # 韩国
           'zh-MO': 7,  # 澳门
           'it': 8,  # 意大利
           'mi': 9,
           'ja': 10,  # 日本
           'zh-US': np.nan,
           'in_ID': np.nan}

train['lan'] = train['lan'].map(lan_map)
test1['lan'] = test1['lan'].map(lan_map)

### version 版本

In [18]:
version_map = {'8': 8,
               '4': 4,
               '0': 0,
               '5': 5,
               '9': 9,
               '7': 7,
               '3': 3,
               '1': 1,
               '6': 6,
               '11': 11,
               '2': 2,
               'v1': 1,
               'V3': 3,
               'GA3': 3,
               '10': 10,
               'P_Final_6': 6,
               '15': 15,
               'V6': 6,
               ' 2': 2,
               'GA2': 2,
               'V2': 2,
               '50': 5,
               '20': 2}

train['version'] = train['version'].map(version_map)
test1['version'] = test1['version'].map(version_map)

### osv 处理

In [19]:
train['osv'].replace('f073b_changxiang_v01_b1b8_20180915', '1', inplace=True)
train['osv'].replace('%E6%B1%9F%E7%81%B5OS+5.0', '5', inplace=True)
# 7910, 21100, 21000, 7930, 71200, 7920
patt_7 = re.compile('7910|7930|71200|7920|71300')
train['osv'].replace(patt_7, '7', inplace=True)
patt_2 = re.compile('21100|21000')
train['osv'].replace(patt_2, '2', inplace=True)

# GIONEE_YNGA
test1['osv'].replace('GIONEE_YNGA', 'nan', inplace=True)
test1['osv'].replace('12.0', '11', inplace=True)

# 7910, 21100, 21000, 7930, 71200, 7920
test1['osv'].replace(patt_7, '7', inplace=True)
test1['osv'].replace(patt_2, '2', inplace=True)


def osv_process(s): 
    res = re.split('\.', s)[0]
    if s == 'nan':
        return res
    elif res in [str(i) for i in range(1, 13)]:
        return int(res)
    else:
        res2 = re.split('\_|\ ', res)[-1]
        if res2 not in [str(i) for i in range(1, 13)]:
            print(res2)
        return int(res2)

print('train')    
train['osv'] = train['osv'].apply(osv_process)
print('test')
test1['osv'] = test1['osv'].apply(osv_process)

train['osv'].replace('nan', np.nan, inplace=True)
test1['osv'].replace('nan', np.nan, inplace=True)

train
test


In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 19 columns):
android_id    500000 non-null int32
apptype       500000 non-null int16
carrier       435093 non-null float64
dev_height    392986 non-null float16
dev_ppi       116709 non-null float16
dev_width     392989 non-null float16
label         500000 non-null int8
lan           316720 non-null float64
media_id      500000 non-null int16
ntt           479383 non-null float16
osv           493439 non-null float64
package       500000 non-null int16
sid           500000 non-null int32
timestamp     500000 non-null object
version       500000 non-null int64
fea_hash      500000 non-null object
location      500000 non-null int16
fea1_hash     500000 non-null int64
cus_type      500000 non-null int16
dtypes: float16(4), float64(3), int16(5), int32(2), int64(2), int8(1), object(2)
memory usage: 43.4+ MB


In [21]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 18 columns):
android_id    150000 non-null int32
apptype       150000 non-null int16
carrier       130481 non-null float64
dev_height    117801 non-null float16
dev_ppi       34667 non-null float16
dev_width     117801 non-null float16
lan           94451 non-null float64
media_id      150000 non-null int16
ntt           143818 non-null float16
osv           148007 non-null float64
package       150000 non-null int16
sid           150000 non-null int32
timestamp     150000 non-null object
version       150000 non-null int64
fea_hash      150000 non-null object
location      150000 non-null int16
fea1_hash     150000 non-null int64
cus_type      150000 non-null int16
dtypes: float16(4), float64(3), int16(5), int32(2), int64(2), object(2)
memory usage: 12.9+ MB


### fea_hash 脏数据处理

In [22]:
train['fea_hash'] = train['fea_hash'].astype(str)
test1['fea_hash'] = test1['fea_hash'].astype(str)

In [23]:
train['fea_hash'] = train['fea_hash'].apply(lambda x: np.nan if ':' in x else int(x))
test1['fea_hash'] = test1['fea_hash'].apply(lambda x: np.nan if ':' in x else int(x))

In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 19 columns):
android_id    500000 non-null int32
apptype       500000 non-null int16
carrier       435093 non-null float64
dev_height    392986 non-null float16
dev_ppi       116709 non-null float16
dev_width     392989 non-null float16
label         500000 non-null int8
lan           316720 non-null float64
media_id      500000 non-null int16
ntt           479383 non-null float16
osv           493439 non-null float64
package       500000 non-null int16
sid           500000 non-null int32
timestamp     500000 non-null object
version       500000 non-null int64
fea_hash      499910 non-null float64
location      500000 non-null int16
fea1_hash     500000 non-null int64
cus_type      500000 non-null int16
dtypes: float16(4), float64(4), int16(5), int32(2), int64(2), int8(1), object(1)
memory usage: 43.4+ MB


In [25]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 18 columns):
android_id    150000 non-null int32
apptype       150000 non-null int16
carrier       130481 non-null float64
dev_height    117801 non-null float16
dev_ppi       34667 non-null float16
dev_width     117801 non-null float16
lan           94451 non-null float64
media_id      150000 non-null int16
ntt           143818 non-null float16
osv           148007 non-null float64
package       150000 non-null int16
sid           150000 non-null int32
timestamp     150000 non-null object
version       150000 non-null int64
fea_hash      149977 non-null float64
location      150000 non-null int16
fea1_hash     150000 non-null int64
cus_type      150000 non-null int16
dtypes: float16(4), float64(4), int16(5), int32(2), int64(2), object(1)
memory usage: 12.9+ MB


### 导出成h5文件

In [26]:
train.to_hdf('train.h5', 'df')
test1.to_hdf('test1.h5', 'df')