In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import re

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
dtype_ = {'sid': str,
          'package': str,
          'version': str,
          'android_id': str,
          'media_id': str,
          'carrier': str,
          'os': str,
          'osv': str,
          'lan': str}

In [4]:
train = reduce_mem_usage(pd.read_csv('train.csv', dtype=dtype_, index_col=0))
test1 = reduce_mem_usage(pd.read_csv('test1.csv', dtype=dtype_, index_col=0))

Memory usage of dataframe is 84000000.00 MB
Memory usage after optimization is: 57500000.00 MB
Decreased by 31.5%
Memory usage of dataframe is 24000000.00 MB
Memory usage after optimization is: 17100000.00 MB
Decreased by 28.8%


In [5]:
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46000.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,1559893000000.0,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,1559994000000.0,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,1559837000000.0,0,1392806005,2,628911675,696
3,68983,1004,46000.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,1560042000000.0,0,3562553457,3,1283809327,753
4,288999,1076,46000.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,1559867000000.0,5,2364522023,4,1510695983,582


In [6]:
test1.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,317625,1181,46000.0,2196.0,2.0,1080.0,CN,639,2.0,Android,8.1.0,188,1440682,1559872000000.0,7,1672223856,57,3872258917,658
1,435108,944,46003.0,2280.0,3.0,1080.0,zh-CN,704,6.0,Android,8.1.0,221,1606824,1559739000000.0,3,3767901757,23,129322164,943
2,0,1106,46000.0,0.0,0.0,0.0,,39,2.0,android,5.1,1562,1774642,1559614000000.0,0,454638703,30,4226678391,411
3,451504,761,46000.0,1344.0,0.0,720.0,,54,2.0,android,7.1.1,9,1742535,1559668000000.0,0,1507622951,65,3355419572,848
4,0,1001,46000.0,665.0,0.0,320.0,zh-CN,29,5.0,Android,8.1.0,4,1689686,1559694000000.0,0,4116351093,148,2644467751,411


In [7]:
def timestamp_to_str(timestamp):
    return datetime.fromtimestamp(timestamp / 1000).strftime('%Y-%m-%d %H:%M:%S')

In [8]:
train['timestamp'] = train['timestamp'].apply(timestamp_to_str)
test1['timestamp'] = test1['timestamp'].apply(timestamp_to_str)

In [9]:
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46000.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,2019-06-07 15:32:01,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,2019-06-08 19:40:40,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,2019-06-06 23:59:13,0,1392806005,2,628911675,696
3,68983,1004,46000.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,2019-06-09 09:00:12,0,3562553457,3,1283809327,753
4,288999,1076,46000.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,2019-06-07 08:28:13,5,2364522023,4,1510695983,582


In [10]:
test1.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,317625,1181,46000.0,2196.0,2.0,1080.0,CN,639,2.0,Android,8.1.0,188,1440682,2019-06-07 09:42:30,7,1672223856,57,3872258917,658
1,435108,944,46003.0,2280.0,3.0,1080.0,zh-CN,704,6.0,Android,8.1.0,221,1606824,2019-06-05 20:53:56,3,3767901757,23,129322164,943
2,0,1106,46000.0,0.0,0.0,0.0,,39,2.0,android,5.1,1562,1774642,2019-06-04 10:07:42,0,454638703,30,4226678391,411
3,451504,761,46000.0,1344.0,0.0,720.0,,54,2.0,android,7.1.1,9,1742535,2019-06-05 01:03:22,0,1507622951,65,3355419572,848
4,0,1001,46000.0,665.0,0.0,320.0,zh-CN,29,5.0,Android,8.1.0,4,1689686,2019-06-05 08:15:54,0,4116351093,148,2644467751,411


In [11]:
for i in train.columns:
    print(i)
    print(train[i].dtype)
    print('*' * 20)

android_id
object
********************
apptype
int16
********************
carrier
object
********************
dev_height
float16
********************
dev_ppi
float16
********************
dev_width
float16
********************
label
int8
********************
lan
object
********************
media_id
object
********************
ntt
float16
********************
os
object
********************
osv
object
********************
package
object
********************
sid
object
********************
timestamp
object
********************
version
object
********************
fea_hash
object
********************
location
int16
********************
fea1_hash
int64
********************
cus_type
int16
********************


In [12]:
def analysis(df, col):
    print(i)
    print('unique: \n', df[col].unique())
    print('value_counts: \n', df[col].value_counts())
    print('nunique: \n', df[col].nunique())
    print('-' * 20)
    print('\n')

In [13]:
for i in train.columns:
    if i not in ['label', 'sid', 'android_id', 'media_id', 'timestamp', 'dev_height', 'dev_width', 'dev_ppi', 'fea_hash', 'fea1_hash']:
        analysis(train, i)

apptype
unique: 
 [1199  893  821 1004 1076  788 1106 1001  761  869 1223  917  941  938
 1052 1055  845  758 1211  929  884  740  956  719  908 1028  806 1007
  764 1034 1172 1082  743  980 1064  746 1193  716  767 1010  947 1139
 1067 1235  779  722  734  944 1088  974 1124 1238 1169  704  923 1043
  959 1130  899  851  827   95  776  713  833 1181 1019  782  803 1136
  794 1232 1229  905 1115 1241  854  728 1112  989 1190 1031 1085  911
 1178 1100 1046 1097  857]
value_counts: 
 1001    101248
1106     46803
1076     40183
761      39426
1052     36588
917      29210
893      27155
1199     23212
1055     20587
1004     16721
941      13838
758      11543
1223     10358
938       7058
788       6755
845       6457
764       5135
956       4972
1193      4859
929       4298
1028      4228
719       3237
947       2835
734       2744
1172      2599
1238      2350
716       2232
1088      2176
1082      2071
1064      1737
740       1705
704       1557
821       1504
1007      1488
106

value_counts: 
 0       131906
4        90513
9        37005
18       22367
14       22139
24       14636
16       11098
5        10302
1         7058
2         6876
17        6535
35        5927
33        5882
26        5158
62        4972
38        4907
27        4839
68        4027
25        3619
61        3528
7         3439
52        2835
42        2740
78        2562
101       2516
6         2135
71        2040
54        1690
11        1641
39        1594
81        1534
50        1488
43        1478
29        1433
8         1415
76        1412
19        1392
90        1370
53        1287
44        1264
88        1251
21        1230
28        1201
10        1113
3         1088
22        1014
55        1002
51         971
163        916
30         914
84         902
23         895
72         895
65         865
41         784
108        778
48         761
111        749
20         705
69         691
96         663
15         653
128        652
87         651
170        645
141      

nunique: 
 1950
--------------------


version
unique: 
 ['8' '4' '0' '5' '9' '7' '3' '1' '6' '11' '2' 'v1' 'V3' 'GA3' '10'
 'P_Final_6' '15' 'V6' ' 2' 'GA2' 'V2' '50']
value_counts: 
 0            292156
5             56691
8             38348
4             24262
3             23857
7             22801
2             12750
1              9184
11             8867
6              8527
V3              806
9               765
v1              611
10              126
P_Final_6        92
V6               88
GA3              35
GA2              10
15                9
V2                8
 2                6
50                1
Name: version, dtype: int64
nunique: 
 22
--------------------


location
unique: 
 [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  -1  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70

In [14]:
for i in test1.columns:
    if i not in ['label', 'sid', 'android_id', 'media_id', 'timestamp', 'dev_height', 'dev_width', 'dev_ppi', 'fea_hash', 'fea1_hash']:
        analysis(test1, i)

apptype
unique: 
 [1181  944 1106  761 1001 1052 1076  941  980  716  734  788 1223 1004
  938  947  758  845  893  917 1199 1055 1028  956 1007  740 1193  764
  704 1064 1211 1067 1019 1172  929 1238 1088  743 1082  899  722  908
  719  833  974 1010 1034  779 1169  821  767  851  803 1043 1232 1130
  806  869 1235 1139   95  959  884  782  827  794  923  776 1241  746
  713 1124 1136  905  854 1112 1178 1229]
value_counts: 
 1001    30377
1106    14274
761     12094
1076    11963
1052    10719
917      8642
893      8181
1199     6889
1055     6047
1004     4977
941      4151
758      3480
1223     3150
938      2138
788      1995
845      1972
764      1588
956      1467
1193     1400
1028     1308
929      1257
719      1037
734       887
947       803
1172      755
1238      667
1088      664
716       619
1082      616
1064      522
740       501
1007      481
821       472
704       420
1067      391
944       337
779       283
908       257
743       188
1211      145
851      

In [15]:
# os列取值唯一，删除
train.drop('os', axis=1, inplace=True)
test1.drop('os', axis=1, inplace=True)

In [16]:
train['ntt'].replace(0., np.nan, inplace=True)
test1['ntt'].replace(0., np.nan, inplace=True)

train['dev_height'].replace(0., np.nan, inplace=True)
test1['dev_height'].replace(0., np.nan, inplace=True)

train['dev_ppi'].replace(0., np.nan, inplace=True)
test1['dev_ppi'].replace(0., np.nan, inplace=True)

train['dev_width'].replace(0., np.nan, inplace=True)
test1['dev_width'].replace(0., np.nan, inplace=True)

In [17]:
carrier_map = {'46000.0': 1,
               '0.0': np.nan,
               '46003.0': 3,
               '-1.0': np.nan,
               '46001.0': 2}

train['carrier'] = train['carrier'].map(carrier_map)
test1['carrier'] = test1['carrier'].map(carrier_map)

In [18]:
 lan_map = {'nan': np.nan,
           'zh-CN': 1,
           'zh': 1,
           'cn': 1,
           'zh-cn': 1,
           'zh_CN': 1,
           'Zh-CN': 1,
           'Zh-CN': 1,
           'ZH': 1,
           'en': 3,
           'CN': 1,
           'en-GB': 3,
           'tw': 2,
           'TW': 2,
           'zh-TW': 2,
           'zh_CN_#Hans': 10,
           'zh-HK': 4,
           'en-US': 5,
           'en-US': 5,
           'en_US': 5,
           'ko': 6,  # 韩国
           'zh-MO': 7,  # 澳门
           'it': 8,  # 意大利
           'mi': 9,
           'ja': 10,  # 日本
           'zh-US': np.nan,
           'in_ID': np.nan}

train['lan'] = train['lan'].map(lan_map)
test1['lan'] = test1['lan'].map(lan_map)

In [19]:
version_map = {'8': 8,
               '4': 4,
               '0': 0,
               '5': 5,
               '9': 9,
               '7': 7,
               '3': 3,
               '1': 1,
               '6': 6,
               '11': 11,
               '2': 2,
               'v1': 1,
               'V3': 3,
               'GA3': 3,
               '10': 10,
               'P_Final_6': 6,
               '15': 15,
               'V6': 6,
               ' 2': 2,
               'GA2': 2,
               'V2': 2,
               '50': 5,
               '20': 2}

train['version'] = train['version'].map(version_map)
test1['version'] = test1['version'].map(version_map)

In [20]:
train['osv'].replace('f073b_changxiang_v01_b1b8_20180915', '1', inplace=True)
train['osv'].replace('%E6%B1%9F%E7%81%B5OS+5.0', '5', inplace=True)
# 7910, 21100, 21000, 7930, 71200, 7920
patt_7 = re.compile('7910|7930|71200|7920|71300')
train['osv'].replace(patt_7, '7', inplace=True)
patt_2 = re.compile('21100|21000')
train['osv'].replace(patt_2, '2', inplace=True)

# GIONEE_YNGA
test1['osv'].replace('GIONEE_YNGA', 'nan', inplace=True)
test1['osv'].replace('12.0', '11', inplace=True)

# 7910, 21100, 21000, 7930, 71200, 7920
test1['osv'].replace(patt_7, '7', inplace=True)
test1['osv'].replace(patt_2, '2', inplace=True)


def osv_process(s): 
    res = re.split('\.', s)[0]
    if s == 'nan':
        return res
    elif res in [str(i) for i in range(1, 13)]:
        return int(res)
    else:
        res2 = re.split('\_|\ ', res)[-1]
        if res2 not in [str(i) for i in range(1, 13)]:
            print(res2)
        return int(res2)

print('train')    
train['osv'] = train['osv'].apply(osv_process)
print('test')
test1['osv'] = test1['osv'].apply(osv_process)

train['osv'].replace('nan', np.nan, inplace=True)
test1['osv'].replace('nan', np.nan, inplace=True)

train
test


In [21]:
for i in train.columns:
    if i not in ['label', 'sid', 'android_id', 'media_id', 'timestamp', 'dev_height', 'dev_width', 'dev_ppi', 'fea_hash', 'fea1_hash']:
        analysis(train, i)

apptype
unique: 
 [1199  893  821 1004 1076  788 1106 1001  761  869 1223  917  941  938
 1052 1055  845  758 1211  929  884  740  956  719  908 1028  806 1007
  764 1034 1172 1082  743  980 1064  746 1193  716  767 1010  947 1139
 1067 1235  779  722  734  944 1088  974 1124 1238 1169  704  923 1043
  959 1130  899  851  827   95  776  713  833 1181 1019  782  803 1136
  794 1232 1229  905 1115 1241  854  728 1112  989 1190 1031 1085  911
 1178 1100 1046 1097  857]
value_counts: 
 1001    101248
1106     46803
1076     40183
761      39426
1052     36588
917      29210
893      27155
1199     23212
1055     20587
1004     16721
941      13838
758      11543
1223     10358
938       7058
788       6755
845       6457
764       5135
956       4972
1193      4859
929       4298
1028      4228
719       3237
947       2835
734       2744
1172      2599
1238      2350
716       2232
1088      2176
1082      2071
1064      1737
740       1705
704       1557
821       1504
1007      1488
106

unique: 
 [ 8  4  0  5  9  7  3  1  6 11  2 10 15]
value_counts: 
 0     292156
5      56692
8      38348
3      24698
4      24262
7      22801
2      12774
1       9795
11      8867
6       8707
9        765
10       126
15         9
Name: version, dtype: int64
nunique: 
 13
--------------------


location
unique: 
 [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  -1  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
 161 162 163 164 165 16

In [22]:
for i in test1.columns:
    if i not in ['label', 'sid', 'android_id', 'media_id', 'timestamp', 'dev_height', 'dev_width', 'dev_ppi', 'fea_hash', 'fea1_hash']:
        analysis(test1, i)

apptype
unique: 
 [1181  944 1106  761 1001 1052 1076  941  980  716  734  788 1223 1004
  938  947  758  845  893  917 1199 1055 1028  956 1007  740 1193  764
  704 1064 1211 1067 1019 1172  929 1238 1088  743 1082  899  722  908
  719  833  974 1010 1034  779 1169  821  767  851  803 1043 1232 1130
  806  869 1235 1139   95  959  884  782  827  794  923  776 1241  746
  713 1124 1136  905  854 1112 1178 1229]
value_counts: 
 1001    30377
1106    14274
761     12094
1076    11963
1052    10719
917      8642
893      8181
1199     6889
1055     6047
1004     4977
941      4151
758      3480
1223     3150
938      2138
788      1995
845      1972
764      1588
956      1467
1193     1400
1028     1308
929      1257
719      1037
734       887
947       803
1172      755
1238      667
1088      664
716       619
1082      616
1064      522
740       501
1007      481
821       472
704       420
1067      391
944       337
779       283
908       257
743       188
1211      145
851      

In [23]:
train.to_hdf('train.h5', 'df')
test1.to_hdf('test1.h5', 'df')