In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import *
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from gensim.models import FastText, Word2Vec
import re
import random as rn
import gc
import logging
# Reproducibility settings: set PYTHONHASHSEED in this process's environment
# and seed NumPy's global random generator.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(1017)

# Directory that holds the raw CSV inputs; its contents are listed as the
# cell output.
path = "data/"
os.listdir(path)

['user_app_actived.csv',
 'age_train.csv',
 'age_test.csv',
 'app_info.csv',
 'train_data.csv',
 'user_basic_info.csv',
 'user_app_usage.csv',
 'user_behavior_info.csv']

In [2]:
# Data loading (needs speeding up)
def get_age_data():
    """Load the train and test id files and stack them into one frame.

    Reads ``age_train.csv`` and ``age_test.csv`` (both header-less) from
    ``path``, concatenates them with the train rows first and fills every
    missing cell with -1 (so rows without a label get ``age_group == -1``).

    Returns:
        DataFrame with the columns ``uId`` and ``age_group``. The original
        per-file row index is kept, so index values repeat.
    """
    frames = [
        pd.read_csv(path + file_name, header=None)
        for file_name in ('age_train.csv', 'age_test.csv')
    ]
    stacked = pd.concat(frames, axis=0, sort=False)
    stacked = stacked.fillna(-1)
    stacked.columns = ['uId', 'age_group']
    return stacked

def get_user_app_actived():
    """Load the header-less ``user_app_actived.csv`` from ``path``.

    Returns:
        DataFrame with the columns ``uId`` and ``appId``.
    """
    column_names = ['uId', 'appId']
    activated = pd.read_csv(path + 'user_app_actived.csv', header=None)
    activated.columns = column_names
    return activated

def get_user_behavior_info():
    """Load the header-less ``user_behavior_info.csv`` from ``path``.

    Returns:
        DataFrame whose nine columns are named ``uId``, ``bootTimes``,
        ``AFuncTimes`` ... ``FFuncTimes`` and ``FFuncSum``.
    """
    column_names = ['uId', 'bootTimes']
    # One usage counter per function A..F, followed by the F total.
    column_names += [letter + 'FuncTimes' for letter in 'ABCDEF']
    column_names.append('FFuncSum')

    behavior = pd.read_csv(path + 'user_behavior_info.csv', header=None)
    behavior.columns = column_names
    return behavior

def get_user_basic_info():
    """Load the header-less ``user_basic_info.csv`` from ``path``.

    Returns:
        DataFrame with thirteen columns: ``uId``, ``gender``, ``city``,
        ``prodName``, ``ramCapacity``, ``ramLeftRation``, ``romCapacity``,
        ``romLeftRation``, ``color``, ``fontSize``, ``ct``, ``carrier``, ``os``.
    """
    identity_cols = ['uId', 'gender', 'city', 'prodName']
    storage_cols = ['ramCapacity', 'ramLeftRation', 'romCapacity', 'romLeftRation']
    device_cols = ['color', 'fontSize', 'ct', 'carrier', 'os']

    basic_info = pd.read_csv(path + 'user_basic_info.csv', header=None)
    basic_info.columns = identity_cols + storage_cols + device_cols
    return basic_info

def get_app_info():
    """Load the header-less ``app_info.csv`` from ``path``.

    Returns:
        DataFrame with the columns ``appId`` and ``category``.
    """
    column_names = ['appId', 'category']
    app_info = pd.read_csv(path + 'app_info.csv', header=None)
    app_info.columns = column_names
    return app_info

# Use less_data=True while testing;
# switch to less_data=False when extracting features.
def get_user_app_usage(less_data=False, sample_rows=2000000):
    """Load the header-less ``user_app_usage.csv`` from ``path``.

    Args:
        less_data: When True, only the first ``sample_rows`` rows are read,
            which is meant for quick experiments. When False (default) the
            whole file is read.
        sample_rows: Number of leading rows to read when ``less_data`` is
            True. Ignored otherwise.

    Returns:
        DataFrame with the columns ``uId``, ``appId``, ``duration``,
        ``times`` and ``use_date``.
    """
    # header=None is required in both modes: the file has no header row, so
    # letting pandas infer one would swallow the first record as column names.
    row_limit = sample_rows if less_data else None
    data = pd.read_csv(path + 'user_app_usage.csv', header=None, nrows=row_limit)
    data.columns = ['uId', 'appId', 'duration', 'times', 'use_date']
    return data

In [3]:
import warnings

# All user ids (train rows first, then test rows) with their age label.
id_label_data = get_age_data()
# Register DataFrame/Series.progress_apply. The bar description has to be
# passed by keyword; a positional string is not used as the description.
tqdm.pandas(desc='获取特征')
warnings.filterwarnings("ignore")

# Behaviour features: every behaviour column, aligned row-by-row with
# id_label_data through the left merge.
data = get_user_behavior_info()
data = pd.merge(id_label_data, data, on='uId', how='left')
feature = data.drop(columns=['age_group', 'uId'])

# Make sure the output directory exists so the cell works on a fresh checkout.
os.makedirs('feature', exist_ok=True)
feature.to_csv('feature/f1.csv', index=False)

In [4]:
import warnings
from sklearn.preprocessing import LabelEncoder

# All user ids (train rows first, then test rows) with their age label.
id_label_data = get_age_data()
# Register progress_apply. The bar description has to be passed by keyword;
# a positional string is not used as the description.
tqdm.pandas(desc='获取特征')
warnings.filterwarnings("ignore")

# Basic user / device profile features
data = get_user_basic_info()
data = pd.merge(id_label_data, data, on='uId', how='left')

# .copy() gives an independent frame: the column assignments below would
# otherwise be chained assignments on a slice of `data`.
feature = data[['gender', 'ramCapacity', 'ramLeftRation', 'romCapacity',
                'romLeftRation', 'fontSize', 'os']].copy()

# city / prodName: keep the integer that follows the last 'c' / 'p' in the
# value; missing values become -1.
feature['city'] = data['city'].fillna(-1).progress_apply(
    lambda row: int(str(row).split('c')[-1]))
feature['prodName'] = data['prodName'].fillna(-1).progress_apply(
    lambda row: int(str(row).split('p')[-1]))

# Raw colour name: integer label plus the length of the name.
feature['color'] = LabelEncoder().fit_transform(data['color'])
feature['color_length'] = data['color'].progress_apply(len)
def get_color(row):
    """Reduce a colour name to a form that ends in '色'.

    * Names not ending in '色' are reduced to their last character plus '色'.
    * Three-character names ending in '色' lose their first character.
    * Any other name ending in '色' is returned unchanged.
    """
    last_char = row[-1]
    if last_char != '色':
        return last_char + '色'
    return row[1:] if len(row) == 3 else row
# Normalised colour: shorten each name with get_color, then fold a few of the
# resulting values into a common colour.
color_aliases = {
    '母色': '光色',
    '境色': '光色',
    '版色': '光色',
    '槟色': '橘色',
    '翠色': '绿色',
    '蝶色': '粉色',
}
data['color_deal'] = data['color'].progress_apply(get_color)
data['color_deal'] = data['color_deal'].replace(color_aliases)
feature['color_last'] = LabelEncoder().fit_transform(data['color_deal'])

# Remaining categoricals; a missing 'ct' gets its own '无' category.
ct_filled = data['ct'].fillna('无')
feature['ct'] = LabelEncoder().fit_transform(ct_filled)
feature['carrier'] = LabelEncoder().fit_transform(data['carrier'])

# OS version: raw value plus the parts before the first and after the last
# '.'; missing values are treated as -1 before splitting.
feature['os'] = data['os']
os_filled = data['os'].fillna(-1)
feature['os_1'] = os_filled.progress_apply(lambda row: int(str(row).split('.')[0]))
feature['os_2'] = os_filled.progress_apply(lambda row: int(str(row).split('.')[-1]))

# Replace each value by the rank of its frequency, divided by the number of
# distinct values in the column.
for col in ['city', 'prodName', 'color', 'color_last', 'os', 'ct']:
    value_rank = feature[col].value_counts().rank()
    n_distinct = len(feature[col].unique())
    feature[col] = feature[col].map(value_rank / n_distinct)

feature.to_csv('feature/f2.csv', index=False)
feature

100%|██████████| 5000000/5000000 [00:06<00:00, 729419.23it/s]
100%|██████████| 5000000/5000000 [00:06<00:00, 728946.94it/s]
100%|██████████| 5000000/5000000 [00:04<00:00, 1138312.35it/s]
100%|██████████| 5000000/5000000 [00:05<00:00, 888918.69it/s]
100%|██████████| 5000000/5000000 [00:07<00:00, 661803.69it/s]
100%|██████████| 5000000/5000000 [00:07<00:00, 637683.47it/s]


Unnamed: 0,gender,ramCapacity,ramLeftRation,romCapacity,romLeftRation,fontSize,os,city,prodName,color,color_length,color_last,ct,carrier,os_1,os_2
0,1,3.0,,34.0,0.89,1.30001,0.944444,0.776163,0.807018,0.991453,3,1.000000,1.000000,0,9,1
1,0,4.0,0.29,68.0,0.64,1.30001,0.888889,1.000000,0.701754,0.914530,2,1.000000,0.833333,2,8,0
2,1,4.0,0.24,64.0,0.34,1.00000,0.888889,0.912791,0.956140,1.000000,3,1.000000,1.000000,1,8,0
3,0,4.0,0.26,137.0,0.67,1.00000,0.944444,0.950581,0.947368,0.957265,3,0.888889,0.833333,0,9,1
4,0,3.0,0.45,34.0,0.49,,0.888889,0.465116,0.473684,0.965812,2,0.944444,0.500000,0,8,0
5,1,6.0,0.36,128.0,0.77,1.30001,0.944444,0.950581,1.000000,0.905983,3,0.944444,0.833333,2,9,1
6,0,4.0,0.32,68.0,0.71,1.30001,0.888889,0.648256,0.692982,1.000000,3,1.000000,1.000000,1,8,0
7,0,2.0,0.44,16.0,0.54,1.15000,0.777778,0.382267,0.403509,0.760684,3,0.666667,0.833333,0,6,0
8,0,2.0,0.53,16.0,,,0.777778,0.869186,0.657895,0.965812,2,0.944444,1.000000,0,6,0
9,0,6.0,,128.0,0.84,1.15000,0.944444,0.296512,0.491228,0.555556,3,0.888889,1.000000,1,9,1


In [5]:
def _count_activated_apps(app_ids):
    """Number of '#'-separated app ids in one record; 0 when the record is missing."""
    if pd.isna(app_ids):
        return 0
    return len(str(app_ids).split('#'))


# All user ids (train rows first, then test rows) with their age label.
id_label_data = get_age_data()
# Register progress_apply. The bar description has to be passed by keyword;
# a positional string is not used as the description.
tqdm.pandas(desc='获取特征')

# Activated-app feature: how many apps each user has activated.
data = get_user_app_actived()
data = pd.merge(id_label_data, data, on='uId', how='left')

# Users without an activation record come out of the left merge as NaN and
# count as 0 apps (str(NaN).split('#') would have counted them as 1).
feature = pd.DataFrame(
    {'active_len': data['appId'].progress_apply(_count_activated_apps)})

# Make sure the output directory exists so the cell works on a fresh checkout.
os.makedirs('feature', exist_ok=True)
feature.to_csv('feature/f3.csv', index=False)
feature.head(10)

100%|██████████| 5000000/5000000 [00:12<00:00, 403278.34it/s]


Unnamed: 0,active_len
0,6
1,21
2,16
3,45
4,18
5,26
6,12
7,12
8,9
9,33


In [None]:
# f4: feature set from the large usage table, based on usage duration ("period")
from sklearn.preprocessing import LabelEncoder


def _usage_period_features(ids, usage):
    """Build one row of usage statistics per device, in the row order of ``ids``.

    Args:
        ids: DataFrame with a 'device_id' column (one chunk of users).
        usage: DataFrame with the columns 'device_id', 'app', 'peroid',
            'times' and 'start'.

    Returns:
        DataFrame without the id column, holding in this order:
        app_len_std/mean/max (distinct apps per day), date_sum/mean/std/max
        (daily 'peroid' totals), w0..w6 ('peroid' total per weekday) and
        app / peroid (the app with the largest 'peroid' total and that total).
    """
    packtime = pd.merge(ids, usage, on='device_id', how='left')
    # Devices without usage rows are kept as a single zero-filled record
    # (app '0', start parsed from 0) so they still receive a feature row.
    packtime = packtime.fillna(0)
    packtime['app'] = packtime['app'].astype(str)
    packtime['start'] = pd.to_datetime(packtime['start'])
    # Midnight-normalised timestamps group exactly like calendar dates.
    packtime['date'] = packtime['start'].dt.normalize()
    packtime['dayofweek'] = packtime['start'].dt.dayofweek

    # Number of distinct apps per device and day, then its spread per device.
    dapp = (packtime[['device_id', 'date', 'app']]
            .drop_duplicates()
            .groupby(['device_id', 'date'])
            .size()
            .reset_index(name='app_len'))
    # List-style agg: the dict-renaming form is rejected by pandas >= 1.0.
    dapp_stat = dapp.groupby('device_id')['app_len'].agg(
        ['std', 'mean', 'max']).reset_index()
    dapp_stat.columns = ['device_id', 'app_len_std', 'app_len_mean', 'app_len_max']

    # Daily usage totals, summarised per device.
    dtime = packtime.groupby(['device_id', 'date'])['peroid'].sum().reset_index()
    dtime_stat = dtime.groupby('device_id')['peroid'].agg(
        ['sum', 'mean', 'std', 'max']).reset_index()
    dtime_stat.columns = ['device_id', 'date_sum',
                          'date_mean', 'date_std', 'date_max']

    # Usage total per weekday; reindex guarantees all seven columns even when
    # a weekday never occurs in this chunk.
    wtime = packtime.groupby(['device_id', 'dayofweek'])['peroid'].sum().reset_index()
    weektime = (wtime.pivot(index='device_id', columns='dayofweek', values='peroid')
                .reindex(columns=range(7))
                .fillna(0))
    weektime.columns = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6']
    weektime = weektime.reset_index()

    # App with the largest usage total per device (idxmax yields index labels,
    # hence .loc).
    atime = packtime.groupby(['device_id', 'app'])['peroid'].sum().reset_index()
    top_app = atime.loc[atime.groupby('device_id')['peroid'].idxmax()]

    user = pd.merge(dapp_stat, dtime_stat, on='device_id', how='left')
    user = pd.merge(user, weektime, on='device_id', how='left')
    user = pd.merge(user, top_app, on='device_id', how='left')
    # groupby sorted the rows by device_id; put them back into the order of
    # `ids` because the id column is dropped and rows are matched by position.
    user = pd.merge(ids[['device_id']], user, on='device_id', how='left')
    return user.drop(columns='device_id')


packtime_all = get_user_app_usage(less_data=False)
# The duration column is analysed under the name 'peroid' (spelling kept: it
# is also a column name written to f4.csv).
packtime_all.columns = ['device_id', 'app', 'peroid', 'times', 'start']

train_data = pd.read_csv(path + 'age_train.csv', header=None)
test_data = pd.read_csv(path + 'age_test.csv', header=None)
del train_data[1]
train_data.columns = ['device_id']
test_data.columns = ['device_id']

# Process the ids in slices to bound memory: three slices of 500000 train
# rows, the remaining train rows, then all test rows.
train_starts = [0, 500000, 1000000, 1500000]
train_stops = train_starts[1:] + [None]
id_chunks = [train_data[start:stop] for start, stop in zip(train_starts, train_stops)]
id_chunks.append(test_data)

df_value = []
for chunk in tqdm([c for c in id_chunks if len(c)]):
    df_value.append(_usage_period_features(chunk, packtime_all))
    gc.collect()
del packtime_all

feature = pd.concat(df_value, axis=0, sort=False, ignore_index=True)
feature['app'] = LabelEncoder().fit_transform(feature['app'])
# Make sure the output directory exists so the cell works on a fresh checkout.
os.makedirs('feature', exist_ok=True)
feature.to_csv('feature/f4.csv', index=False)
feature.head(10)

 60%|██████    | 3/5 [22:23<14:42, 441.09s/it]

In [None]:
# f5: same statistics as the usage-duration feature set, computed on the
# usage count ("times") column instead
from sklearn.preprocessing import LabelEncoder


def _usage_times_features(ids, usage):
    """Build one row of usage statistics per device, in the row order of ``ids``.

    Args:
        ids: DataFrame with a 'device_id' column (one chunk of users).
        usage: DataFrame with the columns 'device_id', 'app', 'abcd',
            'peroid' and 'start'; 'peroid' is the value being aggregated.

    Returns:
        DataFrame without the id column, holding in this order:
        date_sum/mean/std/max (daily 'peroid' totals), w0..w6 ('peroid' total
        per weekday) and app / peroid (the app with the largest 'peroid'
        total and that total).
    """
    packtime = pd.merge(ids, usage, on='device_id', how='left')
    # Devices without usage rows are kept as a single zero-filled record
    # (app '0', start parsed from 0) so they still receive a feature row.
    packtime = packtime.fillna(0)
    packtime['app'] = packtime['app'].astype(str)
    packtime['start'] = pd.to_datetime(packtime['start'])
    # Midnight-normalised timestamps group exactly like calendar dates.
    packtime['date'] = packtime['start'].dt.normalize()
    packtime['dayofweek'] = packtime['start'].dt.dayofweek

    # Daily totals, summarised per device. List-style agg: the dict-renaming
    # form is rejected by pandas >= 1.0.
    dtime = packtime.groupby(['device_id', 'date'])['peroid'].sum().reset_index()
    dtime_stat = dtime.groupby('device_id')['peroid'].agg(
        ['sum', 'mean', 'std', 'max']).reset_index()
    dtime_stat.columns = ['device_id', 'date_sum',
                          'date_mean', 'date_std', 'date_max']

    # Total per weekday; reindex guarantees all seven columns even when a
    # weekday never occurs in this chunk.
    wtime = packtime.groupby(['device_id', 'dayofweek'])['peroid'].sum().reset_index()
    weektime = (wtime.pivot(index='device_id', columns='dayofweek', values='peroid')
                .reindex(columns=range(7))
                .fillna(0))
    weektime.columns = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6']
    weektime = weektime.reset_index()

    # App with the largest total per device (idxmax yields index labels,
    # hence .loc).
    atime = packtime.groupby(['device_id', 'app'])['peroid'].sum().reset_index()
    top_app = atime.loc[atime.groupby('device_id')['peroid'].idxmax()]

    # The per-day app-count statistics are not part of this feature set, so
    # they are not computed here.
    user = pd.merge(dtime_stat, weektime, on='device_id', how='left')
    user = pd.merge(user, top_app, on='device_id', how='left')
    # groupby sorted the rows by device_id; put them back into the order of
    # `ids` because the id column is dropped and rows are matched by position.
    user = pd.merge(ids[['device_id']], user, on='device_id', how='left')
    return user.drop(columns='device_id')


packtime_all = get_user_app_usage(less_data=False)
# Here the usage-count column (4th) takes the name 'peroid' that the
# aggregation works on; the duration column (3rd) is parked under 'abcd'.
packtime_all.columns = ['device_id', 'app', 'abcd', 'peroid', 'start']

train_data = pd.read_csv(path + 'age_train.csv', header=None)
test_data = pd.read_csv(path + 'age_test.csv', header=None)
del train_data[1]
train_data.columns = ['device_id']
test_data.columns = ['device_id']

# Process the ids in slices to bound memory: three slices of 500000 train
# rows, the remaining train rows, then all test rows.
train_starts = [0, 500000, 1000000, 1500000]
train_stops = train_starts[1:] + [None]
id_chunks = [train_data[start:stop] for start, stop in zip(train_starts, train_stops)]
id_chunks.append(test_data)

df_value = []
for chunk in tqdm([c for c in id_chunks if len(c)]):
    df_value.append(_usage_times_features(chunk, packtime_all))
    gc.collect()
del packtime_all

feature = pd.concat(df_value, axis=0, sort=False, ignore_index=True)
feature['app'] = LabelEncoder().fit_transform(feature['app'])
# Suffix every column so the names differ from the duration-based feature set.
feature.columns = [column + '_times' for column in feature.columns]
# Make sure the output directory exists so the cell works on a fresh checkout.
os.makedirs('feature', exist_ok=True)
feature.to_csv('feature/f5.csv', index=False)