In [1]:
import json
import pandas as pd
import os
from tqdm import *
import re
import numpy as np
import gc

In [2]:
# Directory holding the raw CSV files. The loader functions in this notebook build
# file names by plain string concatenation (path + '<name>.csv'), so the trailing
# slash is required.
path = 'data/'

In [3]:
# Data loading helpers (reading is slow and could be sped up).
def get_age_data():
    """Load the train and test user lists as a single frame.

    Reads the header-less files ``age_train.csv`` and ``age_test.csv`` from
    ``path``, stacks the train rows above the test rows and replaces every
    missing value with ``-1``. Later cells in this notebook treat
    ``age_group == -1`` as "test row".

    Returns:
        DataFrame with the columns ``uId`` and ``age_group``.
    """
    frames = [
        pd.read_csv(path + file_name, header=None)
        for file_name in ('age_train.csv', 'age_test.csv')
    ]
    stacked = pd.concat(frames, axis=0, sort=False)
    stacked = stacked.fillna(-1)
    stacked.columns = ['uId', 'age_group']
    return stacked

def get_user_app_actived():
    """Load ``user_app_actived.csv`` (no header row) from ``path``.

    Returns:
        DataFrame with the columns ``uId`` and ``appId``.
    """
    column_names = ['uId', 'appId']
    activated = pd.read_csv(path + 'user_app_actived.csv', header=None)
    activated.columns = column_names
    return activated

def get_user_behavior_info():
    """Load ``user_behavior_info.csv`` (no header row) from ``path``.

    Returns:
        DataFrame whose nine columns are named ``uId``, ``bootTimes``,
        ``AFuncTimes`` ... ``FFuncTimes`` and ``FFuncSum``.
    """
    column_names = ['uId', 'bootTimes']
    column_names += [letter + 'FuncTimes' for letter in 'ABCDEF']
    column_names.append('FFuncSum')
    behavior = pd.read_csv(path + 'user_behavior_info.csv', header=None)
    behavior.columns = column_names
    return behavior

def get_user_basic_info():
    """Load ``user_basic_info.csv`` (no header row) from ``path``.

    Returns:
        DataFrame with thirteen named columns: the user id followed by the
        gender, city, product, RAM/ROM, colour, font size, ``ct``, carrier
        and OS fields, in file order.
    """
    column_names = [
        'uId', 'gender', 'city', 'prodName',
        'ramCapacity', 'ramLeftRation',
        'romCapacity', 'romLeftRation',
        'color', 'fontSize', 'ct', 'carrier', 'os',
    ]
    basic = pd.read_csv(path + 'user_basic_info.csv', header=None)
    basic.columns = column_names
    return basic

def get_app_info():
    """Load ``app_info.csv`` (no header row) from ``path``.

    Returns:
        DataFrame with the columns ``appId`` and ``category``.
    """
    column_names = ['appId', 'category']
    apps = pd.read_csv(path + 'app_info.csv', header=None)
    apps.columns = column_names
    return apps

# Pass less_data=True while testing the pipeline;
# use the default (False) when extracting features for real.
def get_user_app_usage(less_data=False):
    """Load ``user_app_usage.csv`` (no header row) from ``path``.

    Args:
        less_data: when True, only the first 2,000,000 rows are read, which
            is enough for a quick dry run; when False the whole file is read.

    Returns:
        DataFrame with the columns ``uId``, ``appId``, ``duration``,
        ``times`` and ``use_date``.
    """
    # header=None is required in BOTH modes: the file has no header line, and
    # without it pandas would consume the first data row as column names (the
    # previous sampled branch silently lost that row). Using nrows instead of
    # a chunk iterator also avoids leaving a half-consumed reader open.
    row_limit = 2000000 if less_data else None
    data = pd.read_csv(path + 'user_app_usage.csv', header=None, nrows=row_limit)
    data.columns = ['uId', 'appId', 'duration', 'times', 'use_date']
    return data

In [None]:
# f1: stacking features from TF-IDF over each user's activated-app list
# (vectorizer left at its default settings).
id_label_data = get_age_data()
# The description has to be passed by keyword: older tqdm releases treat a
# positional argument as the iterable (so the text never shows up on the bar),
# and current releases accept keyword arguments only and raise TypeError.
tqdm.pandas(desc='获取特征')
# Attach the activated-app list to every train/test user id.
data = get_user_app_actived()
data = pd.merge(id_label_data, data, on='uId', how='left')
# Users without a row in the activation table get a placeholder string.
data = data.fillna('无')
# Turn the '#' separators into spaces so the vectorizer sees one token per app id.
data['appId'] = data['appId'].progress_apply(lambda row: str(row).replace('#', ' '))
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.metrics import roc_auc_score
# Rows carrying the -1 sentinel label form the test set; everything else is training data.
labelled_mask = data['age_group'] != -1
df_train = data[labelled_mask]
df_test = data[~labelled_mask]

############################ Assemble the corpus ############################
# Train rows first, test rows second, so the matrix can be split by position below.
data = pd.concat([df_train, df_test], axis=0, sort=False)
data['appId'] = data['appId'].apply(str)

############################ tf-idf ############################
print('开始计算tf-idf特征')
tf = TfidfVectorizer(ngram_range=(1, 1))
discuss_tf = tf.fit_transform(data['appId'])
discuss_tf = discuss_tf.tocsr()
print('计算结束')

############################ Split the matrix ##########################
print('开始进行一些前期处理')
n_train = len(df_train)
train_feature, test_feature = discuss_tf[:n_train], discuss_tf[n_train:]
# Shift the labels down by one so that they start at 0.
score = df_train['age_group'].sub(1)
print('处理完毕')

######################### Model helper (returns sklearn stacking features) ########################
def get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, class_number, n_folds, train_num, test_num, feature_prefix='tfidf_ori_1_1_'):
    """Build out-of-fold class-probability features for one classifier.

    The training rows are split into ``n_folds`` stratified folds. For every
    fold ``clf`` is re-fitted on the remaining folds; its probabilities for
    the held-out rows are stored in the matching rows of the train block, and
    its probabilities for the whole test set are accumulated and finally
    averaged over the folds.

    Args:
        clf: estimator with ``fit`` and either ``_predict_proba_lr`` (present
            on the linear models used in this notebook) or ``predict_proba``.
        train_feature: feature matrix of the labelled rows; must support
            indexing with an integer index array.
        test_feature: feature matrix of the unlabelled rows.
        score: labels, aligned row by row with ``train_feature``. Fold
            accuracy compares them with ``argmax`` column positions, so they
            are expected to be 0-based class ids.
        model_name: used in the log lines and in the output column names.
        class_number: number of probability columns to allocate.
        n_folds: number of cross-validation folds.
        train_num: number of rows in ``train_feature``.
        test_num: number of rows in ``test_feature``.
        feature_prefix: text put in front of ``model_name`` in every output
            column name.

    Returns:
        ``(df_stack, score_mean)``: ``df_stack`` has ``train_num + test_num``
        rows (train block first) and one column per class; ``score_mean`` is
        the list of per-fold validation accuracies rounded to 5 decimals.
    """
    print('\n****开始跑', model_name, '****')
    # Work on a plain array: the fold indices are positions, while indexing a
    # pandas Series with them would be label-based and break (or silently pick
    # the wrong rows) whenever the Series index is not exactly 0..n-1.
    labels = np.asarray(score)
    # Linear models expose _predict_proba_lr; fall back to the public
    # predict_proba for estimators that do not have it.
    predict_proba = getattr(clf, '_predict_proba_lr', None)
    if predict_proba is None:
        predict_proba = clf.predict_proba
    stack_train = np.zeros((train_num, class_number))
    stack_test = np.zeros((test_num, class_number))
    score_mean = []
    # No random_state here: without shuffle=True it has no effect, and newer
    # scikit-learn releases reject the combination. The folds stay
    # deterministic and unshuffled, as before.
    skf = StratifiedKFold(n_splits=n_folds)
    for fold, (tr, va) in enumerate(skf.split(train_feature, labels)):
        clf.fit(train_feature[tr], labels[tr])
        score_va = predict_proba(train_feature[va])
        score_te = predict_proba(test_feature)
        # Reuse the validation probabilities instead of predicting a second time.
        score_single = accuracy_score(labels[va], np.argmax(score_va, axis=1))
        score_mean.append(np.around(score_single, 5))
        print(f'{fold+1}/{n_folds}', score_single)
        stack_train[va] += score_va
        stack_test += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])
    column_names = [
        feature_prefix + model_name + '_classfiy_{}'.format(k)
        for k in range(stack.shape[1])
    ]
    df_stack = pd.DataFrame(stack, columns=column_names)
    print(model_name, '处理完毕')
    return df_stack, score_mean

# [display name, estimator] pairs; every estimator is run through the stacking helper.
# NOTE(review): loss='log' is the SGDClassifier spelling used by older scikit-learn
# releases; newer releases call it 'log_loss' - confirm against the installed version.
model_list = [
    ['LogisticRegression', LogisticRegression(random_state=1017, C=3)],
    ['SGDClassifier', SGDClassifier(random_state=1017, loss='log')],
    ['PassiveAggressiveClassifier', PassiveAggressiveClassifier(random_state=1017, C=2)],
    ['RidgeClassfiy', RidgeClassifier(random_state=1017)],
    ['LinearSVC', LinearSVC(random_state=1017)]
]

# Create the output directory up front so a missing folder is caught before
# the long training loop rather than at the final to_csv call.
os.makedirs('lgb_feature', exist_ok=True)

# 6 classes, 5 folds. Collect the per-model frames and concatenate them once
# instead of re-copying the growing frame after every model.
stack_frames = []
for model_name, clf in model_list:
    stack_result, score_mean = get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, 6, 5, len(df_train), len(df_test))
    stack_frames.append(stack_result)
    print('五折结果', score_mean)
    print('平均结果', np.mean(score_mean))
feature = pd.concat(stack_frames, axis=1, sort=False)
feature.to_csv('lgb_feature/f1.csv', index=False)

100%|██████████| 5000000/5000000 [00:12<00:00, 398634.17it/s]


开始计算tf-idf特征
计算结束
开始进行一些前期处理
处理完毕

****开始跑 LogisticRegression ****
1/5 0.55787625
2/5 0.55755
3/5 0.5568125
4/5 0.5575725
5/5 0.55856
LogisticRegression 处理完毕
五折结果 [0.55788, 0.55755, 0.55681, 0.55757, 0.55856]
平均结果 0.557674

****开始跑 SGDClassifier ****
1/5 0.5381175
2/5 0.53828625
3/5 0.53728625
4/5 0.537975
5/5 0.539605
SGDClassifier 处理完毕
五折结果 [0.53812, 0.53829, 0.53729, 0.53798, 0.5396]
平均结果 0.5382560000000001

****开始跑 PassiveAggressiveClassifier ****
1/5 0.44627375
2/5 0.445645
3/5 0.44823625
4/5 0.44563
5/5 0.44951
PassiveAggressiveClassifier 处理完毕
五折结果 [0.44627, 0.44564, 0.44824, 0.44563, 0.44951]
平均结果 0.447058

****开始跑 RidgeClassfiy ****
1/5 0.542295
2/5 0.5423875
3/5 0.54163875
4/5 0.54186375


In [None]:
# f2: stacking features from TF-IDF over each user's activated-app list
# (vectorizer tuned with min_df / max_df / sublinear_tf further down in this cell).
id_label_data = get_age_data()
# The description has to be passed by keyword: older tqdm releases treat a
# positional argument as the iterable (so the text never shows up on the bar),
# and current releases accept keyword arguments only and raise TypeError.
tqdm.pandas(desc='获取特征')
# Attach the activated-app list to every train/test user id.
data = get_user_app_actived()
data = pd.merge(id_label_data, data, on='uId', how='left')
# Users without a row in the activation table get a placeholder string.
data = data.fillna('无')
# Turn the '#' separators into spaces so the vectorizer sees one token per app id.
data['appId'] = data['appId'].progress_apply(lambda row: str(row).replace('#', ' '))
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.metrics import roc_auc_score
# Rows carrying the -1 sentinel label form the test set; everything else is training data.
df_train = data[data['age_group'] != -1]
df_test = data[data['age_group']==-1]

############################ Assemble the corpus ############################
# Train rows first, test rows second, so the matrix can be split by position below.
data = pd.concat([df_train, df_test], axis=0, sort=False)
data['appId'] = data['appId'].apply(lambda row:str(row))

############################ tf-idf ############################
print('开始计算tf-idf特征')
# The three switches are real booleans: newer scikit-learn releases validate
# constructor arguments and reject the integer 1 for boolean parameters.
tf = TfidfVectorizer(ngram_range=(1, 1), min_df=3, max_df=0.9, use_idf=True, smooth_idf=True, sublinear_tf=True)
discuss_tf = tf.fit_transform(data['appId']).tocsr()
print('计算结束')

############################ Split the matrix ##########################
print('开始进行一些前期处理')
train_feature = discuss_tf[:len(df_train)]
# Shift the labels down by one so that they start at 0.
score = df_train['age_group'] - 1
test_feature = discuss_tf[len(df_train):]
print('处理完毕')

######################### Model helper (returns sklearn stacking features) ########################
def get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, class_number, n_folds, train_num, test_num, feature_prefix='tfidf_1_1_'):
    """Build out-of-fold class-probability features for one classifier.

    The training rows are split into ``n_folds`` stratified folds. For every
    fold ``clf`` is re-fitted on the remaining folds; its probabilities for
    the held-out rows are stored in the matching rows of the train block, and
    its probabilities for the whole test set are accumulated and finally
    averaged over the folds.

    Args:
        clf: estimator with ``fit`` and either ``_predict_proba_lr`` (present
            on the linear models used in this notebook) or ``predict_proba``.
        train_feature: feature matrix of the labelled rows; must support
            indexing with an integer index array.
        test_feature: feature matrix of the unlabelled rows.
        score: labels, aligned row by row with ``train_feature``. Fold
            accuracy compares them with ``argmax`` column positions, so they
            are expected to be 0-based class ids.
        model_name: used in the log lines and in the output column names.
        class_number: number of probability columns to allocate.
        n_folds: number of cross-validation folds.
        train_num: number of rows in ``train_feature``.
        test_num: number of rows in ``test_feature``.
        feature_prefix: text put in front of ``model_name`` in every output
            column name.

    Returns:
        ``(df_stack, score_mean)``: ``df_stack`` has ``train_num + test_num``
        rows (train block first) and one column per class; ``score_mean`` is
        the list of per-fold validation accuracies rounded to 5 decimals.
    """
    print('\n****开始跑', model_name, '****')
    # Work on a plain array: the fold indices are positions, while indexing a
    # pandas Series with them would be label-based and break (or silently pick
    # the wrong rows) whenever the Series index is not exactly 0..n-1.
    labels = np.asarray(score)
    # Linear models expose _predict_proba_lr; fall back to the public
    # predict_proba for estimators that do not have it.
    predict_proba = getattr(clf, '_predict_proba_lr', None)
    if predict_proba is None:
        predict_proba = clf.predict_proba
    stack_train = np.zeros((train_num, class_number))
    stack_test = np.zeros((test_num, class_number))
    score_mean = []
    # No random_state here: without shuffle=True it has no effect, and newer
    # scikit-learn releases reject the combination. The folds stay
    # deterministic and unshuffled, as before.
    skf = StratifiedKFold(n_splits=n_folds)
    for fold, (tr, va) in enumerate(skf.split(train_feature, labels)):
        clf.fit(train_feature[tr], labels[tr])
        score_va = predict_proba(train_feature[va])
        score_te = predict_proba(test_feature)
        # Reuse the validation probabilities instead of predicting a second time.
        score_single = accuracy_score(labels[va], np.argmax(score_va, axis=1))
        score_mean.append(np.around(score_single, 5))
        print(f'{fold+1}/{n_folds}', score_single)
        stack_train[va] += score_va
        stack_test += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])
    column_names = [
        feature_prefix + model_name + '_classfiy_{}'.format(k)
        for k in range(stack.shape[1])
    ]
    df_stack = pd.DataFrame(stack, columns=column_names)
    print(model_name, '处理完毕')
    return df_stack, score_mean

# [display name, estimator] pairs; every estimator is run through the stacking helper.
# NOTE(review): loss='log' is the SGDClassifier spelling used by older scikit-learn
# releases; newer releases call it 'log_loss' - confirm against the installed version.
model_list = [
    ['LogisticRegression', LogisticRegression(random_state=1017, C=3)],
    ['SGDClassifier', SGDClassifier(random_state=1017, loss='log')],
    ['PassiveAggressiveClassifier', PassiveAggressiveClassifier(random_state=1017, C=2)],
    ['RidgeClassfiy', RidgeClassifier(random_state=1017)],
    ['LinearSVC', LinearSVC(random_state=1017)]
]

# Create the output directory up front so a missing folder is caught before
# the long training loop rather than at the final to_csv call.
os.makedirs('lgb_feature', exist_ok=True)

# 6 classes, 5 folds. Collect the per-model frames and concatenate them once
# instead of re-copying the growing frame after every model.
stack_frames = []
for model_name, clf in model_list:
    stack_result, score_mean = get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, 6, 5, len(df_train), len(df_test))
    stack_frames.append(stack_result)
    print('五折结果', score_mean)
    print('平均结果', np.mean(score_mean))
feature = pd.concat(stack_frames, axis=1, sort=False)
feature.to_csv('lgb_feature/f2.csv', index=False)

In [None]:
# f3: the raw behaviour columns, passed through unchanged.
id_label_data = get_age_data()
# Keyword form: a positional description is ignored by older tqdm releases
# and rejected with TypeError by current ones.
tqdm.pandas(desc='获取特征')
# Behaviour features, aligned to the train+test row order of id_label_data.
data = get_user_behavior_info()
data = pd.merge(id_label_data, data, on='uId', how='left')
import warnings
warnings.filterwarnings("ignore")
# Keep every column except the id and the label.
feature = data[[col for col in data.columns if col not in ['age_group', 'uId']]].copy()
# to_csv does not create missing directories.
os.makedirs('lgb_feature', exist_ok=True)
feature.to_csv('lgb_feature/f3.csv', index=False)

In [None]:
# f4: features derived from the basic user / device table.
id_label_data = get_age_data()
# Keyword form: a positional description is ignored by older tqdm releases
# and rejected with TypeError by current ones.
tqdm.pandas(desc='获取特征')
# Basic user features, aligned to the train+test row order of id_label_data.
data = get_user_basic_info()
data = pd.merge(id_label_data, data, on='uId', how='left')
import warnings
warnings.filterwarnings("ignore")
# .copy() makes `feature` an independent frame, so the column assignments
# below are not chained assignments on a slice of `data`.
feature = data[['gender', 'ramCapacity', 'ramLeftRation', 'romCapacity', 'romLeftRation', 'fontSize', 'os']].copy()
# city / prodName: keep the text after the last 'c' / 'p' as an integer; missing values become -1.
feature['city'] = data['city'].fillna(-1).progress_apply(lambda row:int(str(row).split('c')[-1]))
feature['prodName'] = data['prodName'].fillna(-1).progress_apply(lambda row:int(str(row).split('p')[-1]))
from sklearn.preprocessing import LabelEncoder
# Integer code of the full colour name, plus the length of that name.
feature['color'] = LabelEncoder().fit_transform(data['color'])
feature['color_length'] = data['color'].progress_apply(lambda row:len(row))
def get_color(row):
    """Reduce a colour name to a short form ending in '色'.

    * name not ending in '色': its last character followed by '色';
    * three-character name ending in '色': the last two characters;
    * any other name ending in '色': returned unchanged.
    """
    last_char = row[-1]
    if last_char != '色':
        return last_char + '色'
    return row[1:] if len(row) == 3 else row
# Short colour name (see get_color), then fold a few short names into others.
data['color_deal'] = data['color'].progress_apply(lambda row:get_color(row))
# One dict-based replace instead of six chained passes over the column; no
# replacement value is itself a key, so the result is the same.
data['color_deal'] = data['color_deal'].replace({
    '母色': '光色',
    '境色': '光色',
    '版色': '光色',
    '槟色': '橘色',
    '翠色': '绿色',
    '蝶色': '粉色',
})
feature['color_last'] = LabelEncoder().fit_transform(data['color_deal'])
feature['ct'] = LabelEncoder().fit_transform(data['ct'].fillna('无'))
feature['carrier'] = LabelEncoder().fit_transform(data['carrier'])
# feature['os'] already holds data['os'] from the initial column selection,
# so it is not assigned again here.
# os_1 / os_2: the text before the first '.' and after the last '.' of the
# OS value, as integers (missing values are filled with -1 first).
feature['os_1'] = data['os'].fillna(-1).progress_apply(lambda row:int(str(row).split('.')[0]))
feature['os_2'] = data['os'].fillna(-1).progress_apply(lambda row:int(str(row).split('.')[-1]))
# For each categorical column: rank of the value's frequency divided by the
# number of distinct values in that column.
for col in ['city', 'prodName', 'color', 'color_last']:
    value_counts = feature[col].value_counts()
    feature[col + 'value_rank'] = feature[col].map(value_counts.rank()/len(feature[col].unique()))
# to_csv does not create missing directories.
os.makedirs('lgb_feature', exist_ok=True)
feature.to_csv('lgb_feature/f4.csv', index=False)

In [None]:
# f5: copy an existing feature file into the lgb_feature directory.
# NOTE(review): the source is feature/f4.csv; no visible cell of this notebook
# writes to feature/ (the f4 cell writes lgb_feature/f4.csv) - confirm that this
# is the intended source file.
import shutil
import os
file = 'feature/f4.csv'
file_dir = 'lgb_feature/f5.csv'
# shutil.copy does not create missing parent directories.
os.makedirs(os.path.dirname(file_dir), exist_ok=True)
shutil.copy(file,file_dir)

In [None]:
# f6: copy an existing feature file into the lgb_feature directory.
# NOTE(review): the source is feature/f5.csv; no visible cell of this notebook
# writes to feature/ (the f5 cell writes lgb_feature/f5.csv) - confirm that this
# is the intended source file.
import shutil
import os
file = 'feature/f5.csv'
file_dir = 'lgb_feature/f6.csv'
# shutil.copy does not create missing parent directories.
os.makedirs(os.path.dirname(file_dir), exist_ok=True)
shutil.copy(file,file_dir)

In [10]:
# NOTE(review): leftover scratch expression (it only evaluates the literal 122);
# nothing else in the visible notebook depends on it, so it looks safe to delete.
122

122