In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Raw input tables. Every CSV is read with explicit `names`, so the files are
# treated as header-less and the listed names become the column labels.
# `train` carries the label (`age_group`); `test` only has user ids.
train = pd.read_csv("../data/age_train.csv",names=['uid','age_group']).sort_values(by=['uid'])
test = pd.read_csv("../data/age_test.csv",names=['uid']).sort_values(by=['uid'])
# Two-column table (appid, category); later turned into a lookup dict.
info = pd.read_csv("../data/app_info.csv",names=['appid','category'])
# NOTE(review): unpickling can execute arbitrary code -- only load pickles that
# were produced by a trusted step of this project.
active = pd.read_pickle("../pickle/user_app_active.pickle")
# usage = pd.read_pickle("../input2/user_app_usage.pickle")#,names=['uid','appid','duration','times','use_date'],parse_dates=['use_date'])
# Per-user tables, sorted by uid like train/test.
user_basic_info = pd.read_csv("../data/user_basic_info.csv",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])
behavior_info = pd.read_csv("../data/user_behavior_info.csv",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])
# (train.shape,test.shape),(info.shape,active.shape,usage.shape,user_basic_info.shape,behavior_info.shape)

In [3]:
# Stack the labelled and unlabelled users into one frame so that every feature
# is computed over both sets; test rows get NaN in `age_group`.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so use
# pd.concat. `ignore_index=True` replaces the former reset_index(drop=True) and
# `sort=True` keeps the (age_group, uid) column order shown in the recorded output.
all_data = pd.concat([train, test], ignore_index=True, sort=True)
all_data.head()

Unnamed: 0,age_group,uid
0,4.0,1000006
1,4.0,1000009
2,5.0,1000010
3,5.0,1000012
4,4.0,1000027


In [5]:
def get_category(x, mapping=None):
    """Translate an iterable of app ids into their categories.

    Parameters
    ----------
    x : iterable
        App ids to look up.
    mapping : dict, optional
        appid -> category lookup. When omitted, the module-level ``hash_dict``
        is used, which keeps existing one-argument calls working.

    Returns
    -------
    tuple of (list, int)
        The categories of the ids that were found, in input order, and the
        number of ids that could not be resolved.
    """
    if mapping is None:
        mapping = hash_dict
    col = []
    no_col = 0
    for i in x:
        try:
            col.append(mapping[i])
        except (KeyError, TypeError):
            # KeyError: id not present in the lookup.
            # TypeError: value is unhashable and therefore cannot be a key.
            # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
            no_col += 1
    return col, no_col

# appid -> category lookup built from the two columns of `info`.
hash_dict = dict(info.values)

# get_category returns (matched categories, number of unmatched ids) per row;
# park the pair in a helper column and split it into its two parts.
active['category'] = active['appid'].map(get_category)
active['category_nan'] = active['category'].map(lambda pair: pair[1])
active['category'] = active['category'].map(lambda pair: pair[0])

# Per-user statistics over the matched categories.
active['category_len'] = active['category'].map(len)
active['category_nunique'] = active['category'].map(lambda cats: len(set(cats)))
active['category_ratio'] = active['category_nunique'].div(active['category_len'])

# The list-valued helper column was only needed for the statistics above.
del active['category']

In [6]:
# Left-join each per-user table onto the train+test frame, keyed on uid.
for user_table in (user_basic_info, behavior_info, active):
    all_data = all_data.merge(user_table, how='left', on=['uid'])

# Every column except the id and the label is a candidate feature.
non_feature_cols = ('uid', 'age_group')
feature_name = [column for column in all_data.columns if column not in non_feature_cols]
all_data.shape, len(all_data)

((5000000, 28), 5000000)

In [None]:
# For each key column: number of rows (non-null uids) sharing the row's value.
count_keys = ['city', 'romleftration', 'prodname', 'color', 'ct', 'carrier']
for key in count_keys:
    all_data[key + '_count_user'] = all_data.groupby([key])['uid'].transform('count')

# Same grouping, counting distinct uids instead. 'color' has a count feature
# but no nunique feature.
nunique_keys = ['city', 'romleftration', 'prodname', 'ct', 'carrier']
for key in nunique_keys:
    all_data[key + '_nunique_user'] = all_data.groupby([key])['uid'].transform('nunique')

In [9]:
# Preview the first rows of the merged frame, including the *_count_user and
# *_nunique_user columns.
all_data.head()

Unnamed: 0,age_group,uid,gender,city,prodname,ramcapacity,ramleftration,romcapacity,romleftration,color,...,romleftration_count_user,prodname_count_user,color_count_user,ct_count_user,carrier_count_user,city_nunique_user,romleftration_nunique_user,prodname_nunique_user,ct_nunique_user,carrier_nunique_user
0,4.0,1000006,1,c00253,p0054,8.0,,128.0,,翡冷翠,...,,71149,53644,2514879.0,2463517,77071.0,,71149,2514879.0,2463517
1,4.0,1000009,0,c0043,p0018,8.0,0.22,256.0,0.49,渐变黑,...,54080.0,22495,13283,2514879.0,2463517,213066.0,54080.0,22495,2514879.0,2463517
2,5.0,1000010,0,c00284,p0054,8.0,0.38,128.0,0.04,翡冷翠,...,49470.0,71149,53644,2514879.0,2463517,4897.0,49470.0,71149,2514879.0,2463517
3,5.0,1000012,0,c0087,p0059,4.0,0.34,64.0,0.21,香槟金,...,48572.0,96397,306158,2077881.0,1126914,30551.0,48572.0,96397,2077881.0,1126914
4,4.0,1000027,0,c00206,p001,6.0,0.26,137.0,0.79,海鸥灰,...,53404.0,147835,58108,2077881.0,1126914,117881.0,53404.0,147835,2077881.0,1126914


In [10]:
# Split the columns into: all model features, categorical (object-dtype)
# columns, and the remaining numeric features.
feature_name = [i for i in all_data.columns if i not in ['uid','age_group']]
# `np.object` was only an alias of the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError. Comparing
# against `object` gives the same result on every NumPy version.
cat_col = [col for col in all_data.columns if all_data[col].dtype == object]
num_col = [i for i in feature_name  if i not in cat_col]

In [31]:
from scipy import sparse
from tqdm import tqdm

# Target column, kept as a one-element list.
label_name = ['age_group']

# 'appid' is fed to the text vectorizers; every other categorical column goes
# through the label-encoding loop.
vector_feature = ['appid']
onehot_feature = [column for column in cat_col if column not in vector_feature]

In [12]:
# Accumulator for the sparse training matrix; filled by the vectorizer cell.
tr = None

# Row labels of the train and test parts inside `all_data`: the train rows come
# first, the test rows follow.
n_train_rows = train.shape[0]
train_ix = list(range(n_train_rows))
test_ix = list(range(n_train_rows, all_data.shape[0]))

In [None]:
print('onehot...')
# NOTE(review): despite the message, the loop below label-encodes the columns;
# `enc` is created but not used in this cell.
enc = OneHotEncoder(handle_unknown='ignore')
for feature in tqdm(onehot_feature):
    lbl = LabelEncoder()
    # LabelEncoder expects a 1-D array of labels. The former reshape(-1, 1)
    # passed a column vector, which only worked through an implicit ravel plus
    # a DataConversionWarning (hidden by the global warnings filter).
    # astype('str') already turns missing values into the string 'nan', so the
    # fillna('0') that used to follow it never replaced anything and is dropped;
    # the resulting codes are unchanged.
    all_data[feature] = lbl.fit_transform(all_data[feature].astype('str').values)

In [22]:
from scipy import sparse

# Pre-computed sparse matrices loaded from disk. Judging by the file names they
# are presumably count and TF-IDF encodings of the active-app data -- TODO
# confirm against the script that wrote them.
c1 = sparse.load_npz("../vector/Sparse_Matrix/active_count.npz")
c2 = sparse.load_npz("../vector/Sparse_Matrix/active_tfidf.npz")

In [29]:
# Project both sparse matrices onto 100 dense components. One estimator is
# refit for each matrix in turn, so afterwards `svd` holds the fit on `c2`.
svd = TruncatedSVD(n_components=100, n_iter=10, random_state=2019)
c1_svd, c2_svd = [svd.fit_transform(matrix) for matrix in (c1, c2)]

In [32]:
print('cv...')
# Start from empty accumulators so that re-running this cell rebuilds the
# matrices instead of stacking the same blocks onto a previous result (the
# earlier version only worked when `tr` was still None from another cell).
tr = te = None
token_pattern = u"(?u)\\b\\w+\\b"

# TF-IDF blocks first, then raw count blocks, for every text-like feature.
for cv in (
    TfidfVectorizer(analyzer='word', token_pattern=token_pattern, min_df=2),
    CountVectorizer(analyzer='word', token_pattern=token_pattern, min_df=2),
):
    for feature in tqdm(vector_feature):
        # Convert to str once and reuse it for fit and both transforms.
        # astype('str') leaves no missing values behind, so the fillna("##")
        # that used to follow it never replaced anything and is dropped.
        text = all_data[feature].astype('str')
        cv.fit(text)  # vocabulary is learned on train + test rows together
        train_a = cv.transform(text.loc[train_ix])
        test_a = cv.transform(text.loc[test_ix])
        del text  # release the large string column before stacking
        if tr is None:
            tr, te = train_a, test_a
        else:
            tr = sparse.hstack((tr, train_a), 'csr')
            te = sparse.hstack((te, test_a), 'csr')
    print(tr.shape, te.shape)

feature_name = [i for i in all_data.columns if i not in ['uid','age_group']]
# Append the numeric columns to the sparse text blocks.
# NOTE(review): `num_col` leaves out every object-dtype column, so the
# label-encoded categorical columns (`onehot_feature`) never reach the model
# matrix as the cells are written -- confirm whether that is intended.
tr = sparse.hstack((tr,all_data.loc[train_ix][num_col]),'csr')
te = sparse.hstack((te,all_data.loc[test_ix][num_col]),'csr')
print(tr.shape,te.shape)



  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [06:23<00:00, 383.46s/it][A[A


(4000000, 19888) (1000000, 19888)
(4000000, 19919) (1000000, 19919)


In [36]:
import catboost as cbt
def acc_score(labels, preds):
    """Accuracy of the arg-max class of ``preds`` against ``labels``.

    Parameters
    ----------
    labels : array-like of shape (n_samples,)
        True class indices.
    preds : array-like of shape (n_samples, n_classes)
        Per-class scores; the predicted class of a row is the index of its
        largest score.

    Returns
    -------
    tuple
        ``('acc_score', score, True)``. The three-element layout presumably
        follows LightGBM's custom-metric convention (name, value,
        higher-is-better) -- confirm before using it as ``eval_metric``.
    """
    predicted = np.argmax(preds, axis=1)
    # The original call had y_true and y_pred swapped. Accuracy is symmetric,
    # so the value is the same, but the arguments now match their roles.
    score = accuracy_score(y_true=labels, y_pred=predicted)
    return 'acc_score', score, True

# Boolean frame marking the rows of `all_data` that carry a label.
tr_index = all_data[label_name].notnull()

# The stacked sparse matrices are used directly as model inputs.
X_train, X_test = tr, te

# Training labels, shifted down by one.
y = train['age_group'].values - 1

# Accumulators filled by the cross-validation loop.
final_pred, cv_score, cv_model = [], [], []

In [None]:
# Stratified 5-fold cross-validation with LightGBM. Each fold trains one model
# with early stopping on its held-out part, records the held-out accuracy and
# predicts class labels for the test matrix.
skf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=True)

# Reset the accumulators in this cell: without this, re-running the loop kept
# appending to lists created elsewhere, leaving stale models in `cv_model` and
# mixing old scores into the reported mean.
cv_score = []
cv_model = []
final_pred = []
fold_test_labels = []

for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
    print(index)
    lgb_model = lgb.LGBMClassifier(
        boosting_type="gbdt", num_leaves=128, reg_alpha=0.1, reg_lambda=1,
        max_depth=-1, n_estimators=3000, objective='multiclass',num_class=6,
        subsample=0.5, colsample_bytree=0.5, subsample_freq=1,min_child_samples=20,
        learning_rate=0.1, random_state=2019 + index, n_jobs=50, metric="multi_error", importance_type='gain'
    )
    train_x, test_x, train_y, test_y = X_train[train_index], X_train[test_index], y[train_index], y[test_index]
    eval_set = [(test_x, test_y)]
    # NOTE(review): newer LightGBM releases replace the `verbose` and
    # `early_stopping_rounds` fit() keywords with callbacks
    # (lgb.log_evaluation / lgb.early_stopping) -- adjust when upgrading.
    lgb_model.fit(train_x, train_y, eval_set=eval_set,verbose=10,early_stopping_rounds=100)
    # cbt_model = cbt.CatBoostClassifier(n_estimators=3000,learning_rate=0.1,max_depth=7,colsample_bytree=0.5,objective='multi_class',verbose=10,early_stopping_rounds=100,task_type='GPU',eval_metric='Accuracy')
    # cbt_model.fit(train_x, train_y,eval_set=[test_x,test_y])
    cv_model.append([lgb_model])
    y_test = lgb_model.predict(X_test)
    y_val = lgb_model.predict_proba(test_x)
    cv_score.append(acc_score(test_y,y_val)[1])
    # One column of predicted test labels per fold. Rebuilt after every fold so
    # an interrupted run still leaves the finished folds in `final_pred`.
    fold_test_labels.append(np.asarray(y_test))
    final_pred = np.column_stack(fold_test_labels)

print("LGBM : ",np.mean(cv_score))

0
Training until validation scores don't improve for 100 rounds.
[10]	valid_0's multi_error: 0.5097
[20]	valid_0's multi_error: 0.495596
[30]	valid_0's multi_error: 0.484979
[40]	valid_0's multi_error: 0.475964
[50]	valid_0's multi_error: 0.46824
[60]	valid_0's multi_error: 0.461814
[70]	valid_0's multi_error: 0.456118
[80]	valid_0's multi_error: 0.450941
[90]	valid_0's multi_error: 0.446739
[100]	valid_0's multi_error: 0.443093
[110]	valid_0's multi_error: 0.439866
[120]	valid_0's multi_error: 0.43687
[130]	valid_0's multi_error: 0.434149
[140]	valid_0's multi_error: 0.432026
[150]	valid_0's multi_error: 0.430082
[160]	valid_0's multi_error: 0.428454
[170]	valid_0's multi_error: 0.426683
[180]	valid_0's multi_error: 0.425089
[190]	valid_0's multi_error: 0.423586
[200]	valid_0's multi_error: 0.422318
[210]	valid_0's multi_error: 0.421269
[220]	valid_0's multi_error: 0.41996
[230]	valid_0's multi_error: 0.418861
[240]	valid_0's multi_error: 0.417895
[250]	valid_0's multi_error: 0.417161

In [72]:
# skf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=True)
# Rebuild out-of-fold probabilities for the training rows and fold-averaged
# probabilities for the test rows from the stored fold models. This relies on
# `skf` yielding the same folds as during training, which holds because the
# splitter was created with a fixed random_state.
n_folds = skf.get_n_splits()  # replaces the hard-coded 5 in the average
cv_pred_lgb = np.zeros((X_train.shape[0],6))
test_pred_lgb = np.zeros((X_test.shape[0],6))

for index, (_, test_index) in enumerate(skf.split(X_train, y)):
    print(index)
    fold_model = cv_model[index][0]
    # Only the held-out rows are needed here; the train part of the split is no
    # longer sliced out of the sparse matrix.
    test_x = X_train[test_index]
    y_val = fold_model.predict_proba(test_x)
    cv_pred_lgb[test_index] = y_val
    print(y_val[:5])
    test_pred_lgb += fold_model.predict_proba(X_test) / n_folds

0
[[0.00299943 0.00118632 0.18409089 0.69738263 0.10134583 0.01299489]
 [0.01881538 0.03866722 0.5839238  0.30704702 0.04991326 0.0016333 ]
 [0.03562736 0.00789534 0.14133247 0.44927837 0.27335493 0.09251153]
 [0.02075523 0.03929126 0.14327357 0.17981842 0.23081689 0.38604464]
 [0.12674667 0.81609541 0.03854113 0.00866026 0.00854215 0.00141439]]
1
[[0.00407134 0.00123642 0.02171078 0.27080535 0.59786133 0.10431478]
 [0.00192695 0.00849684 0.07856286 0.22905042 0.64224228 0.03972065]
 [0.0650815  0.04353347 0.11124417 0.07979386 0.22312841 0.47721859]
 [0.10625459 0.29177171 0.1913272  0.19076278 0.18418211 0.0357016 ]
 [0.422402   0.24891585 0.13363914 0.08023887 0.06219008 0.05261405]]
2
[[5.86002423e-03 2.98108478e-02 9.53930247e-02 1.33689212e-01
  4.33755096e-01 3.01491795e-01]
 [1.61726244e-01 8.21122060e-01 1.18322664e-02 2.52182633e-03
  2.37350385e-03 4.24099114e-04]
 [1.50777118e-01 1.43257880e-01 3.42157920e-02 4.02181710e-01
  1.69174384e-01 1.00393116e-01]
 [4.76134161e-01 

In [64]:
%%time
# Timing probe: how long the first fold model takes to score `test_x`.
# NOTE(review): `k` is not used by any other cell shown here.
k = cv_model[0][0].predict_proba(test_x)

CPU times: user 1h 37min 51s, sys: 57 s, total: 1h 38min 48s
Wall time: 1min 58s


In [75]:
# Wrap the probability matrices in frames (one column per class index) and
# append the matching user ids as the last column.
oof = pd.DataFrame(cv_pred_lgb)
oof.insert(oof.shape[1], 'uid', train['uid'].values)

pred = pd.DataFrame(test_pred_lgb)
pred.insert(pred.shape[1], 'uid', test['uid'].values)