In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

import gc
import time
import os
import sys
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation warnings, so removed APIs
# only surface later as hard errors.
warnings.filterwarnings('ignore')

# Directory holding the pre-computed per-user pickles.
pickle_path = "../pickle"

# Column names are passed explicitly, so the first CSV row is read as data.
# Both frames are ordered by uid.
train = pd.read_csv("../data/age_train.csv",names=['uid','age_group']).sort_values(by=['uid'])
test = pd.read_csv("../data/age_test.csv",names=['uid']).sort_values(by=['uid'])
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
active = pd.read_pickle("{}/user_app_active.pickle".format(pickle_path))
usage_appid_seq = pd.read_pickle("{}/user_app_seq.pickle".format(pickle_path))

# Stack train and test into one uid-sorted frame; the test rows carry NaN in
# age_group. pd.concat is used because DataFrame.append was removed in pandas 2.0.
all_data = pd.concat([train, test])
all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)
print(all_data.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.linear_model import LogisticRegression,BayesianRidge,SGDClassifier,PassiveAggressiveClassifier,RidgeClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC,NuSVC,SVC
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold,TimeSeriesSplit
from scipy import sparse
import xgboost as xgb
import lightgbm as lgb
import catboost as cbt

def _predict_class_scores(model, X):
    """Return the per-class score matrix of a fitted ``model`` for ``X``.

    Uses the estimator's ``_predict_proba_lr`` helper when it has one and
    ``predict_proba`` otherwise. Only the *absence* of the helper triggers the
    fallback; genuine prediction errors propagate instead of being swallowed.
    """
    scorer = getattr(model, "_predict_proba_lr", None)
    if scorer is None:
        scorer = model.predict_proba
    return scorer(X)


def get_sklearn_embedding(now,n_splits=5,ngram=1,prefix=None):
    """Build out-of-fold class-score features from TF-IDF + count vectors.

    The app-id lists in ``now`` are joined into space-separated documents,
    right-merged onto the module-level ``all_data`` frame, vectorised with
    ``TfidfVectorizer`` and ``CountVectorizer``, and fed to a fixed set of
    linear / naive-Bayes classifiers. Each classifier contributes
    ``num_classes`` columns: out-of-fold scores for labelled rows and
    fold-averaged scores for unlabelled rows.

    Results are cached under ``../pickle``: one pickle per classifier plus one
    for the combined frame. If the combined pickle exists it is returned
    without any computation.

    Parameters
    ----------
    now : pandas.DataFrame
        Must contain ``uid`` and ``appid``; each ``appid`` value is an
        iterable of strings (it is passed to ``" ".join``).
    n_splits : int
        Number of KFold splits (contiguous, unshuffled folds).
    ngram : int
        Upper bound of the n-gram range given to both vectorizers.
    prefix : str
        Tag embedded in every cache file name. NOTE(review): the default
        ``None`` is formatted literally as "None" in the file names.

    Returns
    -------
    pandas.DataFrame
        One row per uid, sorted by ``uid``, holding the ``<MODEL>_<k>`` score
        columns of every classifier plus ``uid``.

    Notes
    -----
    Reads the module-level names ``all_data`` (needs ``uid`` and
    ``age_group``) and ``num_classes`` at call time.
    """
    cache_dir = "../pickle"
    combined_path = "{}/{}_tfidf_count_emb_all.pickle".format(cache_dir, prefix)
    if os.path.exists(combined_path):
        return pd.read_pickle(combined_path)

    df = now.copy()
    df['appid'] = df['appid'].map(lambda x:" ".join(x))
    # Right merge keeps every uid of all_data; uids without an app list get NaN.
    df = df.merge(all_data,how='right',on='uid')
    print(df.head())

    docs = df['appid'].fillna("##").values
    tf = TfidfVectorizer(ngram_range=(1,ngram)).fit_transform(docs)
    cv = CountVectorizer(ngram_range=(1,ngram)).fit_transform(docs)
    all_ = sparse.csr_matrix(sparse.hstack([tf, cv]))
    print("TFIDF & COUNT FINISHED...")

    # Labelled rows are the training set, unlabelled rows the test set.
    tr = df['age_group'].notnull()
    te = df['age_group'].isnull()
    y = df[tr]['age_group']-1
    X_train = all_[df[tr].index]
    X_test = all_[df[te].index]

    random_seed = 2019
    # The seed is handed to every estimator that accepts one, so reruns
    # produce the same scores.
    model_zoo = [SGDClassifier(n_jobs=10,verbose=1,random_state=random_seed),
                 SGDClassifier(loss='log',n_jobs=10,verbose=1,random_state=random_seed),
                 SGDClassifier(loss='modified_huber',n_jobs=10,verbose=1,random_state=random_seed),
                 PassiveAggressiveClassifier(n_jobs=10,verbose=1,random_state=random_seed),
                 LogisticRegression(C=10,random_state=random_seed),
                 RidgeClassifier(solver='lsqr',fit_intercept=False),
                 LinearSVC(verbose=1,max_iter=500,random_state=random_seed),
                 BernoulliNB(),MultinomialNB()]

    columns = ['SGD_HINGE','SGD_LOG','SGD_HUBER','PAC','LR','RIDGE','LSVC','BNB','MNB']

    oof = []
    for name, model in zip(columns, model_zoo):
        t1 = time.time()
        model_path = "{}/{}_TFIDF_COUNT_{}.pickle".format(cache_dir, prefix, name)
        if os.path.exists(model_path):
            tmp = pd.read_pickle(model_path)
        else:
            cv_pred_stack = np.zeros((X_train.shape[0],num_classes))
            test_pred_stack = np.zeros((X_test.shape[0],num_classes))
            # No random_state here: without shuffle=True it has no effect on
            # the folds, and scikit-learn >= 0.24 rejects the combination.
            skf = KFold(n_splits=n_splits)
            for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
                print(index,model)
                model.fit(X_train[train_index], y.iloc[train_index])
                y_val = _predict_class_scores(model, X_train[test_index])
                cv_pred_stack[test_index] = y_val
                print(y_val.shape)
                # Average the test-set scores over the folds.
                test_pred_stack += _predict_class_scores(model, X_test) / n_splits
            print(model,'score:',accuracy_score(y,np.argmax(cv_pred_stack,axis=1)))
            print(time.time()-t1)
            a = pd.DataFrame(cv_pred_stack).add_prefix(name+"_")
            a['uid'] = df[tr]['uid'].values
            b = pd.DataFrame(test_pred_stack).add_prefix(name+"_")
            b['uid'] = df[te]['uid'].values
            # pd.concat replaces DataFrame.append (removed in pandas 2.0).
            tmp = pd.concat([a, b]).sort_values(by=['uid']).reset_index(drop=True)
            tmp.to_pickle(model_path)

        oof.append(tmp)

    # Every per-model frame is uid-sorted with a fresh 0..n-1 index, so the
    # frames line up row by row. Keep 'uid' from the first frame only.
    df_agg = pd.concat([oof[0]] + [frame.drop(columns=['uid']) for frame in oof[1:]], axis=1)
    df_agg = df_agg.sort_values(by=['uid'],ascending=True)
    df_agg.to_pickle(combined_path)

    return df_agg

# Width of the score matrices allocated inside get_sklearn_embedding, which
# reads this module-level name when it is called.
num_classes = 6

# Run the same recipe on both app-list sources; `prefix` tags the cache files.
prob_active = get_sklearn_embedding(now=active, prefix='active', n_splits=5, ngram=1)
prob_usage = get_sklearn_embedding(now=usage_appid_seq, prefix='usage', n_splits=5, ngram=1)

       uid                                              appid  app_len  \
0  1000006  a001012 a001036 a001062 a001172 a001275 a00135...     47.0   
1  1000009  a001012 a001015 a001055 a001062 a00107 a001072...     73.0   
2  1000010  a001012 a001036 a001050 a001055 a001062 a00107...     96.0   
3  1000011  a001012 a001063 a002450 a003083 a00326 a003987...     21.0   
4  1000012  a001036 a001062 a001580 a001583 a003570 a00365...     33.0   

   age_group  
0        4.0  
1        4.0  
2        5.0  
3        NaN  
4        5.0  
TFIDF & COUNT FINISHED...
0
