In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
from scipy.stats import entropy

In [2]:
#连续特征  
def num_add(df,f1,f2):
    #add
    df[f'{f1}_add_{f2}'] = df[f1]+df[f2]
    return df

def num_mul(df,f1,f2):
    #mul
    df[f'{f1}_mul_{f2}'] = df[f1]*df[f2]
    return df

def num_div(df,f1,f2):
    #div
    df[f'{f1}_div_{f2}'] = df[f1]/(df[f2]+df[f2].mean())
    return df

def num_log(df,f):
    #log
    df[f'log_{f}'] = np.log(1+df[f])
    return df

def num_bin(df,f,num_bins=10,bin_type='cut'):
    #bin
    if bin_type=='cut': 
        #等距分箱
        df[f'{f}_bin_{num_bins}'] = pd.cut(df[f],num_bins,labels=False)
    else:
        #等频分箱
        df[f'{f}_bin_{num_bins}'] = pd.qcut(df[f],num_bins,labels=False)
    return df
    

In [3]:
#离散变量特征工程
def one_hot_enc(df,f):
    #one-hot
    enc = OneHotEncoder()
    enc.fit(df[f].values.reshape(-1,1))
    one_hot_array=enc.transform(df[f].values.reshape(-1,1)).toarray()
    df[[f'{f}_one_hot_{i}' for i in range(one_hot_array.shape[1])]] = one_hot_array
    return df

def hash_enc(df,f_list,n_components=8):
    #hash enc
    ce_encoder = ce.HashingEncoder(cols = f_list,n_components=n_components).fit(df)
    df = ce_encoder.transform(df)
    df.rename(columns=zip([f'col_{i}' for i in range(n_components)],[f'hash_enc_{i}' for i in range(n_components)]))
    return df

def count_enc(df,f):
    #count enc
    map_dict = dict(zip(df[f].unique(), range(df[f].nunique())))
    df[f'label_enc_{f}'] = df[f].map(map_dict).fillna(-1).astype('int32')
    df[f'{f}_count'] = df[f].map(df[f].value_counts())
    return df

def target_enc(df,f,label):
    #target enc
    df[f'{f}_target_enc'] = df.groupby(f)[label].transform('mean')
    return df

def cross_enc(df,f1,f2):
    #cross enc
    if f'{f1}_count' not in df.columns:
        df = count_enc(df,f1)
    if f'{f2}_count' not in df.columns:
        df = count_enc(df,f2)
    df[f'{f1}_{f2}'] = df[f1].astype('str')+'_'+df[f2].astype('str')
    df = count_enc(df,f'{f1}_{f2}')
    df[f'{f1}_{f2}_count_div_{f1}_count'] = df[f'{f1}_{f2}_count'] / (df[f'{f1}_count']+df[f'{f1}_count'].mean())
    df[f'{f1}_{f2}_count_div_{f2}_count'] = df[f'{f1}_{f2}_count'] / (df[f'{f2}_count']+df[f'{f2}_count'].mean())
    return df
    
def group_stat(df,cat_fea,num_fea):
    for stat in ['min', 'max', 'mean', 'median', 'std', 'skew']:
        df[f'{cat_fea}_{num_fea}_groupby_{stat}'] = df.groupby(cat_fea)[num_fea].transform(stat)
    return df

In [4]:
def tfidf(sentences, output_num, output_prefix='hahaha', seed=1024):
    temp_sentences = []
    for i in range(len(sentences)):
        temp_sentences.append(' '.join(sentences[i]))
    #进行tfidf计算
    tfidf_enc = TfidfVectorizer()
    tfidf_vec = tfidf_enc.fit_transform(temp_sentences)
    #使用SVD进行降维
    svd_tmp = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    svd_tmp = svd_tmp.fit_transform(tfidf_vec)
    svd_tmp = pd.DataFrame(svd_tmp)
    svd_tmp.columns = ['{}_tfidf_{}'.format(output_prefix, i) for i in range(output_num)]
    return svd_tmp

In [5]:
def word2vec(sentences, emb_size=16, output_prefix='lalala'):
    temp_sentences = []
    for i in range(len(sentences)):
        temp_sentences.append([str(x) for x in sentences[i]])
    model = Word2Vec(temp_sentences, size=emb_size, window=5, min_count=2, sg=0, hs=1, seed=4399,workers=-1)
    #对seq里的所有单词的embedding取均值
    emb_matrix = []
    for seq in sentences:
        vec = []
        for w in seq:
            if w in model:
                vec.append(model[w])
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)
    emb_matrix = np.array(emb_matrix)
    tmp = pd.DataFrame()
    for i in range(emb_size):
        tmp['{}_emb_{}'.format(output_prefix, i)] = emb_matrix[:,i]
    del model, emb_matrix, sentences
    return tmp

In [6]:
#以这个数据量较少的数据为例
df = pd.read_csv('../data/user_interaction_data.csv')
df['item_id'] = df['item_id'].astype('str')
tmp = df.groupby('user_id', as_index=False)['item_id'].agg({'user_item_list': list})
sentences = tmp['user_item_list'].values.tolist()

In [7]:
w2v_emb=word2vec(sentences,16)
tfidf_emb=tfidf(sentences,16)

  "C extension not loaded, training will be slow. "
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [8]:
w2v_emb

Unnamed: 0,lalala_emb_0,lalala_emb_1,lalala_emb_2,lalala_emb_3,lalala_emb_4,lalala_emb_5,lalala_emb_6,lalala_emb_7,lalala_emb_8,lalala_emb_9,lalala_emb_10,lalala_emb_11,lalala_emb_12,lalala_emb_13,lalala_emb_14,lalala_emb_15
0,0.009737,-0.013285,0.004612,0.012407,0.008679,-0.008788,0.020353,0.024181,-0.003551,0.007680,-0.027824,0.022057,0.007029,-0.010251,-0.021674,-0.004625
1,0.002314,-0.013534,-0.025044,0.011010,0.016816,0.007989,0.028191,0.023471,-0.003281,0.015639,0.019876,0.016439,-0.005004,-0.019052,-0.022054,-0.010852
2,0.007548,-0.015330,-0.007102,-0.009415,-0.000247,0.009131,0.004769,0.001233,0.004407,-0.005636,0.012606,-0.003527,0.001552,-0.004034,-0.013886,0.005432
3,0.001581,-0.011654,0.017218,0.029895,-0.011184,0.025948,-0.023319,0.030582,-0.022472,-0.017560,0.015571,0.022475,-0.010324,-0.019503,0.014681,-0.022149
4,0.001913,-0.014091,0.001349,-0.002950,-0.010620,0.018545,-0.016339,-0.011518,-0.008122,-0.025530,0.012575,-0.019031,0.012045,0.006884,0.010833,-0.002046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19941,0.004492,0.012489,-0.002318,0.002493,-0.018478,-0.012618,-0.008383,0.018873,-0.007636,-0.005724,0.001846,-0.004578,0.010843,-0.000241,0.008800,0.003996
19942,0.013696,0.006765,-0.013529,0.022037,0.025384,-0.028061,-0.024155,0.023207,-0.015827,0.026287,0.007273,-0.013410,-0.011183,0.020145,0.018697,-0.000210
19943,-0.001135,-0.029375,-0.002435,-0.028462,-0.000584,-0.029571,-0.003909,0.025421,-0.010802,-0.022230,-0.030530,0.001630,0.004077,-0.011616,0.007434,-0.017574
19944,-0.013306,0.000348,0.003029,-0.008562,0.003975,-0.001076,0.005266,-0.006625,0.001759,0.002467,0.001869,-0.000225,0.010673,-0.004576,-0.001478,-0.001898


In [9]:
tfidf_emb

Unnamed: 0,hahaha_tfidf_0,hahaha_tfidf_1,hahaha_tfidf_2,hahaha_tfidf_3,hahaha_tfidf_4,hahaha_tfidf_5,hahaha_tfidf_6,hahaha_tfidf_7,hahaha_tfidf_8,hahaha_tfidf_9,hahaha_tfidf_10,hahaha_tfidf_11,hahaha_tfidf_12,hahaha_tfidf_13,hahaha_tfidf_14,hahaha_tfidf_15
0,6.951716e-08,3.917873e-08,-3.269172e-08,1.136269e-08,-1.495753e-07,1.368356e-07,1.681983e-08,-3.668431e-07,1.156786e-06,8.656912e-06,1.131949e-07,1.430107e-07,1.435050e-07,3.817728e-03,1.168357e-07,-4.821589e-07
1,8.020730e-07,2.144746e-07,4.010786e-07,-8.124820e-08,8.889110e-07,3.344838e-07,-6.957637e-11,1.075694e-07,-2.627144e-07,1.781419e-08,5.124149e-07,-4.273089e-07,1.068066e-06,2.248585e-10,-3.738512e-09,9.030862e-08
2,3.573482e-04,5.802607e-05,-4.822030e-04,8.521284e-04,4.543415e-05,2.813493e-04,5.862667e-06,-1.569175e-04,-1.031083e-04,-6.369409e-06,-1.491733e-06,2.198415e-05,9.627415e-06,-1.921810e-06,3.870254e-05,2.807317e-04
3,1.111446e-13,7.613448e-14,-7.253641e-13,8.157676e-12,9.442730e-13,1.153886e-13,3.749019e-14,1.537246e-11,5.335708e-12,-4.917369e-14,1.585901e-13,-3.679138e-13,-2.198187e-13,8.704933e-13,-2.152338e-13,-1.161753e-12
4,1.195712e-02,3.539646e-03,-2.929524e-03,1.208564e-04,-2.918633e-03,1.920075e-03,2.847949e-04,-2.864683e-03,6.629736e-03,-6.703363e-04,2.212074e-02,-2.859169e-03,-8.963988e-03,-2.298765e-06,3.676231e-03,-8.869478e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19941,1.376592e-05,3.309791e-05,8.181534e-07,6.956614e-07,-1.246497e-06,5.584086e-06,1.634091e-06,2.398185e-06,-8.206191e-06,2.039173e-05,6.732890e-06,7.609823e-06,-1.298899e-05,4.124797e-09,4.911132e-06,7.667366e-04
19942,2.167089e-06,1.270051e-06,-9.398568e-07,1.958496e-07,-3.088602e-06,2.842897e-06,5.608788e-07,-6.609441e-06,2.070053e-05,1.467277e-04,2.160491e-05,4.657448e-06,2.418945e-06,5.970596e-10,1.020098e-07,-4.923463e-06
19943,8.228377e-06,3.218642e-05,-3.100119e-05,4.644238e-04,5.361458e-05,2.682901e-05,1.249916e-04,-3.908764e-04,-1.074649e-04,-7.051941e-06,-9.821619e-06,9.948227e-08,-2.013161e-05,-1.583021e-07,6.891074e-03,-2.318087e-06
19944,3.050092e-04,2.988058e-04,-4.683525e-06,2.487672e-05,-7.869918e-05,1.258718e-04,3.565852e-03,-4.000577e-05,1.465839e-04,3.468422e-04,4.647792e-04,-7.124028e-05,-4.680520e-04,7.479761e-07,8.053121e-05,1.483632e-03


In [10]:
df = group_stat(df,'user_id','interact_type')

In [11]:
df

Unnamed: 0,user_id,item_id,interact_type,date,user_id_interact_type_groupby_min,user_id_interact_type_groupby_max,user_id_interact_type_groupby_mean,user_id_interact_type_groupby_median,user_id_interact_type_groupby_std,user_id_interact_type_groupby_skew
0,10243056,22635954,1,213,1,1,1.000000,1.0,0.000000,0.000000
1,10203565,24723827,3,213,1,11,6.413333,3.0,3.520704,0.068759
2,10317559,20413036,2,213,1,4,1.981818,2.0,0.490310,1.908882
3,10158940,23833050,1,213,1,4,1.095541,1.0,0.528465,5.383825
4,10376271,22218154,1,213,1,4,1.013323,1.0,0.199555,14.921979
...,...,...,...,...,...,...,...,...,...,...
198603,10407802,22901903,10,181,7,10,9.891945,10.0,0.534364,-5.068541
198604,10407802,22901903,10,181,7,10,9.891945,10.0,0.534364,-5.068541
198605,10407802,22901903,10,181,7,10,9.891945,10.0,0.534364,-5.068541
198606,10407802,20797334,7,181,7,10,9.891945,10.0,0.534364,-5.068541


In [12]:
df = cross_enc(df,'user_id','interact_type')

In [13]:
df

Unnamed: 0,user_id,item_id,interact_type,date,user_id_interact_type_groupby_min,user_id_interact_type_groupby_max,user_id_interact_type_groupby_mean,user_id_interact_type_groupby_median,user_id_interact_type_groupby_std,user_id_interact_type_groupby_skew,label_enc_user_id,user_id_count,label_enc_interact_type,interact_type_count,user_id_interact_type,label_enc_user_id_interact_type,user_id_interact_type_count,user_id_interact_type_count_div_user_id_count,user_id_interact_type_count_div_interact_type_count
0,10243056,22635954,1,213,1,1,1.000000,1.0,0.000000,0.000000,0,21,0,87379,10243056_1,0,21,0.064006,0.000141
1,10203565,24723827,3,213,1,11,6.413333,3.0,3.520704,0.068759,1,225,1,1906,10203565_3,1,113,0.212369,0.001780
2,10317559,20413036,2,213,1,4,1.981818,2.0,0.490310,1.908882,2,55,2,5142,10317559_2,2,48,0.132563,0.000719
3,10158940,23833050,1,213,1,4,1.095541,1.0,0.528465,5.383825,3,157,0,87379,10158940_1,3,152,0.327521,0.001020
4,10376271,22218154,1,213,1,4,1.013323,1.0,0.199555,14.921979,4,1351,0,87379,10376271_1,4,1345,0.811173,0.009029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198603,10407802,22901903,10,181,7,10,9.891945,10.0,0.534364,-5.068541,2934,509,6,63958,10407802_10,3355,486,0.595521,0.003871
198604,10407802,22901903,10,181,7,10,9.891945,10.0,0.534364,-5.068541,2934,509,6,63958,10407802_10,3355,486,0.595521,0.003871
198605,10407802,22901903,10,181,7,10,9.891945,10.0,0.534364,-5.068541,2934,509,6,63958,10407802_10,3355,486,0.595521,0.003871
198606,10407802,20797334,7,181,7,10,9.891945,10.0,0.534364,-5.068541,2934,509,5,18540,10407802_7,7916,16,0.019606,0.000200
