In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc
pd.set_option('display.max_columns', None)


def reduce_mem(df):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Only columns backed by a plain NumPy signed-integer or floating dtype are
    converted. Everything else (object, bool, unsigned int, datetime,
    timedelta, categorical and other extension dtypes) is left untouched:
    routing those through the float branch either silently turns them into
    floats or raises when their min/max is compared with ``np.finfo`` limits.

    The range checks are strict (``>`` / ``<``), so a column whose extreme
    value sits exactly on a type's limit gets the next wider type.

    NOTE: floats that fit the float16 *range* are stored as float16, which
    keeps only about three significant decimal digits. Do not pass columns
    whose exact values matter (e.g. float join keys).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.

    Side effects
    ------------
    Prints memory before / after (MiB) and the percentage saved, then runs
    ``gc.collect()``.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)
    for col in df.columns:
        col_type = df[col].dtypes
        # Skip anything that is not a plain NumPy signed-int / float column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            # Narrowest signed type whose open range contains [c_min, c_max];
            # if none matches the column keeps its current dtype.
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN (all-missing column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Avoid ZeroDivisionError when the frame reports no memory at all.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df

import pickle

# Directory holding the objects pickled before the feature-engineering step.
path = 'D:\\ctr contest\\inter var\\before_feat_eng\\'


def _load_pickle(file_name):
    """Return the object unpickled from ``path + file_name``.

    The file is opened with a context manager so the handle is closed even
    when unpickling raises.

    NOTE: ``pickle.load`` executes arbitrary code embedded in the file; only
    load pickles produced by this project.
    """
    with open(path + file_name, 'rb') as fh:
        return pickle.load(fh)


df = _load_pickle('df.pkl')
click_df = _load_pickle('click_df.pkl')
sort_df = _load_pickle('sort_df.pkl')
labels = _load_pickle('labels.pkl')
train_num = _load_pickle('train_num.pkl')

In [None]:
# Wall-clock reference point; emb() prints the time elapsed since this moment.
t = time.time()

print('=============================================== feat eng ===============================================')

print('*************************** embedding ***************************')
# A friend once gave what I think is a very vivid analogy for embeddings:
# on the dating show "If You Are the One", if you want to understand a female
# contestant, look at which male contestants she has shown interest in;
# and the other way round, if you want to get to know a male contestant, look
# at which female contestants he has picked.


def emb(df, f1, f2):
    """Build an 8-dimensional embedding of each ``f1`` value from its ``f2`` history.

    For every distinct ``f1`` value the ``f2`` values of its rows (in ``df``
    row order) are stringified and treated as one sentence. A CBOW Word2Vec
    model is trained on those sentences, and each ``f1`` value is represented
    by the mean vector of the tokens in its own sentence that made it into
    the vocabulary. Sentences with no in-vocabulary token get an all-zero
    vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must contain columns ``f1`` and ``f2``.
    f1 : str
        Column whose values receive an embedding (the group key).
    f2 : str
        Column whose values form the sentences.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``f1`` value with columns ``f1`` and
        ``'{f1}_{f2}_emb_{i}'`` for ``i`` in ``0..7``, passed through
        ``reduce_mem``.

    Notes
    -----
    * Reads the module-level timer ``t`` to print the elapsed runtime.
    * ``reduce_mem`` is applied to the whole result, key column included, so
      the ``f1`` column itself may come back with a narrower dtype.
    * ``size=`` is the gensim 3.x spelling used throughout this file
      (gensim >= 4 renamed it to ``vector_size``).
    """
    emb_size = 8
    print('====================================== {} {} ======================================'.format(f1, f2))

    # One list of f2 values per f1 key. ``.agg({name: list})`` on a
    # SeriesGroupBy (dict renaming) raises SpecificationError on
    # pandas >= 1.0, so collect the lists with ``apply`` instead.
    grouped = df.groupby(f1)[f2].apply(list)
    tmp = pd.DataFrame({f1: grouped.index.values})
    sentences = [[str(x) for x in seq] for seq in grouped.values]

    model = Word2Vec(sentences, size=emb_size, window=5, min_count=5, sg=0, hs=1, seed=2019)
    # Vocabulary lookups go through ``model.wv``; indexing the model object
    # directly is the deprecated access path.
    vectors = model.wv

    # Rows stay zero for sentences without any in-vocabulary token.
    emb_matrix = np.zeros((len(sentences), emb_size), dtype=np.float32)
    for row, seq in enumerate(sentences):
        known = [vectors[w] for w in seq if w in vectors]
        if known:
            emb_matrix[row] = np.mean(known, axis=0)

    for i in range(emb_size):
        tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i]
    del model, vectors, emb_matrix, sentences
    tmp = reduce_mem(tmp)
    print('runtime:', time.time() - t)
    return tmp


# Column pairs to embed against each other; each pair is embedded in both
# directions (f1 described by its f2 history, and f2 by its f1 history).
emb_cols = [
    ['deviceid', 'newsid'],
    ['deviceid', 'lng_lat'],
    ['newsid', 'lng_lat'],
    # ...
]
for f1, f2 in emb_cols:
    df = df.merge(emb(sort_df, f1, f2), on=f1, how='left')
    df = df.merge(emb(sort_df, f2, f1), on=f2, how='left')
# sort_df is deliberately NOT deleted here: the DeepWalk cell further down
# calls deepwalk(sort_df, ...), so dropping it at this point makes that cell
# fail with NameError on a top-to-bottom run. Release it after its last use.
gc.collect()

In [None]:
print('***************************deep walk embedding ***************************')
import random
def _deepwalk_graph(pairs):
    """Build the bipartite adjacency map ``node -> set(neighbour nodes)``.

    Each row of ``pairs`` is ``(f1_value, f2_value)``. The first value becomes
    node ``'user_<int>'`` and the second ``'item_<int>'``; both directions of
    the edge are stored. Rows where either value cannot be converted with
    ``int()`` (NaN, None, non-numeric text, infinity) are skipped.
    """
    graph = {}
    for left, right in pairs:
        try:
            user_node = 'user_' + str(int(left))
            item_node = 'item_' + str(int(right))
        except (TypeError, ValueError, OverflowError):
            continue
        graph.setdefault(item_node, set()).add(user_node)
        graph.setdefault(user_node, set()).add(item_node)
    return graph


def _deepwalk_walks(neighbors, path_length, rng):
    """Start one random walk at every node of ``neighbors``.

    A walk grows until it holds ``path_length`` nodes, or stops early as soon
    as the randomly chosen next node is the one visited two steps ago (i.e.
    the walk would immediately step back). Prints a progress counter every
    100000 walks.
    """
    walks = []
    for start in neighbors:
        walk = [start]
        # ``<`` rather than ``!=`` so path_length < 1 cannot loop forever.
        while len(walk) < path_length:
            nxt = rng.choice(neighbors[walk[-1]])
            if len(walk) >= 2 and nxt == walk[-2]:
                break
            walk.append(nxt)
        walks.append(walk)
        if len(walks) % 100000 == 0:
            print(len(walks))
    return walks


def _deepwalk_frame(vectors, raw_values, node_prefix, names):
    """Collect ``[int_key, *vector]`` rows for every distinct raw value that has a trained node."""
    rows = []
    for value in set(raw_values):
        try:
            key = int(value)
        except (TypeError, ValueError, OverflowError):
            continue
        node = node_prefix + str(key)
        if node in vectors:
            rows.append([key] + list(vectors[node]))
    # Passing ``columns=`` here (instead of assigning afterwards) also works
    # when ``rows`` is empty.
    return pd.DataFrame(rows, columns=names)


def deepwalk(df, f1, f2, emb_size=16, path_length=24, seed=None):
    """DeepWalk embeddings for the bipartite graph linking ``f1`` and ``f2`` values.

    Steps: build the graph from the ``(f1, f2)`` rows of ``df``, start one
    truncated random walk from every node, train a skip-gram Word2Vec model
    on the shuffled walks, and export the learned vectors as two frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain columns ``f1`` and ``f2`` holding int-convertible ids.
    f1, f2 : str
        Column names; ``f1`` values become ``user_*`` nodes, ``f2`` values
        ``item_*`` nodes.
    emb_size : int, default 16
        Embedding dimensionality (also embedded in the output column names).
    path_length : int, default 24
        Maximum number of nodes per walk.
    seed : int or None, default None
        ``None`` keeps the historical behaviour (module-level ``random``
        state, Word2Vec's default seed). With an int, walks use a private
        ``random.Random(seed)`` over sorted neighbour lists and the seed is
        forwarded to Word2Vec. NOTE(review): training still uses
        ``workers=10``, so the learned vectors are presumably not fully
        reproducible even with a seed; confirm against the gensim docs.

    Returns
    -------
    tuple(pandas.DataFrame, pandas.DataFrame)
        ``(out_df1, out_df2)``. ``out_df1`` has column ``f1`` plus
        ``'{f1}_{f2}_{f1}_deepwalk_embedding_{emb_size}_{i}'``; ``out_df2``
        has column ``f2`` plus ``'{f1}_{f2}_{f2}_deepwalk_emb_{emb_size}_{i}'``.
        The two suffixes differ (``embedding`` vs ``emb``); this is kept
        as-is so existing feature names do not change.

    Notes
    -----
    ``size=`` / ``iter=`` are the gensim 3.x keyword spellings used in this
    file (gensim >= 4 renamed them to ``vector_size`` / ``epochs``).
    """
    print("deepwalk:", f1, f2)

    # Build the graph.
    graph = _deepwalk_graph(df[[f1, f2]].values)
    if seed is None:
        rng = random
        neighbors = {node: list(adj) for node, adj in graph.items()}
    else:
        rng = random.Random(seed)
        # Set iteration order of strings varies between interpreter runs;
        # sorting makes the seeded walks repeatable.
        neighbors = {node: sorted(adj) for node, adj in graph.items()}
    print("creating")

    # Build the walks.
    sentences = _deepwalk_walks(neighbors, path_length, rng)
    print(np.mean([len(s) for s in sentences]))
    print(len(sentences))

    # Train the DeepWalk model.
    print('training...')
    rng.shuffle(sentences)
    extra = {} if seed is None else {'seed': seed}
    model = Word2Vec(sentences, size=emb_size, window=4, min_count=1, sg=1,
                     workers=10, iter=20, **extra)

    # Export.
    print('outputing...')
    vectors = model.wv
    names = [f1] + [
        f1 + '_' + f2 + '_' + f1 + '_deepwalk_embedding_' + str(emb_size) + '_' + str(i)
        for i in range(emb_size)
    ]
    out_df1 = _deepwalk_frame(vectors, df[f1].values, 'user_', names)
    print(out_df1.head())

    names = [f2] + [
        f1 + '_' + f2 + '_' + f2 + '_deepwalk_emb_' + str(emb_size) + '_' + str(i)
        for i in range(emb_size)
    ]
    out_df2 = _deepwalk_frame(vectors, df[f2].values, 'item_', names)
    print(out_df2.head())
    return (out_df1, out_df2)


# Column pairs to embed with DeepWalk. The ('lng_lat', 'newsid') pair is
# currently switched off.
emb_cols = [['deviceid', 'newsid']]
for f1, f2 in emb_cols:
    f1_side_emb, f2_side_emb = deepwalk(sort_df, f1, f2)
    # Only the f1-side frame is joined onto df; the f2-side frame is dropped
    # without being used.
    df = pd.merge(df, f1_side_emb, on=f1, how='left')
    del f1_side_emb, f2_side_emb
    gc.collect()

In [None]:
import pickle
import re

import surprise

# Number of latent factors learned per device; also the number of
# 'tag_scd_<i>_feature' columns produced below.
n_factors = 20

user = pd.read_csv('E:\\contest\\ctr\\user.csv')

# Explode every device's tag string into (deviceid, tag, score) triples.
# Each '|'-separated token that contains ':' is split on ':' or '_'; field 0
# is used as the tag id and field 2 is parsed as the float rating.
countrow = 0
userlist = []
itemlist = []
ratelist = []
for deviceid, ta in user[user['tag'].notna()][['deviceid', 'tag']].values:
    countrow += 1
    if countrow % 10000 == 0:
        print(countrow)

    try:
        tasp = []
        for tt in ta.split('|'):
            if len(tt.split(':')) > 1:
                fields = re.split(':|_', tt)
                tasp.append([fields[0], float(fields[2])])
    except (AttributeError, IndexError, ValueError):
        # Malformed tag string: report it and drop the whole row.
        print(ta)
        continue

    for tag_id, score in tasp:
        userlist.append(deviceid)
        itemlist.append(tag_id)
        ratelist.append(score)

train = pd.DataFrame({'userID': userlist, 'itemsID': itemlist, 'rating': ratelist})
# Seeded shuffle so repeated runs feed the ratings to SVD in the same order.
train = train.sample(frac=1.0, random_state=0).reset_index(drop=True)
# NOTE(review): the cut-off 4.57 is slightly above the declared
# rating_scale upper bound of 4.5 below; confirm which one is intended.
train = train[train['rating'] < 4.57]

train_set = surprise.Dataset.load_from_df(
    train, reader=surprise.Reader(rating_scale=(1, 4.5))
).build_full_trainset()
svd = surprise.SVD(random_state=0, n_factors=n_factors, n_epochs=300, verbose=True, lr_all=0.0005)
start_time = time.time()
svd.fit(train_set)
train_time = time.time() - start_time

# Rows of svd.pu are indexed by Surprise's *inner* user ids, so map each
# inner id back to the raw device id through the trainset instead of assuming
# the rows follow the order of train['userID'].unique().
user_id_uni = [train_set.to_raw_uid(inner_uid) for inner_uid in train_set.all_users()]
w2v = [
    [raw_uid] + list(svd.pu[inner_uid])
    for inner_uid, raw_uid in enumerate(user_id_uni)
]

out_df = pd.DataFrame(
    w2v,
    columns=['deviceid'] + ['tag_scd_' + str(i) + '_feature' for i in range(n_factors)],
)

# Translate raw device ids to the encoded ids used in df.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this project.
with open('D:\\ctr contest\\inter var\\map_dict\\deviceid_map2.pkl', 'rb') as fh:
    did_map_dict = pickle.load(fh)

# A device id missing from the map raises KeyError, as before.
out_df['deviceid'] = out_df['deviceid'].apply(lambda x: did_map_dict[x])
df = df.merge(out_df, on='deviceid', how='left')

In [None]:

        
out = 'D:\\ctr contest\\inter var\\features\\'
# Persist only the columns from position 26 onwards.
# NOTE(review): 26 is presumably the number of columns df had before this
# notebook added features; confirm, since any upstream column change shifts it.
# The context manager closes the file even if pickling fails part-way.
with open(out + 'embedding_feature.pkl', 'wb') as fh:
    pickle.dump(df[df.columns.to_list()[26:]], fh, protocol=4)
