In [None]:
import pandas as pd
import numpy as np
# Load the training and test-A view logs; both CSV files are read as GBK.
train_view = pd.read_csv('/data/train_view.csv',encoding='gbk' )
testa_view = pd.read_csv('/data/testa_view.csv',encoding='gbk' )

In [None]:
# Flag every row with its origin (1 = train file, 0 = test-A file), then stack
# both frames. From here on `train_view` holds the rows of BOTH files.
for frame, origin_flag in ((train_view, 1), (testa_view, 0)):
    frame['is_train'] = origin_flag
train_view = pd.concat([train_view, testa_view])
print(train_view.shape)
train_view.head()

In [None]:
# Order the log by customer, then by access time within each customer.
# NOTE(review): 'acs_tm' appears to still be a string column at this point
# (later cells use .str on it), so the ordering is lexicographic — confirm the
# raw timestamp format sorts correctly as text.
sort_keys = ['cust_wid', 'acs_tm']
train_view = train_view.sort_values(by=sort_keys)
train_view.head()

In [None]:
# Keep only the first whitespace-separated token of each access timestamp
# (presumably the calendar-date part — confirm against the raw data).
first_token = train_view['acs_tm'].str.split().str[0]
train_view['date'] = first_token
train_view.head()

In [None]:
import datetime as dt
# Parse the 'date' strings (format YYYY-MM-DD) and overwrite 'acs_tm' with the
# resulting day-resolution timestamps. pd.to_datetime parses the whole column
# in one vectorised call instead of running strptime once per row; missing
# dates become NaT and, because errors='raise' is the default, a malformed
# date string still raises.
train_view['acs_tm'] = pd.to_datetime(train_view['date'], format='%Y-%m-%d')

In [None]:
import pandas as pd

# Per-customer activity summary built from the combined view log.
# `df` is an alias of `train_view`, so the page_id cast below also changes
# `train_view` itself.
df = train_view
df['page_id'] = df['page_id'].astype(str)


def _longest_streak(days):
    """Return the length of the longest run of consecutive calendar days.

    `days` must be sorted ascending and free of duplicates. An empty
    sequence yields 0; a single day yields 1.
    """
    if len(days) == 0:
        return 0
    best = current = 1
    for previous_day, day in zip(days, days[1:]):
        # The run continues only when the gap is exactly one day.
        current = current + 1 if (day - previous_day).days == 1 else 1
        best = max(best, current)
    return best


# Sorted list of distinct active dates per customer. Rows whose timestamp is
# missing (NaT) are skipped so they cannot break the date conversion or the sort.
operate_dates = df.groupby('cust_wid')['acs_tm'].apply(
    lambda ts: sorted(set(ts.dropna().dt.date))
)

# Sequence of page_ids per customer, in the current row order of `df`.
operate_pages = df.groupby('cust_wid')['page_id'].apply(list)

# Number of distinct active days per customer.
num_login_days = operate_dates.apply(len)

# Longest run of consecutive active days per customer.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
max_consecutive_days = {
    cust_wid: _longest_streak(dates) for cust_wid, dates in operate_dates.items()
}

# Assemble the summary table, indexed by customer id.
result = pd.DataFrame({'num_login_days': num_login_days,
                       'max_consecutive_days': pd.Series(max_consecutive_days),
                       'operate_pages': operate_pages})
print(result)

In [None]:
# Preview the first rows of the per-customer summary table.
result.head()

In [None]:
# Per customer: total number of logged page views and number of distinct
# page_ids. The column name 'totoal_operate' is kept exactly as spelled.
page_lists = result['operate_pages']
result['totoal_operate'] = page_lists.apply(len)
result['operate_nums'] = page_lists.apply(lambda pages: len(set(pages)))
result.head()

In [None]:
from collections import Counter
# Count how often every page_id occurs across all customers.
counter = Counter(
    page for pages in result['operate_pages'] for page in pages
)

# Tabulate the counts: one row per distinct page_id.
operate_pages_count = pd.DataFrame(
    list(counter.items()), columns=['page_id', 'count']
)

# Show the table; the cell's value is the number of distinct page_ids.
print(operate_pages_count)
len(counter)

In [None]:
# Move the customer-id index into a regular column. If the index is unnamed,
# reset_index calls the new column 'index', which is then renamed to 'cust_wid'.
result = result.reset_index()
result = result.rename(columns={'index': 'cust_wid'})
result.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `df` starts as an alias of `train_view`, so this cast also changes `train_view`.
df = train_view
df['page_id'] = df['page_id'].astype(str)
# Build one space-separated "document" of page_ids per customer.
df = df.groupby('cust_wid')['page_id'].apply(lambda x: ' '.join(x)).to_frame().reset_index()

# TF-IDF features over the page_id documents, capped at 20 terms.
# NOTE(review): TfidfVectorizer's default tokenizer lowercases the text and
# ignores single-character tokens — confirm page_ids survive it unchanged.
vectorizer = TfidfVectorizer(max_features=20)
embedding = vectorizer.fit_transform(df['page_id']).toarray()

# Store the features as a DataFrame. max_features is only an upper bound, so
# the column names follow the actual matrix width instead of assuming 20.
embedding_df = pd.DataFrame(embedding, columns=[f"embedding_{i+1}" for i in range(embedding.shape[1])])

In [None]:
# Label each TF-IDF row with its customer id, then turn the id back into a
# column and save. Ids are assigned by position, so this relies on
# `embedding_df` rows being in the same order as `result` rows.
customer_ids = result['cust_wid']
embedding_df.index = customer_ids
embedding_df = embedding_df.reset_index()
embedding_df.to_csv('./all_view_emb.csv', index=False)

In [None]:
# Preview the first rows of the TF-IDF embedding table.
embedding_df.head()

In [None]:
# Write the TF-IDF embedding table to CSV without the row index. An earlier
# cell already writes this same file; this call overwrites it.
embedding_df.to_csv('./all_view_emb.csv', index=False)

In [None]:
from gensim.models import Word2Vec
df = result
# One "sentence" per customer: the list of that customer's page_id tokens.
# gensim's Word2Vec treats each sentence as a sequence of tokens, so every
# sentence must be a list of strings; a space-joined string would be iterated
# character by character and the model would learn character vectors.
# The outer one-element list is kept so sentences[0][i] still addresses the
# i-th customer.
sentences = [[[str(page) for page in pages] for pages in df['operate_pages']]]
len(sentences[0])

In [None]:
# Inspect the sentence built for the first customer.
sentences[0][0]

In [None]:
# Train the Word2Vec model on the per-customer sentences: 20-dimensional
# vectors, context window of 5, every token kept (min_count=1), 4 worker threads.
w2v_params = dict(vector_size=20, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=sentences[0], **w2v_params)

In [None]:
# Build one embedding per customer by averaging the vectors of the tokens in
# that customer's sentence.
from tqdm import tqdm
user_embeddings = {}
# key_to_index is a dict, so the membership test is a hash lookup rather than
# a linear scan of the index_to_key list for every token.
known_tokens = model.wv.key_to_index
vector_size = model.wv.vector_size
# Sentences are looked up by position, so sentences[0] must be in the same
# order as the rows of `result`.
for index, cust_wid in enumerate(tqdm(list(result['cust_wid']))):
    user_sentences = sentences[0][index]
    user_embedding = np.zeros((vector_size,))
    for word in user_sentences:
        if word in known_tokens:
            user_embedding += model.wv[word]

    # Divide by the sentence length; an empty sentence keeps the zero vector
    # instead of producing NaN from 0/0.
    if len(user_sentences) > 0:
        user_embedding /= len(user_sentences)
    user_embeddings[cust_wid] = user_embedding

In [None]:
# Turn the {customer id -> vector} mapping into a table with one row per
# customer and 20 named columns. This rebinds `embedding_df`, which previously
# held the TF-IDF table.
w2v_columns = [f'w2v_view_embedding_{i+1}' for i in range(20)]
embedding_df = pd.DataFrame.from_dict(
    user_embeddings,
    orient='index',
    columns=w2v_columns,
)
embedding_df.index.name = 'cust_wid'

In [None]:
# Preview the first rows of the Word2Vec embedding table.
embedding_df.head()

In [None]:
# Save the Word2Vec embeddings with the customer id as an ordinary column and
# without the row index.
w2v_table = embedding_df.reset_index()
w2v_table.to_csv('all_view_w2v_emb.csv', index=False)