In [2]:
import pandas as pd
import numpy as np
import pickle

# Load every input table in one pass. The assignment targets are listed in the
# same order as the paths handed to pd.read_csv, so the two tuples pair up
# position by position.
(
    users_df,
    course_df,
    train_df,
    val_seen_df,
    test_seen_df,
    val_unseen_df,
    test_unseen_df,
) = map(
    pd.read_csv,
    (
        './data/users.csv',
        './data/courses.csv',
        './data/train.csv',
        './data/val_seen.csv',
        './data/test_seen.csv',
        './data/val_unseen.csv',
        './data/test_unseen.csv',
    ),
)

# User Tokenization

## 清理資料和選取features

In [None]:
# Outer-join the user profiles with the training purchases on their shared
# columns, keeping rows whose key appears in only one of the two tables.
users_course_df = users_df.merge(train_df, how="outer")

# Index label -> course_id, plus the reverse lookup.
id2course_mapping = course_df["course_id"].to_dict()
course2id_mapping = dict(zip(id2course_mapping.values(), id2course_mapping.keys()))
print(len(course2id_mapping))

# Index label of the merged frame -> user_id, plus the reverse lookup.
# If a user_id occurs on several rows, the reverse lookup keeps the last one.
id2user_mapping = users_course_df["user_id"].to_dict()
user2id_mapping = dict(zip(id2user_mapping.values(), id2user_mapping.keys()))

In [None]:
def clean_data(x):
    """Remove the filler words "其他" and "other" from a text value.

    Every occurrence of either word is deleted ("其他" first, then "other");
    the rest of the string is returned untouched.
    """
    for filler_word in ("其他", "other"):
        x = x.replace(filler_word, "")
    return x

# Columns that make up each user's text profile.
features = ['gender', 'occupation_titles', 'interests', 'recreation_names', 'course_id']

# Replace missing values with empty strings, keep only the profile columns,
# then run clean_data over every cell of those columns.
filledna_df = users_course_df.fillna('')[features]
filledna_df = filledna_df.assign(
    **{column: filledna_df[column].apply(clean_data) for column in features}
)

In [6]:
def create_soup(x):
    """Join a row's gender, occupation, interest and recreation text with single spaces."""
    profile_fields = ('gender', 'occupation_titles', 'interests', 'recreation_names')
    return ' '.join(x[field] for field in profile_fields)
def create_soup_with_courseID(x):
    """Join a row's profile text and its course_id text with single spaces."""
    soup_fields = ('gender', 'occupation_titles', 'interests', 'recreation_names', 'course_id')
    return ' '.join(x[field] for field in soup_fields)

# Course-free variant of the profile text, currently disabled:
#filledna_df['soup'] = filledna_df.apply(create_soup, axis=1)
# Build one profile string per row (axis=1). The course IDs are included so
# the text implicitly carries the user-item relationship.
filledna_df['soup_with_courseID'] = filledna_df.apply(create_soup_with_courseID, axis=1)

## 將user文本資料轉向量並計算users間的相似度

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the per-user profile text. The fit has to go through
# `tfidf_add_course` itself: the original call used an undefined name `tfidf`,
# which raises NameError on a fresh kernel and would otherwise leave the
# vectorizer that gets pickled below unfitted.
tfidf_add_course = TfidfVectorizer()
tfidf_add_course_matrix = tfidf_add_course.fit_transform(filledna_df['soup_with_courseID'])

# Pairwise cosine similarity between all rows of the TF-IDF matrix; row/column
# i corresponds to row i of filledna_df.
cosine_sim = cosine_similarity(tfidf_add_course_matrix, tfidf_add_course_matrix)

In [13]:
# Bundle the vectorizer, the TF-IDF matrix and the similarity matrix into a
# single pickle file.
tfidf_save = {
    "TfidfVectorizer": tfidf_add_course,
    "Tfidf_matrix": tfidf_add_course_matrix,
    "Cos_sim": cosine_sim,
}
with open('user_add_course_embedding.pickle', 'wb') as pickle_file:
    pickle.dump(tfidf_save, pickle_file)

# 提前算出和user最相似且買過課程的300個users和對應權重

In [None]:
def _split_course_ids(course_ids):
    """Return the course IDs held in one space-separated cell.

    A cell that is not a string (pandas reads an empty CSV cell as NaN) or that
    holds only whitespace yields an empty list, so such a user is not treated
    as having purchased anything.
    """
    if not isinstance(course_ids, str):
        return []
    # split() without an argument drops empty tokens, unlike split(' ').
    return course_ids.split()


def _collect_purchases(purchase_df, combined):
    """Build {user_id: [course_id, ...]} for one split.

    Each user's courses are also appended to `combined`, the dictionary that
    accumulates purchases across all splits. Returns the per-split dictionary.
    """
    per_split = {}
    for user_id, course_ids in zip(purchase_df["user_id"], purchase_df["course_id"]):
        course_id_list = _split_course_ids(course_ids)
        per_split[user_id] = per_split.get(user_id, []) + course_id_list
        combined[user_id] = combined.get(user_id, []) + course_id_list
    return per_split


# Purchases accumulated over train + val_seen + val_unseen.
user_haved_purchased_course = {}

train_user_haved_purchased_course = _collect_purchases(train_df, user_haved_purchased_course)
val_seen_user_haved_purchased_course = _collect_purchases(val_seen_df, user_haved_purchased_course)
val_unseen_user_haved_purchased_course = _collect_purchases(val_unseen_df, user_haved_purchased_course)

# Unique users with at least one purchased course in any of the three splits.
haved_purchased_users_list = list({
    user_id
    for purchases in (
        train_user_haved_purchased_course,
        val_seen_user_haved_purchased_course,
        val_unseen_user_haved_purchased_course,
    )
    for user_id, course_id_list in purchases.items()
    if len(course_id_list) > 0
})

print(len(train_user_haved_purchased_course))
print(len(val_seen_user_haved_purchased_course))
print(len(val_unseen_user_haved_purchased_course))
print(len(haved_purchased_users_list))
print(len(user_haved_purchased_course))

In [12]:
# Reload the embedding bundle (vectorizer, TF-IDF matrix, cosine similarities)
# from the pickle file.
with open('user_add_course_embedding.pickle', 'rb') as pickle_file:
    user_add_course_embedding = pickle.load(pickle_file)

## 取得前top個相似且買過課程的users

In [154]:
def get_recommend_users(user_id, cosine_sim, top = 10):
    """Return the most similar users that have purchased at least one course.

    Args:
        user_id: Id of the query user; must be a key of ``user2id_mapping``.
        cosine_sim: Similarity matrix indexed by row position, where
            ``cosine_sim[i]`` holds the scores of row ``i`` against every row.
        top: Maximum number of neighbours to return.

    Returns:
        A pair ``(indices, weights)``: the row indices of the selected users in
        descending similarity order, and their similarity scores. The query
        user and users absent from ``haved_purchased_users_list`` are skipped,
        so fewer than ``top`` entries may come back.

    Raises:
        KeyError: If ``user_id`` is not in ``user2id_mapping``.
    """
    user_index = user2id_mapping[user_id]

    # Hash the purchaser list once per call; testing membership against the
    # list itself costs a full scan for every candidate examined.
    purchased_users = set(haved_purchased_users_list)

    # sorted() is stable, so equal scores keep their original index order.
    ranked_candidates = sorted(
        enumerate(cosine_sim[user_index]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    recommend_indices = []
    recommend_indices_weights = []
    for candidate_index, similarity in ranked_candidates:
        if len(recommend_indices) >= top:
            break
        if candidate_index == user_index:
            continue  # never recommend the query user to themselves
        if id2user_mapping[candidate_index] not in purchased_users:
            continue
        recommend_indices.append(candidate_index)
        recommend_indices_weights.append(similarity)

    return recommend_indices, recommend_indices_weights

## 對Seen中的users計算並儲存

In [None]:
# For every user in the seen test split, store the 300 most similar users that
# have purchased courses, together with the similarity weights.
predict_users = test_seen_df["user_id"].to_list()
test_add_course_similiar_haved_purchased_users_300 = {}
test_add_course_similiar_haved_purchased_300_weights = {}
print(len(predict_users))
for user_id in predict_users:
    recommend_indices, recommend_indices_weights = get_recommend_users(user_id, user_add_course_embedding['Cos_sim'], 300)
    # The returned indices are rows of the similarity matrix, i.e. rows of
    # users_course_df, so they are translated with id2user_mapping. users_df is
    # not guaranteed to share that row order or length, so indexing it by
    # position could store the wrong user ids.
    test_add_course_similiar_haved_purchased_users_300[user_id] = [id2user_mapping[index] for index in recommend_indices]
    test_add_course_similiar_haved_purchased_300_weights[user_id] = recommend_indices_weights

In [None]:
# Write the neighbour lists and their weights for the seen test users to disk.
with open('similar_users/seen_user_similar_add_course.pickle', 'wb') as pickle_file:
    pickle.dump(
        {
            "similar_users": test_add_course_similiar_haved_purchased_users_300,
            "similar_user_weights": test_add_course_similiar_haved_purchased_300_weights,
        },
        pickle_file,
    )

## 對Unseen中的users計算並儲存

In [None]:
# For every user in the unseen test split, store the 300 most similar users
# that have purchased courses, together with the similarity weights.
predict_users = test_unseen_df["user_id"].to_list()
test_add_course_similiar_haved_purchased_users_300 = {}
test_add_course_similiar_haved_purchased_300_weights = {}
print(len(predict_users))
for user_id in predict_users:
    recommend_indices, recommend_indices_weights = get_recommend_users(user_id, user_add_course_embedding['Cos_sim'], 300)
    # The returned indices are rows of the similarity matrix, i.e. rows of
    # users_course_df, so they are translated with id2user_mapping. users_df is
    # not guaranteed to share that row order or length, so indexing it by
    # position could store the wrong user ids.
    test_add_course_similiar_haved_purchased_users_300[user_id] = [id2user_mapping[index] for index in recommend_indices]
    test_add_course_similiar_haved_purchased_300_weights[user_id] = recommend_indices_weights

In [None]:
# Write the neighbour lists and their weights for the unseen test users to disk.
with open('similar_users/unseen_user_similar_add_course.pickle', 'wb') as pickle_file:
    pickle.dump(
        {
            "similar_users": test_add_course_similiar_haved_purchased_users_300,
            "similar_user_weights": test_add_course_similiar_haved_purchased_300_weights,
        },
        pickle_file,
    )