# 課程 Vectorization

In [2]:
import pandas as pd
import pickle

# User table.
users_df = pd.read_csv('./data/users.csv')
# Course table. Columns include: course_id, course_name, course_price, teacher_id,
# teacher_intro, groups, sub_groups, topics, course_published_at_local, description,
# will_learn, required_tools, recommended_background, target_group, among others.
course_df = pd.read_csv('./data/courses.csv')
# Detailed information about the chapter items of each course.
chapter_items_seq = pd.read_csv('./data/course_chapter_items_sequence.csv')

## 清理資料和選取Features

In [None]:
# Series.to_dict() keys the dict by the Series index, so these map
# row label -> course_id and, inverted, course_id -> row label.
# If a course_id appears more than once, the inverse keeps the last row label.
id2course_mapping = course_df["course_id"].to_dict()
course2id_mapping = {course_id: row_label for row_label, course_id in id2course_mapping.items()}

# Same pair of lookups for users: row label -> user_id and user_id -> row label.
id2user_mapping = users_df["user_id"].to_dict()
user2id_mapping = {user_id: row_label for row_label, user_id in id2user_mapping.items()}

# Only the last expression of a cell is displayed, so report both sizes together
# (a bare len(...) in the middle of the cell is silently discarded).
len(course2id_mapping), len(id2user_mapping)

In [None]:
# Deletion table for the raw sequence text: list brackets, single quotes and
# spaces are removed before splitting on commas.
_CHAPTER_SEQ_STRIP_TABLE = str.maketrans('', '', "[]' ")

# course_id -> flat list of chapter item names, accumulated over every row of
# chapter_items_seq (a course may appear on several rows).
# Assumes each chapter_item_name_seq value is a stringified list such as
# "['a', 'b']" -- TODO confirm against the CSV.
# NOTE(review): splitting on ',' also splits any item name that itself
# contains a comma.
course_chapters = {}
for course_id, chapter_item_name_seq in zip(chapter_items_seq["course_id"], chapter_items_seq["chapter_item_name_seq"]):
    chapter_item_names = chapter_item_name_seq.translate(_CHAPTER_SEQ_STRIP_TABLE).split(',')
    # Extend the existing list in place instead of rebuilding it for every row.
    course_chapters.setdefault(course_id, []).extend(chapter_item_names)

In [14]:
def get_course_chapters(x):
    """Return the stringified chapter list for the course in row ``x``.

    Looks up ``x["course_id"]`` in the module-level ``course_chapters`` dict.

    Args:
        x: A row-like object supporting ``x["course_id"]``.

    Returns:
        ``str()`` of the stored chapter list, or ``""`` when the course has no
        entry in ``course_chapters``.
    """
    course_key = x["course_id"]
    return str(course_chapters[course_key]) if course_key in course_chapters else ""

# Add a 'chapters' column to course_df by calling get_course_chapters on each
# row (axis=1 applies the function row by row).
course_df['chapters'] = course_df.apply(get_course_chapters, axis=1)

In [None]:
def clean_data(x):
    """Remove every space character from a string value.

    Only ``str`` values are modified. Anything else (int, float, None, ...)
    is returned unchanged, so a numeric column such as a float-typed price
    passes through instead of failing on a missing ``.replace`` method.

    Args:
        x: A single cell value.

    Returns:
        ``x`` with all ``" "`` characters removed if it is a string,
        otherwise ``x`` itself.
    """
    if not isinstance(x, str):
        return x
    return x.replace(" ", "")

# Columns used as features for the content-based course representation.
features = [
    'course_name',
    'teacher_intro',
    'groups',
    'sub_groups',
    'topics',
    'will_learn',
    'recommended_background',
    'target_group',
    'required_tools',
    'chapters',
    'course_price',
]

# Replace missing values with '' and keep only the selected feature columns.
filledna_df = course_df.fillna('')[features]

# Run every selected column through clean_data, one column at a time.
for feature in features:
    filledna_df[feature] = filledna_df[feature].apply(clean_data)


## 將課程的文本資料做分詞

In [None]:
import jieba.analyse

def create_soup(x):
    """Build the text "soup" for one course row.

    The soup is a single space-separated string made of, in order: price tags,
    extracted keywords of the course name, the raw teacher_intro, groups,
    sub_groups, topics, will_learn, recommended_background and target_group
    fields, and extracted keywords of the chapters.

    Price tags:
        * price == 0      -> "free" tag, and also the "cheap" tag below
        * price <= 1000   -> "cheap" tag
        * price > 2000    -> "expensive" tag
        * anything else (including a non-numeric placeholder) -> no tag

    Args:
        x: A row-like object indexable by the column names used below.

    Returns:
        The concatenated soup string.
    """
    # jieba keyword extraction: up to 10 keywords for the name, 20 for chapters.
    # str() of the returned list (brackets and quotes included) goes into the soup.
    course_name = str(jieba.analyse.extract_tags(x['course_name'], 10))
    chapters = str(jieba.analyse.extract_tags(x['chapters'], 20))

    # NOTE(review): this function used to extract jieba keywords for
    # target_group as well, but the result was immediately overwritten with the
    # raw text, so the raw text is what has always reached the soup. That
    # effective behavior is kept here; confirm whether keywords were intended.
    target_group = x['target_group']

    price = x['course_price']
    money = ''
    # A missing price arrives as the '' placeholder left by fillna('');
    # comparing a str with an int raises TypeError, so strings get no price tag.
    if not isinstance(price, str):
        if price == 0:
            money += '本課程完全免費 '
        # Deliberately a separate `if`: a free course is also tagged as cheap.
        if price <= 1000:
            money += '本課程比較便宜 '
        elif price > 2000:
            money += '本課程比較昂貴 '

    return ' '.join((
        money,
        course_name,
        x['teacher_intro'],
        x['groups'],
        x['sub_groups'],
        x['topics'],
        x['will_learn'],
        x['recommended_background'],
        target_group,
        chapters,
    ))

# Build the text soup for every course by applying create_soup row by row
# (axis=1) and store it in a new 'soup' column.
filledna_df['soup'] = filledna_df.apply(create_soup, axis=1)

## 以TF-IDF將文本轉向量和計算課程間的相似度

In [84]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with no stop-word list (stop_words=None).
tfidf = TfidfVectorizer(stop_words= None)

# Fit the vectorizer on the 'soup' column and transform it into the TF-IDF
# matrix in a single step.
tfidf_matrix = tfidf.fit_transform(filledna_df['soup'])

In [None]:
# Print the shape of the TF-IDF matrix before computing similarities.
print(tfidf_matrix.shape)
# Compute the cosine similarity between every pair of rows of tfidf_matrix.
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Cell output: the shape of the resulting similarity matrix.
cosine_sim_matrix.shape

## 儲存結果

In [89]:
import os
import pickle

# Bundle the fitted vectorizer, the TF-IDF matrix and the cosine-similarity
# matrix into one dict so they are saved and loaded together.
tfidf_save = {"TfidfVectorizer": tfidf, "Tfidf_matrix": tfidf_matrix, "Cos_Sim": cosine_sim_matrix}

# NOTE(review): the file name says "AddPrice5Level", but create_soup in this
# notebook emits three distinct price tags -- confirm the name is still accurate.
output_path = 'content_embeding/pure_context_jieba_name_target_chapters_AddPrice5Level.pickle'

# open() does not create missing parent directories; create the output folder
# first so a fresh checkout does not fail after all the work above.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    pickle.dump(tfidf_save, f)