In [1]:
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import fasttext
from collections import Counter


np.random.seed(0)
random.seed(0)

In [2]:
# load user video act
json_file = "additional_information/user_video_act.json"

with open(json_file, "r") as f:
    lines = f.readlines()

# load course name dict
with open("entities/course.json", "r", encoding="utf8") as f:
    courses = f.readlines()
    
course_dict = dict()
for c in courses:
    json_data = json.loads(c)
    course_dict[json_data["id"]] = json_data["name"]

# load course-teacher relationship to dict
def get_course_dict(filepath):
    with open(filepath, "r", encoding="utf8") as f:
        tc_rels = f.readlines()
    tc_rel_dict = dict()
    for tc_rel in tc_rels:
        tc_rel = tc_rel.strip().split("\t")
        if tc_rel[1] not in tc_rel_dict:
            tc_rel_dict[tc_rel[1]] = [tc_rel[0]]
        else:
            tc_rel_dict[tc_rel[1]].append(tc_rel[0])
        
    return tc_rel_dict

tc_rel_dict = get_course_dict("relations/teacher-course.json")
print("total courses", len(tc_rel_dict))

sc_rel_dict = get_course_dict("relations/school-course.json")
print("total courses", len(sc_rel_dict))

# check if each course has more than one schools (no)
# for key in sc_rel_dict:
#     if len(sc_rel_dict[key]) > 1:
#         print(sc_rel_dict[key])

# load video concepts
vc_df = pd.read_csv("relations/video-concept.json",
                   header=None,
                   delimiter="\t",
                   names=["video", "concept"])
vc_dict = vc_df.groupby('video')['concept'].apply(list).to_dict()

# load course concepts
cc_df = pd.read_csv("relations/course-concept.json",
                   header=None,
                   delimiter="\t",
                   names=["course", "concept"])
cc_dict = cc_df.groupby('course')['concept'].apply(list).to_dict()

# load course video
cv_df = pd.read_csv("relations/course-video.json",
                   header=None,
                   delimiter="\t",
                   names=["course", "video"])
cv_dict = cv_df.groupby("course")["video"].apply(list).to_dict()

total courses 697
total courses 705


In [3]:
cc_dict["C_course-v1:ACCA+FA1_X+2019_T1"]

['K_订单_管理科学技术',
 'K_会计_管理科学技术',
 'K_贷款_管理科学技术',
 'K_模拟_心理学',
 'K_资源_管理科学技术',
 'K_会计等式_管理科学技术',
 'K_信息储存_管理科学技术',
 'K_知识_管理科学技术',
 'K_复式记账法_管理科学技术',
 'K_应收账款_管理科学技术',
 'K_道路_数学',
 'K_组织_管理科学技术',
 'K_预算_管理科学技术',
 'K_反馈_管理科学技术',
 'K_期限_管理科学技术',
 'K_使用_管理科学技术',
 'K_匹配_数学',
 'K_广告_管理科学技术',
 'K_总分类账_管理科学技术',
 'K_价值_管理科学技术',
 'K_计数值_管理科学技术',
 'K_所有者权益_管理科学技术',
 'K_养老基金_管理科学技术',
 'K_平衡_管理科学技术',
 'K_组织设计_管理科学技术',
 'K_商品_管理科学技术',
 'K_误差_数学',
 'K_等待_管理科学技术',
 'K_工作环境_管理科学技术',
 'K_通分_数学',
 'K_分析_心理学',
 'K_客户_管理科学技术',
 'K_练习_心理学',
 'K_慈善组织_管理科学技术',
 'K_计划_管理科学技术',
 'K_收入_管理科学技术',
 'K_索引_管理科学技术',
 'K_费用_管理科学技术',
 'K_需要确认_管理科学技术',
 'K_企业家_管理科学技术',
 'K_学会_管理科学技术',
 'K_空间_数学',
 'K_公司_管理科学技术',
 'K_学习_管理科学技术',
 'K_分类账簿_管理科学技术',
 'K_纠正_管理科学技术',
 'K_解释_数学',
 'K_政策_管理科学技术',
 'K_注意_心理学',
 'K_控制账目_管理科学技术',
 'K_检查_管理科学技术',
 'K_交换_管理科学技术',
 'K_账户_管理科学技术',
 'K_投资_管理科学技术',
 'K_价格_管理科学技术',
 'K_计算方法_数学',
 'K_福利_管理科学技术',
 'K_雇主_管理科学技术',
 'K_调整_心理学',
 'K_过账_管理科学技术',
 'K_假设_数学',
 'K_职位_管理科学技术',
 'K_应付账款_管理科学技术',
 'K_收

In [4]:
print("total users", len(lines))
user_dfs = list()

for l in tqdm(lines):
    json_data = json.loads(l)
#     print(json_data)

    activities = json_data["activity"]
    course_ids = list()
    video_ids = list()
    local_start_times = list()
    concepts = list()
    cnames = list()
    teachers = list()
    schools = list()
    
    for a in activities:
        cnames.append(course_dict[a["course_id"]]) if a["course_id"] in course_dict else cnames.append("")
        course_ids.append(a["course_id"])
        video_ids.append(a["video_id"])
        local_start_times.append(a["local_start_time"])
        concepts.append(vc_dict[a["video_id"]]) if a["video_id"] in vc_dict else concepts.append([""])
        teachers.append(tc_rel_dict[a["course_id"]]) if a["course_id"] in tc_rel_dict else teachers.append([""])
        schools.append(sc_rel_dict[a["course_id"]][0]) if a["course_id"] in sc_rel_dict else schools.append("")
        
    df = pd.DataFrame({
        "id": json_data["id"],
        "cname": cnames,
        "cid": course_ids,
        "vid": video_ids,
        "concepts": concepts,
        "teachers": teachers,
        "schools": schools,
        "local_start_time": local_start_times
    })
    df.sort_values("local_start_time", inplace=True)
    
#     display(df)
    user_dfs.append(df)

# convert to total df
total_df = pd.concat(user_dfs,ignore_index=True)
total_df.shape

  0%|          | 23/48640 [00:00<03:39, 221.00it/s]

total users 48640


100%|██████████| 48640/48640 [02:57<00:00, 273.77it/s]


(4874298, 8)

In [5]:
# max '2020-04-17 17:25:48'
bad_formatted_users = list(total_df[total_df["local_start_time"]<"2017-01-01 00:00:00"]['id'].unique())
user_in_trainingperiod = list(total_df[(total_df["local_start_time"]>='2017-01-01 00:00:00') & (total_df["local_start_time"]<='2019-11-01 00:00:00')]['id'].unique())
user_in_testperiod = list(total_df[total_df["local_start_time"]>'2019-11-01 00:00:00']['id'].unique())
selected_users = [u for u in user_in_testperiod if u in user_in_trainingperiod if u not in bad_formatted_users]
print(len(selected_users))

filtered_df = total_df[total_df["id"].isin(selected_users)]
print(filtered_df.shape)

# check whether there is at least one concept in the testing period that is not in the training period
need_to_filter = list()
for u in selected_users:
    udf = filtered_df[filtered_df["id"]==u]
    train_udf = udf[(udf["local_start_time"]>='2017-01-01 00:00:00') & (udf["local_start_time"]<='2019-11-01 00:00:00')]
    test_udf = udf[udf["local_start_time"]>'2019-11-01 00:00:00']
    train_concepts = list(set([x for sublist in train_udf["concepts"].values for x in sublist]))
    test_concepts = list(set([x for sublist in test_udf["concepts"].values for x in sublist]))
    if len([x for x in test_concepts if x not in train_concepts])==0:
#         print("unsatisfied user:", u)
        need_to_filter.append(u)

filtered_users = [u for u in selected_users if u not in need_to_filter]
filtered_df = filtered_df[filtered_df["id"].isin(filtered_users)]

2130
(311743, 8)


In [6]:
print("---------------- Entity Statistics -----------------")
distinct_concepts = list(set([c for sublist in filtered_df["concepts"].values for c in sublist if len(c)>0]))

print("filtered df", filtered_df.shape)
print("total users", len(filtered_df["id"].unique()))
print("total courses", len(filtered_df["cid"].unique()))
print("total videos", len(filtered_df["vid"].unique()))
print("total schools", len(filtered_df["schools"].unique()))
print("distinct concepts", len(distinct_concepts))
t_list = filtered_df["teachers"].values
flattend_t_list = list(set([t for sublist in t_list for t in sublist]))
print("total teachers", len(flattend_t_list))

######################
# relations (wrong before: not only limited to relations in the filtered but from relation.json files)
# print("user-course relations", filtered_df.groupby(['id','cid']).size().shape[0])
# print("course-video relations", filtered_df.groupby(['cid','vid']).size().shape[0])
# ct_list = [[filtered_df["cid"].values[i]+x for x in v] for i,v in enumerate(filtered_df["teachers"].values)]
# flattend_ct_list = list(set([ct for sublist in ct_list for ct in sublist]))
# print("teacher-course relations", len(flattend_ct_list))
# vcon_list = [[filtered_df["vid"].values[i]+x for x in v] for i,v in enumerate(filtered_df["concepts"].values)]
# flattend_vcon_list = list(set([ct for sublist in vcon_list for ct in sublist]))
# print("video-concept relations", len(flattend_vcon_list))

---------------- Entity Statistics -----------------
filtered df (290024, 8)
total users 2005
total courses 600
total videos 22403
total schools 137
distinct concepts 21037
total teachers 1385


In [7]:
# store distinct concept for embedding analysis
with open("concept21037.txt", "w") as f:
    for c in distinct_concepts:
        f.write("{}\n".format(c))

In [36]:
print("---------------- Relation Statistics based on filtered courses -----------------")
print("user-course relations", filtered_df.groupby(['id','cid']).size().shape[0])

#######################
# relations based on filtered course 
courses = filtered_df["cid"].unique()
# course video
filtered_cv_dict = dict(filter(lambda elem: elem[0] in courses, cv_dict.items()))
print("course-video relations", sum([len(x) for x in filtered_cv_dict.values()]))
# teacher course
filtered_tc_rel_dict = dict(filter(lambda elem: elem[0] in courses, tc_rel_dict.items()))
print("teacher-course relations", sum([len(x) for x in filtered_tc_rel_dict.values()]))
# video concept
videos_in_filtered_courses = list(set([x for sublist in filtered_cv_dict.values() for x in sublist]))
print(len(videos_in_filtered_courses))
filtered_vc_rel_dict = dict(filter(lambda elem: elem[0] in videos_in_filtered_courses, vc_dict.items()))
print("video-concept relations", sum([len(x) for x in filtered_vc_rel_dict.values()]))
# course concept
filtered_cc_rel_dict = dict(filter(lambda elem: elem[0] in courses, cc_dict.items()))
print("course-concept relations", sum([len(x) for x in filtered_cc_rel_dict.values()]))

# concepts = list(set([x for sublist in filtered_cc_rel_dict.values() for x in sublist]))
# print("total concepts in filtered courses", len(concepts))

# store KC.p
concepts = distinct_concepts
course2index = dict(zip(courses, list(range(len(courses)))))
concept2index = dict(zip(concepts, list(range(len(concepts)))))

cc_list = list()
for c in courses:
    cvec = np.zeros(len(concepts))
    if c in filtered_cc_rel_dict:
        indices = [concept2index[_] for _ in filtered_cc_rel_dict[c] if _ in concept2index]
        cvec[indices] = 1
    else:
        print("course - {} does not have concepts".format(c))
    cc_list.append(cvec)
    
cc_np = np.array(cc_list).T
cc_np.shape

---------------- Relation Statistics based on filtered courses -----------------
user-course relations 13696
course-video relations 42117
teacher-course relations 1875
34506
video-concept relations 295475
course-concept relations 150811
course - C_course-v1:RiceX+Phys102x+sp does not have concepts
course - C_course-v1:TsinghuaX+Thesis2018+sp does not have concepts
course - C_course-v1:UC_BerkeleyX+ColWri2_1x_2015_T1+2019_T1 does not have concepts
course - C_course-v1:HUBU+2017022703X+sp does not have concepts
course - C_course-v1:TsinghuaX+TsinghuaMandarin01+sp does not have concepts
course - C_course-v1:UC_BerkeleyX+CS169_2x+sp does not have concepts
course - C_course-v1:ZAFU+20171218+2019_T1 does not have concepts
course - C_course-v1:qhnu+20181212x+2019_T1 does not have concepts
course - C_course-v1:SDSNAssociation+C21001+sp does not have concepts
course - C_course-v1:XJTU+20171025001+2019_T1 does not have concepts
course - C_course-v1:NUDT+05028103+2018_T2 does not have concepts
co

(21037, 600)

In [33]:
# creat user-index, concept-index dict
users = filtered_df["id"].unique()
user2index = dict(zip(users, list(range(len(users)))))
concepts = distinct_concepts
concept2index = dict(zip(concepts, list(range(len(concepts)))))
courses = filtered_df["cid"].unique()
course2index = dict(zip(courses, list(range(len(courses)))))
teachers = flattend_t_list
teacher2index = dict(zip(teachers, list(range(len(teachers)))))
schools = filtered_df["schools"].unique()
school2index = dict(zip(schools, list(range(len(schools)))))
videos = filtered_df["vid"].unique()
video2index = dict(zip(videos, list(range(len(videos)))))


user_action = list()
rating_matrix = list()
adjacency_matrix = list()
user_course = list()
user_teacher = list()
user_school = list()
user_video = list()


def get_ua(udf, column, index_dict):
    """ 
    Get user-concept vector based on the given user dataframe 
    """
#     print(type(udf[column].values[0]))
    # below should be change to uceoncepts = dict(concept, clicked count)
    if isinstance(udf[column].values[0], list):
        uconcepts = list([c for sublist in udf[column].values for c in sublist if len(c)>0])
    else:
#         print(udf[column].values)
        uconcepts = list(c for c in udf[column].values if len(c)>0)
    uconcepts = Counter(uconcepts)
    uconcepts = dict(uconcepts)
#     print(len(uconcepts))
    
    uconcepts_indices = [index_dict[c] for c in uconcepts.keys()]
    uvec = np.zeros(len(index_dict))
#     uvec[uconcepts_indices] = 1
    np.put(a=uvec, ind=uconcepts_indices, v=list(uconcepts.values()))
    assert len(uconcepts_indices) == (uvec>0).sum()
    return uvec


def radom_negative_sample(user_action, item_size):
    """to get (user_size, 100, 2), 100th item is positive one"""
    negative_sample = []
    for u in user_action:
        sample = []
        i = 0
        while i < 99:
            t = random.randint(0, item_size-1)
            if t not in user_action[u]:
                sample.append([u, t])
                i += 1
        sample.append([u, user_action[u][-1]])
        negative_sample.append(sample)
    return np.array(negative_sample)


user_concept_dict = dict()
# i = 7
for i in range(len(users)):
    print(i, users[i])

    # user_action
    udf = filtered_df[filtered_df.id==users[i]]
    user_action.append(get_ua(udf, 'concepts', concept2index))
    
    # adjacency_matrix
    udf_train = udf[(udf["local_start_time"]>='2017-01-01 00:00:00') & (udf["local_start_time"]<='2019-11-01 00:00:00')]
    uvec_train = get_ua(udf_train, 'concepts', concept2index)
    adjacency_matrix.append(uvec_train)

    # rating_matrix
    udf_train.sort_values(["local_start_time"], inplace=True)
    con_list = udf_train["concepts"].values
    learning_time = udf_train["local_start_time"].values

    # create condict ordered by concept learned time
    # first learned time for a concept as its time
    con_dict = dict()
    for ind, clist in enumerate(con_list):
        time = learning_time[ind]
        for c in clist:
            if c not in con_dict and len(c)>0:
                con_dict[c] = time
    # replace the last item count as 0 as the paper described
#     print("before", (uvec_train>0).sum())
    uvec_train_ = uvec_train.copy()
    uvec_train_[concept2index[list(con_dict.keys())[-1]]] = 0
#     print("after", (uvec_train_>0).sum())
    rating_matrix.append(uvec_train_)

    user_concept_dict[i] = [concept2index[c] for c in list(con_dict.keys())]

    # user course
    user_course.append(get_ua(udf_train, 'cid', course2index))

    # user school
    user_school.append(get_ua(udf_train, 'schools', school2index))

    # user teacher
    user_teacher.append(get_ua(udf_train, 'teachers', teacher2index))
    
    # user video
    user_video.append(get_ua(udf_train, 'vid', video2index))
    
# binarize except those need to maintain numbers
adjacency_matrix = (np.array(adjacency_matrix)>0).astype(np.int8)
user_course = (np.array(user_course)>0).astype(np.int8)
user_teacher = (np.array(user_teacher)>0).astype(np.int8)
user_school = (np.array(user_school)>0).astype(np.int8)
user_video = (np.array(user_video)>0).astype(np.int8)

0 U_9044043
1 U_7078467
2 U_1049362
3 U_902
4 U_8520713
5 U_10748942
6 U_1087
7 U_1112
8 U_9700483
9 U_6816904
10 U_8258710


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


11 U_6685870
12 U_2491705
13 U_9700831
14 U_6817254
15 U_7210631
16 U_8128165
17 U_1443689
18 U_8521607
19 U_7997323
20 U_8390552
21 U_7079859
22 U_6162373
23 U_1999
24 U_9570262
25 U_7080106
26 U_6817993
27 U_526622
28 U_7997847
29 U_7211469
30 U_6949381
31 U_10488412
32 U_7080668
33 U_8260321
34 U_8260334
35 U_8522532
36 U_3076
37 U_8653873
38 U_10488889
39 U_7081019
40 U_7998662
41 U_8523071
42 U_10227059
43 U_9440652
44 U_658844
45 U_10358199
46 U_8261091
47 U_7736996
48 U_1576865
49 U_6950933
50 U_790570
51 U_6295703
52 U_7737646
53 U_8524090
54 U_7213560
55 U_9835064
56 U_7475945
57 U_8393517
58 U_7738206
59 U_9573295
60 U_7738346
61 U_9704489
62 U_7738457
63 U_2233537
64 U_8262868
65 U_9573649
66 U_10491387
67 U_9049609
68 U_6690329
69 U_923366
70 U_5889
71 U_9050022
72 U_9443260
73 U_2234406
74 U_2496576
75 U_9050292
76 U_8657087
77 U_8001814
78 U_1972521
79 U_530846
80 U_6691233
81 U_10492510
82 U_7084743
83 U_10230484
84 U_8002279
85 U_7478331
86 U_2628704
87 U_6691940
88 U_9

602 U_7775166
603 U_10527848
604 U_2925783
605 U_10659101
606 U_6858023
607 U_9479624
608 U_6465013
609 U_42572
610 U_8693356
611 U_6596249
612 U_8431369
613 U_3188531
614 U_3188589
615 U_42873
616 U_11970467
617 U_9349249
618 U_9873612
619 U_9480410
620 U_2926850
621 U_174354
622 U_2664762
623 U_7121326
624 U_10529324
625 U_8170036
626 U_9743116
627 U_8694545
628 U_10529590
629 U_8563608
630 U_8039648
631 U_6597899
632 U_3190078
633 U_10923386
634 U_9743796
635 U_8564164
636 U_9612820
637 U_8171130
638 U_4632368
639 U_10661683
640 U_9482194
641 U_9744338
642 U_9351154
643 U_8040502
644 U_2142347
645 U_10662117
646 U_3191111
647 U_8958324
648 U_9220499
649 U_45524
650 U_10531300
651 U_176670
652 U_9613872
653 U_8172111
654 U_8172176
655 U_176791
656 U_9614276
657 U_9483456
658 U_9745828
659 U_701874
660 U_8435188
661 U_46614
662 U_9483871
663 U_177794
664 U_2668169
665 U_10532597
666 U_8435535
667 U_440145
668 U_2930527
669 U_8173485
670 U_10663900
671 U_3061748
672 U_6731860
673 U_712

1180 U_7157037
1181 U_2831733
1182 U_7026042
1183 U_8730169
1184 U_7026263
1185 U_7419488
1186 U_5191407
1187 U_8730445
1188 U_7026528
1189 U_11745166
1190 U_7419926
1191 U_7420043
1192 U_7157955
1193 U_7420125
1194 U_8730963
1195 U_11483488
1196 U_10041727
1197 U_11614697
1198 U_7944768
1199 U_7420502
1200 U_9517656
1201 U_8731279
1202 U_8731303
1203 U_5323476
1204 U_8207103
1205 U_10435341
1206 U_10042137
1207 U_7027496
1208 U_11615050
1209 U_7420893
1210 U_8731643
1211 U_11353130
1212 U_6241394
1213 U_81026
1214 U_7945384
1215 U_7945447
1216 U_343395
1217 U_6634872
1218 U_7421317
1219 U_9518586
1220 U_8207910
1221 U_6766122
1222 U_9911868
1223 U_9518687
1224 U_6897536
1225 U_4276351
1226 U_10436930
1227 U_2965887
1228 U_9257354
1229 U_8208845
1230 U_10306066
1231 U_9257540
1232 U_7029366
1233 U_8995447
1234 U_7029413
1235 U_8733453
1236 U_8078172
1237 U_11486064
1238 U_10568584
1239 U_7029753
1240 U_8733708
1241 U_214149
1242 U_214163
1243 U_11486382
1244 U_8078601
1245 U_738693
124

1725 U_637205
1726 U_2734449
1727 U_8501627
1728 U_7846343
1729 U_2734556
1730 U_8370667
1731 U_10467877
1732 U_8370766
1733 U_3127897
1734 U_10730147
1735 U_7584448
1736 U_7322355
1737 U_768777
1738 U_5356330
1739 U_244579
1740 U_10992517
1741 U_3259297
1742 U_506786
1743 U_6929429
1744 U_7584823
1745 U_9288795
1746 U_7978106
1747 U_7060618
1748 U_4570354
1749 U_11517170
1750 U_10599685
1751 U_4701537
1752 U_376197
1753 U_9551247
1754 U_245268
1755 U_9551395
1756 U_6667886
1757 U_10075786
1758 U_8634002
1759 U_8634060
1760 U_8109816
1761 U_245543
1762 U_6668079
1763 U_8372040
1764 U_9551714
1765 U_7978851
1766 U_10338201
1767 U_9420719
1768 U_7585723
1769 U_10862535
1770 U_9682914
1771 U_10600422
1772 U_9683000
1773 U_2736203
1774 U_6930590
1775 U_5882159
1776 U_4440406
1777 U_4571498
1778 U_4440505
1779 U_9683413
1780 U_4440677
1781 U_4440694
1782 U_6931185
1783 U_4440859
1784 U_4440886
1785 U_9552732
1786 U_4440941
1787 U_10601345
1788 U_7193607
1789 U_3130584
1790 U_9422051
1791 U_

In [17]:
# construct negatives for the last item in each user's training set
negatives = radom_negative_sample(user_concept_dict, len(concept2index))
negatives.shape

(2005, 100, 2)

In [18]:
(np.array(rating_matrix)>0).sum(), (np.array(adjacency_matrix)>0).sum()

(856067, 858072)

In [19]:
(rating_matrix[0]>0).sum(), adjacency_matrix[0].sum()

(386, 387)

In [None]:
# check shape
print("user_action", np.array(user_action).shape)
print("rating_matrix", np.array(rating_matrix).shape)
print("adjacency_matrix", np.array(adjacency_matrix).shape)
print("UC", np.array(user_course).shape)
print("UCT", np.array(user_teacher).shape)
print("US", np.array(user_school).shape)
print("UV", np.array(user_video).shape)
print("negatives", negatives.shape)

In [37]:
with open('data-for-kgcrec/KC.p', 'wb') as f:
    pickle.dump(np.asmatrix(cc_np), f)

In [None]:
# map the files from paper repository
with open('data-for-kgcrec/user_action.p', 'wb') as f:
    pickle.dump(np.asmatrix(user_action), f)
with open('data-for-kgcrec/rate_matrix.p', 'wb') as f:
    pickle.dump(np.asmatrix(rating_matrix), f)
with open('data-for-kgcrec/adjacency_matrix.p', 'wb') as f:
    pickle.dump(np.asmatrix(adjacency_matrix), f)
with open('data-for-kgcrec/UC.p', 'wb') as f:
    pickle.dump(np.asmatrix(user_course), f)
with open('data-for-kgcrec/UCT.p', 'wb') as f:
    pickle.dump(np.asmatrix(user_teacher), f)
with open('data-for-kgcrec/US.p', 'wb') as f:
    pickle.dump(np.asmatrix(user_school), f)
with open('data-for-kgcrec/UV.p', 'wb') as f:
    pickle.dump(np.asmatrix(user_video), f)
with open('data-for-kgcrec/negative.p', 'wb') as f:
    pickle.dump(negatives, f)

In [None]:
# load embeddings from fasttext (not used at the moment)
model = fasttext.load_model("data-for-kgcrec/cc.zh.100.bin")
con_vectors = list()
for c in distinct_concepts:
    con_vectors.append(model.get_word_vector(c))

with open('data-for-kgcrec/concept_embedding.p', 'wb') as f:
    pickle.dump(np.array(con_vectors), f)

In [None]:
# training using concept descriptions
with open("entities/concept.json", "r", encoding="utf8") as f:
    lines = f.readlines()
    
docs = list()
for l in lines:
    json_str = json.loads(l)
    docs.append(json_str["name"])
    if 'explanation' in json_str:
        docs.append(json_str["explanation"])

In [None]:
len(docs)

In [None]:
import jieba
import gensim

z = [list(jieba.cut(i)) for i in docs]
model = gensim.models.FastText(z, 
                               sg=0, # CBOW
                               min_n=5,
                               max_n=10,
                               size=100, 
                               window=5,
                               negative=10,
                               min_count=1)

In [None]:
# print(len(model.wv.vocab.keys()), model.wv.vectors.shape)
# with open("vocab", "w", encoding="utf8") as f:
#     for w in model.wv.vocab.keys():
#         print(w)
#         f.write(w+"\n")

import csv
with open("vocab", 'w', encoding="utf8") as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, delimiter="\n")
    lines = [l.replace("\n"," ") for l in list(model.wv.vocab.keys())]
    wr.writerow(lines)
np.savetxt("emb.tsv", model.wv.vectors, delimiter="\t")

In [None]:
len(list(model.wv.vocab.keys()))