- Import Packages

In [2]:
import json
import jieba
import pandas
import operator
import csv

- Load Data

In [3]:
# Load the class -> {keyword: weight} table and the two event datasets.
# Each file is opened in a `with` block so its handle is closed as soon as
# the JSON has been parsed, instead of being left open until garbage
# collection.
with open('keyword_tfidf_dict.json', 'r') as keyword_file:
    keyword_tfidf_dict = json.load(keyword_file)
with open('accupass_events.json', 'r') as training_file:
    training_data = json.load(training_file)
with open('fb_events.json', 'r') as testing_file:
    testing_data = json.load(testing_file)

- Predict Accupass Events

In [3]:
# Label every training event with the class whose keyword weights add up to
# the highest total over the event's tokens (`content_cut`).
for event in training_data:
    tokens = event['content_cut']
    class_scores = [
        (label, sum(weights[token] for token in tokens if token in weights))
        for label, weights in keyword_tfidf_dict.items()
    ]
    # max() returns the first maximal pair, so ties go to the class that
    # comes first when iterating keyword_tfidf_dict.
    event['predicted_class'] = max(class_scores, key=lambda pair: pair[1])[0]

- Predict Facebook Events

In [4]:
# Label every testing event with its best-scoring class and keep that score,
# then persist the annotated events as JSON.
for data in testing_data:
    tokens = data['content_cut']
    # One (class_name, score) pair per class; the score is the sum of the
    # class's keyword weights over the tokens that appear in its table.
    class_scores = [
        (class_name, sum(keyword_tfidf[token] for token in tokens if token in keyword_tfidf))
        for class_name, keyword_tfidf in keyword_tfidf_dict.items()
    ]
    # Pick the winner once (first maximal pair wins ties) and reuse it for
    # both fields instead of calling max() twice.
    best_class, best_score = max(class_scores, key=lambda pair: pair[1])
    data['predicted_class'] = best_class
    data['predicted_score'] = best_score

# Close the output file deterministically so the JSON is fully flushed to
# disk before any later cell reads it back.
with open('facebook_events_classification_result.json', 'w') as result_file:
    json.dump(testing_data, result_file)

- Training Error

In [5]:
# Training error: the fraction of training events whose predicted class
# differs from their labelled class.
misclassified = sum(
    1 for event in training_data if event['predicted_class'] != event['class']
)
print(misclassified/len(training_data))

0.5110716137154706


- Training Confusion Matrix

In [6]:
# Confusion matrix on the training set.
# Rows (index) are predicted classes, columns are actual classes.
class_names = list(keyword_tfidf_dict)

# Size the matrix from the class list instead of a hard-coded 13, and look
# positions up in a dict rather than calling list.index() twice per event.
class_position = {name: position for position, name in enumerate(class_names)}
class_count = len(class_names)
counts = [[0] * class_count for _ in range(class_count)]

for data in training_data:
    predicted_row = class_position[data['predicted_class']]
    actual_column = class_position[data['class']]
    counts[predicted_row][actual_column] += 1

training_data_fusion_matrix = pandas.DataFrame(
    counts,
    index=class_names,    # predicted_class
    columns=class_names,  # actual_class
)
training_data_fusion_matrix

Unnamed: 0,fashion,entertainment,business,charity,health,arts,technology,education,sports,photography,food,travel,other
fashion,287,78,63,18,14,170,69,674,26,10,24,45,175
entertainment,1,1141,42,20,5,287,128,651,46,1,9,42,88
business,2,72,2667,91,19,326,219,3703,15,4,18,81,236
charity,1,47,84,777,15,181,64,921,18,3,11,26,124
health,2,50,22,56,685,176,26,702,46,2,17,17,52
arts,1,139,43,23,2,3523,32,1039,8,2,4,27,60
technology,1,63,335,18,0,133,1328,2117,29,0,2,16,151
education,0,18,194,13,3,69,21,5076,4,0,0,11,54
sports,5,113,10,17,75,82,18,493,705,3,10,19,27
photography,4,49,36,12,4,277,58,562,21,447,10,44,43


- Training F1-Score

In [7]:
# Per-class precision, recall and F-score computed from
# training_data_fusion_matrix (rows = predicted class, columns = actual
# class), followed by the unweighted mean F-score over all classes.
class_names = list(keyword_tfidf_dict)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


precisions, recalls, f_scores = [], [], []
for name in class_names:
    # `.loc` replaces the removed `.ix` indexer; all lookups here are by label.
    true_positive = training_data_fusion_matrix.loc[name, name]
    predicted_total = training_data_fusion_matrix.loc[name].sum()  # row sum
    actual_total = training_data_fusion_matrix[name].sum()         # column sum

    precision = _safe_ratio(true_positive, predicted_total)
    recall = _safe_ratio(true_positive, actual_total)
    precisions.append(precision)
    recalls.append(recall)
    # A class that is never predicted correctly gets an F-score of 0.0
    # instead of a division-by-zero NaN.
    f_scores.append(_safe_ratio(2 * precision * recall, precision + recall))

training_data_f1_score = pandas.DataFrame(
    [precisions, recalls, f_scores],
    columns=class_names,
    index=['precision', 'recall', 'f-score'],
)

print(sum(training_data_f1_score.loc['f-score'])/len(training_data_f1_score.loc['f-score']))

0.483158813969


- Select top-20 keywords for each class

In [10]:
# Reload the keyword weights and print the highest-weighted keywords of each
# class as "class,keyword,weight" lines.
TOP_KEYWORD_COUNT = 20

with open('keyword_tfidf_dict.json', 'r') as keyword_file:
    keyword_tfidf_dict = json.load(keyword_file)

# NOTE: as before, keyword_tfidf_dict is rebound here: each class now maps to
# a list of (keyword, weight) pairs sorted by descending weight, not to a
# {keyword: weight} dict. The prediction cells expect the dict form, so the
# JSON must be reloaded before they are run again.
for class_name in keyword_tfidf_dict:
    keyword_tfidf_dict[class_name] = sorted(
        keyword_tfidf_dict[class_name].items(),
        key=operator.itemgetter(1),
        reverse=True,
    )

for class_name, keyword_tfidf_pairs in keyword_tfidf_dict.items():
    # Slicing replaces the manual counter/break and also copes with classes
    # that have fewer than TOP_KEYWORD_COUNT keywords.
    for keyword, tfidf in keyword_tfidf_pairs[:TOP_KEYWORD_COUNT]:
        print(class_name+','+keyword+','+format(tfidf,'f'))

health,側彎,0.000969
health,脊椎,0.000380
health,幹細胞,0.000298
health,甩手,0.000294
health,掌紋,0.000225
health,嘜,0.000203
health,清道夫,0.000168
health,體位法,0.000148
health,觸身,0.000128
health,印可,0.000128
health,易筋經,0.000124
health,收款單,0.000118
health,矯正,0.000117
health,門診,0.000108
health,骨盆,0.000096
health,氣血,0.000091
health,痠痛,0.000089
health,物理治療師,0.000087
health,痰,0.000086
health,照顧,0.000085
education,禧,0.000119
education,陳光,0.000104
education,貼布,0.000085
education,課卷,0.000083
education,心智圖,0.000080
education,紮,0.000070
education,心算,0.000048
education,光明頂,0.000048
education,轉職,0.000040
education,談判,0.000040
education,聖杯,0.000039
education,空勤,0.000038
education,付清了,0.000037
education,技術分析,0.000037
education,中華工商,0.000036
education,操盤,0.000035
education,外匯,0.000035
education,勝率,0.000032
education,流量,0.000032
education,加薪,0.000031
fashion,香水,0.001032
fashion,試穿,0.000382
fashion,時裝展,0.000372
fashion,過肩,0.000366
fashion,穿起來,0.000360
fashion,模子,0.000344
fashion,穿到,0.000336
fashion,上流人士,0.000301
fashi

- Sort and select results by predicted score for each class

In [9]:
# Group the classified Facebook events by predicted class, rank each group by
# descending predicted score, and write one CSV row per class:
#   class_name,id_1,id_2,...   (at most TOP_EVENT_COUNT ids)
TOP_EVENT_COUNT = 100

with open('facebook_events_classification_result.json', 'r') as result_file:
    results = json.load(result_file)

classified_results = {}
for result in results:
    classified_results.setdefault(result['predicted_class'], []).append(result)

for events in classified_results.values():
    events.sort(key=lambda x: x['predicted_score'], reverse=True)

# `with` guarantees the CSV is flushed and closed even if a row fails.
with open("top_100_events_by_class.csv", "w") as f:
    for class_name, events in classified_results.items():
        top_ids = [event['id'] for event in events[:TOP_EVENT_COUNT]]
        print(','.join([class_name] + top_ids), file=f)

- View result

In [10]:
# Show the names of the first PREVIEW_COUNT ranked events of every class.
PREVIEW_COUNT = 10

with open("fb_original_data.txt", "r") as original_file:
    events = json.load(original_file)

# Index the original events by id once, instead of rescanning the whole list
# for every ranked id. Lists keep every event sharing an id, in file order,
# which matches what the nested scan produced.
events_by_id = {}
for event in events:
    events_by_id.setdefault(event['id'], []).append(event)

result = {}
with open("top_100_events_by_class.csv", "r") as ranking_file:
    for line in ranking_file:
        class_events = line.strip('\n').split(',')
        names = []
        # Slicing (rather than indexing 1..10) avoids an IndexError for
        # classes that have fewer than PREVIEW_COUNT ranked events.
        for event_id in class_events[1:1 + PREVIEW_COUNT]:
            for event in events_by_id.get(event_id, []):
                names.append(event['name'])
        result[class_events[0]] = names

for class_name, names in result.items():
    print(class_name)
    for name in names:
        print(name)

fashion
3/31前【L’AROMA滿額3000元，贈送生命數字基礎解析】
信義誠品國際香水聯合特賣
品香氛.玩調香
品香氛.玩調香
Satin美睫美學館-中壢店 4月各項優惠獨特方案來囉
Satin美睫美學館 4月各項優惠獨特方案來囉
LFP:單一核心配方香水創作活動
時尚巴黎【讓你愛上山茶花】
❤️Satin美睫美學館3月超質優惠來囉❤️ 慶祝林口店、中壢店週年慶
★拍照上傳穿搭照 分享你的必勝LOOK!
entertainment
神奇的蛻變遊戲同樂會
貓腳印 x TRPG貓貓大冒險 x 雙周團
貓腳印 x TRPG貓貓大冒險 x 雙周團
貓腳印 x TRPG貓貓大冒險 x 雙周團
貓腳印 x TRPG貓貓大冒險 x 雙周團
新溫羅汀實境遊戲－城市邊陲的遁逃者
2016 麗星郵輪夏日趣 ( &兔奴 )
好神奇的桌遊~【蛻變遊戲】試玩/同樂會
2016/05/24【心靈魔法遊戲】曾宇君 溝通師
沙漠風暴雷射射擊
business
【藝文活動】相聲瓦舍-賣橘子的
賣橘子的
賣橘子的
你的我的螢火蟲日記
台灣兒童巴西戰舞Capoeira國際課程與晉級活動
【河川修復—全紀錄分享會】—｜日式宿舍的建築ID｜
心築愛樂合唱團-音悅有愛慈善音樂會
布條訂購4/1-4/30
跨境電商交易風險─金流/物流/稅務精算獲利／萬幼筠
跨境電商在美國─如何在Amazon上賣爆紅商品／Anfernee
charity
「2016 星星相惜讓愛走動」
Holiday ya二手市集 愛心義賣jumble sale跳蚤市場x古著x身心靈占卜x Live演唱 x 手作甜點..一起幫助流浪兔&街友!20160814暑假再會場Charity Flea Market
Holiday ya二手市集 愛心義賣jumble sale跳蚤市場 x 古著vintage x占卜x Live演唱 x 手作甜點..一起幫助流浪兔&街友!20160724暑假首場Charity Flea Market
Holiday ya二手市集 愛心義賣jumble sale跳蚤市場 x 毛孩溝通 x占卜 x Live演唱 x 手作甜點..一起幫助流浪兔&街友!20160409上半年最後一場Charity Flea Market
T12睦育春嬉遊
巨量天燈垃圾！平溪淨山活動_4/3（日）9am
中華長跑協會志工【特殊訓練】Pa

In [11]:
# Reload the classification results; `with` closes the file after parsing.
with open('facebook_events_classification_result.json', 'r') as result_file:
    results = json.load(result_file)

In [12]:
# Reload the original Facebook events; `with` closes the file after parsing.
with open("fb_original_data.txt", "r") as original_file:
    events = json.load(original_file)

In [13]:
# Number of events loaded from fb_original_data.txt.
len(events)

12786

In [14]:
# Number of entries loaded from facebook_events_classification_result.json.
len(results)

12786

In [15]:
# Copy the predicted class and score from `results` onto the original event
# with the same id, and print the id of every event that has no result.

# Index the results by id once so each event is matched with a dict lookup
# instead of a scan over all results. setdefault keeps the first result
# seen for an id, which is the one the nested loop stopped on.
results_by_id = {}
for result in results:
    results_by_id.setdefault(result['id'], result)

for event in events:
    result = results_by_id.get(event['id'])
    if result is None:
        print(event['id'])
        continue
    event['predicted_class'] = result['predicted_class']
    event['predicted_score'] = result['predicted_score']

In [16]:
# Save the annotated events; `with` flushes and closes the output file.
with open('classified_facebook_events.json', 'w') as output_file:
    json.dump(events, output_file)

In [21]:
# Drop the token list from every testing event.
# pop() with a default keeps this cell safe to re-run: a second pass finds
# the key already gone and does nothing, where `del` would raise KeyError.
for data in testing_data:
    data.pop('content_cut', None)

In [28]:
# Export one "id,predicted_class,predicted_score" line per testing event.
# The file used to be left open, so buffered rows were not guaranteed to
# reach disk; `with` flushes and closes it.
with open('facebook_events_classification_result.csv','w') as f:
    for data in testing_data:
        print(data['id']+','+data['predicted_class']+','+format(data['predicted_score'],'f'), file=f)

In [3]:
# Load the original Facebook events; `with` closes the file after parsing.
with open('fb_original_data.txt', 'r') as original_file:
    f2 = json.load(original_file)

In [14]:
# Inspect the description text of the first loaded event.
f2[0]['description']

'彰商校慶肆壹六\n\n蓊鬱的校園理\n藏著一個可以High翻屋頂的班級\n它們\n隱藏著\n笑聲可以從明德樓傳去後操場\n笑點比死海還低\n因笑而爆肺 每天要換肺數十顆\n笑梗比辭海還多\n智商比聖母峰還高\n的高人\n這個班級\n有堪比劉以豪、陶敏敏的男女神(經病)\n打LOL跟統神相著稱的陳嘉航\n射速三秒的孔子後代\n吼聲可以從基隆傳到鵝鑾鼻的大聲公\n倉鼠 土撥鼠 馬兒 章魚 金魚什麼動物通通有\n\n世上最令人意想不到的菜單\n鮮香粉嫩的波多奶\n用歌名詮釋的神秘飲品\n野豬騎士載著仲基進入您的胃\n資一1\n將使用奇特的魔力\n去征服您的味蕾\n\n彰商\n資一1\n歡 迎 你！'

In [4]:
# For each class row of the keyword CSV (class name, then its keywords),
# print the class name and, for its first KEYWORDS_PER_CLASS keywords, the
# number of events in f2 whose description contains the keyword
# ("keyword:count").
KEYWORDS_PER_CLASS = 20

with open('top_100_keywords_for_each_class.csv', 'r') as f:
    for line in f:
        keywords = line.strip('\n').split(',')
        print(keywords[0])
        # Slicing (rather than indexing 1..20) avoids an IndexError on rows
        # that list fewer than KEYWORDS_PER_CLASS keywords.
        for keyword in keywords[1:1 + KEYWORDS_PER_CLASS]:
            count = sum(1 for event in f2 if keyword in event['description'])
            print(keyword + ':' + str(count))

health
側彎 日本瑜伽大師峯岸道子
側彎 透過參加日本瑜珈大師
側彎 7/21 (四) 【
側彎 4/4-8 9:00
側彎 跟父母的關係會深深的
側彎 六堂課程將有系統地依
脊椎 4/11-15 五日
脊椎 ░【彩虹祝福按摩培訓
脊椎 在本次研討會，您將學
脊椎 在本次研討會，您將學
脊椎 本次研討會，將學習如
脊椎 6/4-5 週末精華
脊椎 ★馬拉松專用 防水泡
脊椎 【講座緣由】
身體其
脊椎 課程簡介:
許多人在
脊椎 【7flow 特聘專
脊椎 每個人內在都有一個純
脊椎 用愛的荷爾蒙迎接新生
脊椎 ■課程班次時間（限８
脊椎 ■課程班次時間
5/
脊椎 骨架子頸椎健康枕是由
脊椎 Balanced B
脊椎 透過參加日本瑜珈大師
脊椎 你可曾好好的關愛自己
脊椎 健康永遠是任何生命有
脊椎 空中流動瑜伽師資培訓
脊椎 脊椎與內臟有緊密的關
脊椎 頌缽靜心孕婦瑜珈團課
脊椎 媽咪寶貝親密Pila
脊椎 寶寶按摩與觸動瑜珈

脊椎 一位孕婦媽咪對我說：
脊椎 7/21 (四) 【
脊椎 費登奎斯方法的動中覺
脊椎 身體養生個案：Reb
脊椎 Ranra的台南《德
脊椎 Ranra的台中班《
脊椎 頭薦骨平衡個案
頭薦
脊椎 http://goo
脊椎 4/4-8 9:00
脊椎 跟父母的關係會深深的
脊椎 PhysioYoga
脊椎 家中老公對您使用精油
脊椎 妳家老公對您使用精油
脊椎 六堂課程將有系統地依
脊椎 課程名稱: ROCK
脊椎 2016年高雄最專業
脊椎 《你的家，就在你心裡
脊椎 【課程資訊】

§ 
脊椎 【7flow 特聘專
幹細胞 【天地人跨產業沙龍 
幹細胞 ❖活動影片 清華大學
甩手 活動了解更多 htt
甩手 無聊瘋運動，【慢跑】
嘜 「威~力嘜去叨位」新
體位法 健康產業業者曾統計，
體位法 4/11-15 五日
體位法 YOGA + MED
體位法 「希瓦難陀瑜伽(Si
體位法 香氣記憶 ─ 尋找記
體位法 「人為何而生，死歸何
體位法 日本瑜伽大師峯岸道子
體位法 在本次研討會，您將學
體位法 在本次研討會，您將學
體位法 本次研討會，將學習如
體位法 反轉視野 – 倒立的
體位法 6/4-5 週末精華
體位法 西藏回春瑜珈只有五個
體位法 在此次全方位練習當中
體位法 ＊課程時間：

每一
體位法 流動的饗宴~傾