- Import Packages

In [11]:
import json
import jieba
import pandas
import random
import numpy

- Load Data

In [12]:
def _load_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is opened in a `with` block so its handle is closed as soon as
    parsing finishes, rather than being left open until garbage collection.
    """
    with open(path, 'r') as json_file:
        return json.load(json_file)


# Precomputed term statistics and the train/test samples, all read from JSON
# files in the current working directory.
tf_dict = _load_json('tf_dict.json')
idf_dict = _load_json('idf_dict.json')
tfidf_dict = _load_json('tfidf_dict.json')
train_data = _load_json('train_data.json')
test_data = _load_json('test_data.json')

- Ratio Model

In [13]:
# Tally how many training samples carry each 'class' label.
event_class_count_dict = {}
for sample in train_data:
    label = sample['class']
    event_class_count_dict[label] = event_class_count_dict.get(label, 0) + 1

# Convert each tally into that class's share of the whole training set.
event_class_ratio_dict = {
    label: tally/len(train_data)
    for label, tally in event_class_count_dict.items()
}

In [14]:
# Display the class -> ratio mapping as this cell's output.
event_class_ratio_dict

{'arts': 0.1915369038474967,
 'business': 0.1992877592346903,
 'charity': 0.0568396061727533,
 'education': 0.0026534459884086305,
 'entertainment': 0.035262900635430486,
 'fashion': 0.01682843376859158,
 'food': 0.06305425598771036,
 'health': 0.04489909922491446,
 'other': 0.10236715313176455,
 'photography': 0.023043083583548635,
 'sports': 0.049088750785559666,
 'technology': 0.11731024369806578,
 'travel': 0.09782836394106557}

- Predict Training Data

In [15]:
# Baseline "ratio model" on the training set: every sample gets a class drawn
# at random, each class being picked with the probability stored for it in
# event_class_ratio_dict.
#
# The label list and the probability vector are built once from the data
# instead of being written out for a fixed 13 classes inside the loop, so the
# cell keeps working when the number of classes changes.
# NOTE: no random seed is set in this cell, so predictions differ between runs.
event_class_list = list(tfidf_dict)  # candidate labels, in tfidf_dict key order
event_class_probabilities = [event_class_ratio_dict[event_class]
                             for event_class in event_class_list]

# One draw per sample; each drawn value is an index into event_class_list.
guesses = numpy.random.choice(len(event_class_list), len(train_data),
                              p=event_class_probabilities)
for data, guess in zip(train_data, guesses):
    data['predicted_class'] = event_class_list[guess]

- Predict Testing Data

In [16]:
# Baseline "ratio model" on the testing set: every sample gets a class drawn
# at random, each class being picked with the probability stored for it in
# event_class_ratio_dict.
#
# The label list and the probability vector are built once from the data
# instead of being written out for a fixed 13 classes inside the loop, so the
# cell keeps working when the number of classes changes.
# NOTE: no random seed is set in this cell, so predictions differ between runs.
event_class_list = list(tfidf_dict)  # candidate labels, in tfidf_dict key order
event_class_probabilities = [event_class_ratio_dict[event_class]
                             for event_class in event_class_list]

# One draw per sample; each drawn value is an index into event_class_list.
guesses = numpy.random.choice(len(event_class_list), len(test_data),
                              p=event_class_probabilities)
for data, guess in zip(test_data, guesses):
    data['predicted_class'] = event_class_list[guess]

- Training Error

In [17]:
# Training error: fraction of training samples whose predicted class differs
# from their actual class.
count = sum(1 for sample in train_data
            if sample['predicted_class'] != sample['class'])
print(count/len(train_data))
    

0.8710983869841491


- Testing Error

In [18]:
# Testing error: fraction of testing samples whose predicted class differs
# from their actual class.
count = sum(1 for sample in test_data
            if sample['predicted_class'] != sample['class'])
print(count/len(test_data))
    

0.8815973191845853


- Training Confusion Matrix

In [19]:
# Confusion matrix for the training set: rows are predicted classes, columns
# are actual classes, and each cell counts the samples with that combination.
event_class_list = list(tfidf_dict)  # labels, in tfidf_dict key order

# Map each label to its row/column position once, so the loop below does a
# dict lookup per sample instead of a linear list.index() scan.
event_class_position = {event_class: position
                        for position, event_class in enumerate(event_class_list)}

# Size the matrix from the label list rather than a fixed 13.
class_total = len(event_class_list)
confusion_counts = [[0] * class_total for _ in range(class_total)]
for data in train_data:
    predicted_row = event_class_position[data['predicted_class']]
    actual_column = event_class_position[data['class']]
    confusion_counts[predicted_row][actual_column] += 1

train_data_fusion_matrix = pandas.DataFrame(
    confusion_counts,
    index=event_class_list,    # predicted_class
    columns=event_class_list,  # actual_class
)
train_data_fusion_matrix

Unnamed: 0,food,travel,photography,business,entertainment,fashion,arts,education,other,sports,technology,charity,health
food,53,104,22,186,26,22,163,2,119,38,113,55,42
travel,92,134,31,270,40,18,251,3,119,70,166,74,63
photography,23,36,11,87,9,4,74,0,28,18,46,12,10
business,178,249,60,565,112,45,544,9,297,137,321,157,131
entertainment,35,53,11,99,23,10,94,2,37,23,53,22,25
fashion,10,27,6,45,14,2,45,1,22,10,24,10,14
arts,180,282,66,535,93,46,547,5,271,140,328,174,105
education,2,4,2,10,2,1,2,0,4,3,5,3,2
other,91,157,33,276,43,29,269,3,175,77,159,95,71
sports,42,68,12,149,28,11,127,4,77,39,81,29,32


- Testing Confusion Matrix

In [20]:
# Confusion matrix for the testing set: rows are predicted classes, columns
# are actual classes, and each cell counts the samples with that combination.
event_class_list = list(tfidf_dict)  # labels, in tfidf_dict key order

# Map each label to its row/column position once, so the loop below does a
# dict lookup per sample instead of a linear list.index() scan.
event_class_position = {event_class: position
                        for position, event_class in enumerate(event_class_list)}

# Size the matrix from the label list rather than a fixed 13.
class_total = len(event_class_list)
confusion_counts = [[0] * class_total for _ in range(class_total)]
for data in test_data:
    predicted_row = event_class_position[data['predicted_class']]
    actual_column = event_class_position[data['class']]
    confusion_counts[predicted_row][actual_column] += 1

test_data_fusion_matrix = pandas.DataFrame(
    confusion_counts,
    index=event_class_list,    # predicted_class
    columns=event_class_list,  # actual_class
)
test_data_fusion_matrix

Unnamed: 0,food,travel,photography,business,entertainment,fashion,arts,education,other,sports,technology,charity,health
food,14,22,8,38,10,4,38,0,29,10,18,11,9
travel,33,25,12,74,11,4,72,1,36,13,37,18,18
photography,7,8,0,11,2,1,17,0,10,3,4,4,4
business,67,74,23,135,22,10,149,0,84,40,76,39,35
entertainment,3,10,3,21,1,1,21,0,13,8,16,5,5
fashion,3,4,3,10,3,2,8,0,8,4,10,5,1
arts,34,79,23,128,21,8,126,0,83,34,86,43,28
education,0,0,0,2,0,0,4,0,1,1,0,1,0
other,24,27,13,79,14,4,66,1,45,17,39,25,16
sports,13,23,4,47,9,4,37,1,13,10,15,9,3
