In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the training articles and turn them into bag-of-words count features
df_train = pd.read_csv('training.csv')
text_data_train = np.array(df_train['article_words'])

# The vectorizer is fitted here and reused later so every split shares one vocabulary
counter = CountVectorizer()
words_statistics_train = counter.fit_transform(text_data_train)

# Integer id assigned to each topic label
topic_index = {
    'IRRELEVANT': 0,
    'ARTS CULTURE ENTERTAINMENT': 1,
    'BIOGRAPHIES PERSONALITIES PEOPLE': 2,
    'DEFENCE': 3,
    'DOMESTIC MARKETS': 4,
    'FOREX MARKETS': 5,
    'HEALTH': 6,
    'MONEY MARKETS': 7,
    'SCIENCE AND TECHNOLOGY': 8,
    'SHARE LISTINGS': 9,
    'SPORTS': 10,
}

# Encode every row's topic as its integer id (an unknown topic raises KeyError)
Y_train = [topic_index[topic] for topic in df_train['topic']]

# Rows before the split point are used for fitting, the remainder for development
dev_start = 9001
X_train_train, X_train_dev = words_statistics_train[:dev_start], words_statistics_train[dev_start:]
Y_train_train, Y_train_dev = Y_train[:dev_start], Y_train[dev_start:]


In [3]:
# Load the test articles and encode them with the vocabulary fitted on the training data
df_test = pd.read_csv('test.csv')
text_data_test = np.array(df_test['article_words'])
ids = np.array(df_test['article_number'])

# transform (not fit_transform): reuse the already-fitted vectorizer
words_statistics_test = counter.transform(text_data_test)

# Encode every row's topic as its integer id (an unknown topic raises KeyError)
Y_test = [topic_index[topic] for topic in df_test['topic']]

# Full-range slice of the test feature matrix
X_test = words_statistics_test[:]


In [4]:
# RandomForestClassifier: fit on the training split, then report accuracy on
# the training, development and test splits.
print('RandomForestClassifier')

# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the accuracies below are the same after Restart & Run All.
classifier = RandomForestClassifier(random_state=42)
model = classifier.fit(X_train_train, Y_train_train)

# Accuracy on the rows the model was fitted on
predicted_train = model.predict(X_train_train)
print('Accuracy score of training data:', accuracy_score(Y_train_train, predicted_train))

# Accuracy on the held-out development rows
predicted_dev = model.predict(X_train_dev)
print('Accuracy score of development data:', accuracy_score(Y_train_dev, predicted_dev))

# Accuracy on the separate test file
result = model.predict(X_test)
print('Accuracy score of test data:', accuracy_score(Y_test, result))



RandomForestClassifier




Accuracy score of training data: 0.9786690367736918
Accuracy score of development data: 0.7334669338677354
Accuracy score of test data: 0.716


In [5]:
# BernoulliNB: fit on the training split, then score each split in turn
print('BernoulliNB')
classifier = BernoulliNB()
model = classifier.fit(X_train_train, Y_train_train)

# Training split
predicted_train = model.predict(X_train_train)
train_accuracy = accuracy_score(Y_train_train, predicted_train)
print('Accuracy score of training data:', train_accuracy)

# Development split
predicted_dev = model.predict(X_train_dev)
dev_accuracy = accuracy_score(Y_train_dev, predicted_dev)
print('Accuracy score of development data:', dev_accuracy)

# Test split
result = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, result)
print('Accuracy score of test data:', test_accuracy)



BernoulliNB
Accuracy score of training data: 0.732140873236307
Accuracy score of development data: 0.7194388777555111
Accuracy score of test data: 0.674


In [6]:
# MultinomialNB: fit on the training split, then score each split in turn
print('MultinomialNB')
classifier = MultinomialNB()
model = classifier.fit(X_train_train, Y_train_train)

# Training split
predicted_train = model.predict(X_train_train)
train_accuracy = accuracy_score(Y_train_train, predicted_train)
print('Accuracy score of training data:', train_accuracy)

# Development split
predicted_dev = model.predict(X_train_dev)
dev_accuracy = accuracy_score(Y_train_dev, predicted_dev)
print('Accuracy score of development data:', dev_accuracy)

# Test split
result = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, result)
print('Accuracy score of test data:', test_accuracy)



MultinomialNB
Accuracy score of training data: 0.8287968003555161
Accuracy score of development data: 0.7314629258517034
Accuracy score of test data: 0.722


In [7]:
# DecisionTreeClassifier: fit on the training split, then report accuracy on
# the training, development and test splits.
print('DecisionTreeClassifier')

# A fixed random_state makes tie-breaking between equally good splits
# repeatable, so the accuracies below are the same after Restart & Run All.
classifier = DecisionTreeClassifier(random_state=42)
model = classifier.fit(X_train_train, Y_train_train)

# Accuracy on the rows the model was fitted on
predicted_train = model.predict(X_train_train)
print('Accuracy score of training data:', accuracy_score(Y_train_train, predicted_train))

# Accuracy on the held-out development rows
predicted_dev = model.predict(X_train_dev)
print('Accuracy score of development data:', accuracy_score(Y_train_dev, predicted_dev))

# Accuracy on the separate test file
result = model.predict(X_test)
print('Accuracy score of test data:', accuracy_score(Y_test, result))



DecisionTreeClassifier
Accuracy score of training data: 0.989667814687257
Accuracy score of development data: 0.7034068136272545
Accuracy score of test data: 0.676


In [8]:
# SVM: fit on the training split, then score each split in turn
# NOTE(review): the accuracies recorded for this configuration are well below
# the other classifiers in this notebook; the kernel / gamma choice on raw
# count features may be worth revisiting - confirm before drawing conclusions.
print('SVM')
classifier = SVC(gamma='auto')
model = classifier.fit(X_train_train, Y_train_train)

# Training split
predicted_train = model.predict(X_train_train)
train_accuracy = accuracy_score(Y_train_train, predicted_train)
print('Accuracy score of training data:', train_accuracy)

# Development split
predicted_dev = model.predict(X_train_dev)
dev_accuracy = accuracy_score(Y_train_dev, predicted_dev)
print('Accuracy score of development data:', dev_accuracy)

# Test split
result = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, result)
print('Accuracy score of test data:', test_accuracy)




SVM
Accuracy score of training data: 0.5684923897344739
Accuracy score of development data: 0.5871743486973948
Accuracy score of test data: 0.604


In [9]:
# KNeighborsClassifier (8 neighbours): fit on the training split, then score each split
print('KNeighborsClassifier')
classifier = KNeighborsClassifier(n_neighbors=8)
classifier.fit(X_train_train, Y_train_train)

# Training split
predicted_train = classifier.predict(X_train_train)
train_accuracy = accuracy_score(Y_train_train, predicted_train)
print('Accuracy score of training data:', train_accuracy)

# Development split
predicted_dev = classifier.predict(X_train_dev)
dev_accuracy = accuracy_score(Y_train_dev, predicted_dev)
print('Accuracy score of development data:', dev_accuracy)

# Test split
result = classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, result)
print('Accuracy score of test data:', test_accuracy)



KNeighborsClassifier
Accuracy score of training data: 0.7329185646039329
Accuracy score of development data: 0.6753507014028056
Accuracy score of test data: 0.65


In [10]:
# Using MultinomialNB
# Build, for every topic, a list of up to 10 suggested article ids: the test
# articles predicted as that topic, ordered by the predicted-class probability
# (highest first). Unused slots in `suggested` stay 0.
print('MultinomialNB Predict')
classifier = MultinomialNB()
model = classifier.fit(X_train_train, Y_train_train)

result_class = model.predict(X_test)
result_proba = model.predict_proba(X_test)

# Sizes are derived from the data instead of being hard-coded, so the cell
# keeps working if the test set or the topic list changes size.
n_topics = len(topic_index)
max_suggestions = 10

# predict_proba columns are ordered like model.classes_, which only contains
# labels seen during fit; look the column up rather than assuming it equals
# the label id.
proba_column = {label: column for column, label in enumerate(model.classes_)}

# put every instance into its group (single pass over all predictions)
class_lists = [[] for _ in range(n_topics)]
for row, label in enumerate(result_class):
    class_lists[label].append((result_proba[row][proba_column[label]], ids[row]))

# sort every class list: highest probability first, ties broken by larger id
for candidates in class_lists:
    candidates.sort(reverse=True)

# get suggest list
suggested = np.zeros((n_topics, max_suggestions))

for topic, candidates in enumerate(class_lists):
    for rank, (_, article_id) in enumerate(candidates[:max_suggestions]):
        suggested[topic][rank] = article_id

print(suggested)

MultinomialNB Predict
[[9996. 9994. 9993. 9991. 9983. 9976. 9973. 9971. 9970. 9960.]
 [9604. 9830. 9789. 9952.    0.    0.    0.    0.    0.    0.]
 [9758. 9940. 9878. 9988. 9933. 9854. 9526. 9896. 9703. 9575.]
 [9987. 9783. 9773. 9770. 9616. 9607. 9576. 9559. 9579. 9842.]
 [   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.]
 [9693. 9584. 9572. 9727. 9823. 9625. 9704. 9743. 9711. 9599.]
 [9982. 9929. 9873. 9807. 9661. 9621. 9947. 9617. 9735. 9937.]
 [9995. 9985. 9967. 9939. 9901. 9863. 9835. 9816. 9769. 9765.]
 [   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.]
 [9601.    0.    0.    0.    0.    0.    0.    0.    0.    0.]
 [9997. 9992. 9979. 9964. 9942. 9931. 9922. 9920. 9919. 9886.]]


In [11]:
# Print suggestions
# One header per topic, in topic-id order starting at 1 (id 0, IRRELEVANT, is
# skipped); the trailing tabs keep the id lists aligned in the output.
suggestion_headers = [
    "ARTS CULTURE ENTERTAINMENT: \t\t",
    "BIOGRAPHIES PERSONALITIES PEOPLE: \t",
    "DEFENCE: \t\t\t\t",
    "DOMESTIC MARKETS: \t\t\t",
    "FOREX MARKETS: \t\t\t\t",
    "HEALTH: \t\t\t\t",
    "MONEY MARKETS: \t\t\t\t",
    "SCIENCE AND TECHNOLOGY: \t\t",
    "SHARE LISTINGS: \t\t\t",
    "SPORTS: \t\t\t\t",
]
for topic_id, header in enumerate(suggestion_headers, start=1):
    print(header, suggested[topic_id])


ARTS CULTURE ENTERTAINMENT: 		 [9604. 9830. 9789. 9952.    0.    0.    0.    0.    0.    0.]
BIOGRAPHIES PERSONALITIES PEOPLE: 	 [9758. 9940. 9878. 9988. 9933. 9854. 9526. 9896. 9703. 9575.]
DEFENCE: 				 [9987. 9783. 9773. 9770. 9616. 9607. 9576. 9559. 9579. 9842.]
DOMESTIC MARKETS: 			 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
FOREX MARKETS: 				 [9693. 9584. 9572. 9727. 9823. 9625. 9704. 9743. 9711. 9599.]
HEALTH: 				 [9982. 9929. 9873. 9807. 9661. 9621. 9947. 9617. 9735. 9937.]
MONEY MARKETS: 				 [9995. 9985. 9967. 9939. 9901. 9863. 9835. 9816. 9769. 9765.]
SCIENCE AND TECHNOLOGY: 		 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
SHARE LISTINGS: 			 [9601.    0.    0.    0.    0.    0.    0.    0.    0.    0.]
SPORTS: 				 [9997. 9992. 9979. 9964. 9942. 9931. 9922. 9920. 9919. 9886.]
