In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.neural_network import MLPClassifier

df_train = pd.read_csv("training.csv")
text_data = np.array(df_train['article_words'])
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
X = bag_of_words
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y = []
for i in range(len(df_train)):
    Y.append(topic[df_train.at[i, 'topic']])
    
X_train = X[:9001]
X_dev = X[9001:]
Y_train = Y[:9001]
Y_dev = Y[9001:]

In [19]:
print('MLPClassifier')
mlp = MLPClassifier()
model = mlp.fit(X, Y) 
predicted_train = model.predict(X_train)
print('Training data accuracy score is:',accuracy_score(Y_train, predicted_train))
predicted_dev = model.predict(X_dev)
print('Development data accuracy score is:',accuracy_score(Y_dev, predicted_dev))
df_test = pd.read_csv('test.csv') 
test_data = np.array(df_test['article_words'])
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y_test = []
for i in range(len(df_test)):
    Y_test.append(topic[df_test.at[i, 'topic']])
result = []
prob = []
for i in range(len(df_test)):
    tmp = count.transform([test_data[i]])
    prob.append(model.predict_proba(tmp)[0])
    result.append(int(model.predict(tmp)[0]))
print('Test data accuracy score is:',accuracy_score(Y_test, result))

Training data accuracy score is: 0.9886679257860238
Development data accuracy score is: 0.9859719438877755
Test data accuracy score is: 0.764


In [20]:
#统计每个article对应的所有probability
probability = np.array(prob).T
probability.shape

mi = dict(zip(topic.values(), topic.keys()))
for i in range(len(result)):
    result[i] = mi[result[i]]
    
data = {'Article':pd.Series(df_test['article_number']),
       'ARTS CULTURE ENTERTAINMENT':pd.Series(probability[1]),
        'BIOGRAPHIES PERSONALITIES PEOPLE':pd.Series(probability[2]),
        'DEFENCE':pd.Series(probability[3]),
       'DOMESTIC MARKETS':pd.Series(probability[4]),
       'FOREX MARKETS':pd.Series(probability[5]),
       'HEALTH':pd.Series(probability[6]),
       'MONEY MARKETS':pd.Series(probability[7]),
       'SCIENCE AND TECHNOLOGY':pd.Series(probability[8]),
       'SHARE LISTINGS':pd.Series(probability[9]),
        'SPORTS':pd.Series(probability[10]),
       }
df = pd.DataFrame(data)

for i in range(1,len(df.columns)):
    tmp = df.sort_values(by=[df.columns[i]]).head(10)
    print(df.columns[i], list(tmp['Article']))

In [31]:
# 统计predict_result对应的probability
score = []
for i in prob:
    score.append(max(i))

r = {'Article':pd.Series(df_test['article_number']),
     'Result':pd.Series(np.array(result)),
     'Probability': pd.Series(np.array(score))       
       }
predict_prob = pd.DataFrame(r)

tp = list(topic.keys())
for i in range(1, len(tp)):
    tmp = predict_prob[predict_prob['Result'] == tp[i]].sort_values(by=['Probability'],ascending=False).head(10)
    print(tp[i], list(tmp['Article']))

In [50]:
print('SGDClassifier')
from sklearn.calibration import CalibratedClassifierCV
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, loss='hinge',class_weight='balanced')
clf = clf.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model=calibrator.fit(X_train, Y_train)
predicted_train = model.predict(X_train)
print('Training data accuracy score is:',accuracy_score(Y_train, predicted_train))
predicted_dev = model.predict(X_dev)
print('Development data accuracy score is:',accuracy_score(Y_dev, predicted_dev))
df_test = pd.read_csv('test.csv')
test_data = np.array(df_test['article_words'])
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y_test = []
for i in range(len(df_test)):
    Y_test.append(topic[df_test.at[i, 'topic']])
result = []
prob = []
for i in range(len(df_test)):
    tmp = count.transform([test_data[i]])
    prob.append(model.predict_proba(tmp)[0])
    result.append(int(model.predict(tmp)[0]))
print('Test data accuracy score is:',accuracy_score(Y_test, result))

SGDClassifier


  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)


Training data accuracy score is: 0.9397844683924008
Development data accuracy score is: 0.7234468937875751
Test data accuracy score is: 0.726


In [43]:
print('SVM')
clf = SVC()
model = clf.fit(X_train, Y_train)
predicted_train = model.predict(X_train)
print('Training data accuracy score is:',accuracy_score(Y_train, predicted_train))
predicted_dev = model.predict(X_dev)
print('Development data accuracy score is:',accuracy_score(Y_dev, predicted_dev))
df_test = pd.read_csv('test.csv')
test_data = np.array(df_test['article_words'])
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y_test = []
for i in range(len(df_test)):
    Y_test.append(topic[df_test.at[i, 'topic']])
result = []
for i in range(len(df_test)):
    tmp = count.transform([test_data[i]])
    result.append(int(model.predict(tmp)[0]))
print('Test data accuracy score is:',accuracy_score(Y_test, result))

SVM
Training data accuracy score is: 0.896900344406177
Development data accuracy score is: 0.7535070140280561
Test data accuracy score is: 0.738


In [47]:
model = XGBClassifier()
model.fit(X_train, Y_train)
predicted_train = model.predict(X_train)
print('Training data accuracy score is:',accuracy_score(Y_train, predicted_train))
predicted_dev = model.predict(X_dev)
print('Development data accuracy score is:',accuracy_score(Y_dev, predicted_dev))
df_test = pd.read_csv('test.csv')
test_data = np.array(df_test['article_words'])
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y_test = []
for i in range(len(df_test)):
    Y_test.append(topic[df_test.at[i, 'topic']])
result = []
for i in range(len(df_test)):
    tmp = count.transform([test_data[i]])
    result.append(int(model.predict(tmp)[0]))
print('Test data accuracy score is:',accuracy_score(Y_test, result))

Training data accuracy score is: 0.847683590712143
Development data accuracy score is: 0.7675350701402806
Test data accuracy score is: 0.736


In [12]:
print('RandomForestClassifier')
clf = RandomForestClassifier()
model = clf.fit(X, Y)
predicted_train = model.predict(X_train)
print('Training data accuracy score is:',accuracy_score(Y_train, predicted_train))
predicted_dev = model.predict(X_dev)
print('Development data accuracy score is:',accuracy_score(Y_dev, predicted_dev))

df_test = pd.read_csv('test.csv') 
test_data = np.array(df_test['article_words'])
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y_test = []
for i in range(len(df_test)):
    Y_test.append(topic[df_test.at[i, 'topic']])
result = []
prob = []
for i in range(len(df_test)):
    tmp = count.transform([test_data[i]])
    prob.append(model.predict_proba(tmp)[0])
    result.append(int(model.predict(tmp)[0]))
print('Test data accuracy score is:',accuracy_score(Y_test, result))


RandomForestClassifier




Training data accuracy score is: 0.9766692589712255
Development data accuracy score is: 0.9739478957915831
Test data accuracy score is: 0.718


In [55]:
print('MultinomialNB')
clf = MultinomialNB()
model = clf.fit(X_train, Y_train)
predicted_train = model.predict(X_train)
print('Training data accuracy score is:',accuracy_score(Y_train, predicted_train))
predicted_dev = model.predict(X_dev)
print('Development data accuracy score is:',accuracy_score(Y_dev, predicted_dev))
df_test = pd.read_csv('test.csv')
test_data = np.array(df_test['article_words'])
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y_test = []
for i in range(len(df_test)):
    Y_test.append(topic[df_test.at[i, 'topic']])
result = []
for i in range(len(df_test)):
    tmp = count.transform([test_data[i]])
    result.append(int(model.predict(tmp)[0]))
print('Test data accuracy score is:',accuracy_score(Y_test, result))

MultinomialNB
Training data accuracy score is: 0.8287968003555161
Development data accuracy score is: 0.7314629258517034
Test data accuracy score is: 0.722


In [54]:
print('BernoulliNB')
clf = BernoulliNB()
model = clf.fit(X_train, Y_train)
predicted_train = model.predict(X_train)
print('Training data accuracy score is:',accuracy_score(Y_train, predicted_train))
predicted_dev = model.predict(X_dev)
print('Development data accuracy score is:',accuracy_score(Y_dev, predicted_dev))
df_test = pd.read_csv('test.csv')
test_data = np.array(df_test['article_words'])
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y_test = []
for i in range(len(df_test)):
    Y_test.append(topic[df_test.at[i, 'topic']])
result = []
for i in range(len(df_test)):
    tmp = count.transform([test_data[i]])
    result.append(int(model.predict(tmp)[0]))
print('Test data accuracy score is:',accuracy_score(Y_test, result))

BernoulliNB
Training data accuracy score is: 0.732140873236307
Development data accuracy score is: 0.7194388777555111
Test data accuracy score is: 0.674


In [56]:
print('DecisionTreeClassifier')
clf = tree.DecisionTreeClassifier()
model = clf.fit(X_train, Y_train)
predicted_train = model.predict(X_train)
print('Training data accuracy score is:',accuracy_score(Y_train, predicted_train))
predicted_dev = model.predict(X_dev)
print('Development data accuracy score is:',accuracy_score(Y_dev, predicted_dev))
df_test = pd.read_csv('test.csv')
test_data = np.array(df_test['article_words'])
topic = {'IRRELEVANT':0, 'ARTS CULTURE ENTERTAINMENT':1, 'BIOGRAPHIES PERSONALITIES PEOPLE':2, 'DEFENCE':3, 'DOMESTIC MARKETS':4, 'FOREX MARKETS':5, 'HEALTH':6, 'MONEY MARKETS':7, 'SCIENCE AND TECHNOLOGY':8, 'SHARE LISTINGS':9, 'SPORTS':10}
Y_test = []
for i in range(len(df_test)):
    Y_test.append(topic[df_test.at[i, 'topic']])
result = []
for i in range(len(df_test)):
    tmp = count.transform([test_data[i]])
    result.append(int(model.predict(tmp)[0]))
print('Test data accuracy score is:',accuracy_score(Y_test, result))

DecisionTreeClassifier
Training data accuracy score is: 0.989667814687257
Development data accuracy score is: 0.7114228456913828
Test data accuracy score is: 0.666
