In [8]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# python3.6
# windows7 & pycharm

# original author: https://github.com/sunxiangguo/chinese_text_classification
# improve: ljzhou

# target: Chinese text(financial news) segment


import os
import jieba
from Tools import savefile, readfile


# corpus_path is the unsegmented corpus path
# seg_path is the segmented corpus path
def corpus_segment(corpus_path, seg_path):   
    catelist = os.listdir(corpus_path)  # Gets all subdirectories under corpus_path
    ## In fact, the name of subdirectories is the category, and you can use more subdirectories in your object
    print("Segmenting..Please wait.")
    
    # Gets all the files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"
        seg_dir = seg_path + mydir + "/"
        if not os.path.exists(seg_dir):  # Whether there is a word segmentation directory, if not, create it
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # Get all the text in a category in an unsegmented term repository
            
    # Traverse all files in the category directory and to process
        for file_path in file_list:
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines, extra spaces
            content_seg = jieba.cut(content)  # segment
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file
    print("Finish.")

    # Segmentation of train and test sets.
if __name__ == "__main__":
    corpus_path = "./train_corpus/"  # Unsegmented
    seg_path = "./train_corpus_seg/"  # Segmented
    corpus_segment(corpus_path, seg_path)
    
    corpus_path = "./test_corpus/"  # Unsegmented
    seg_path = "./test_corpus_seg/"  # Segmented
    corpus_segment(corpus_path, seg_path)


Segmenting..Please wait.
Finish.
Segmenting..Please wait.
Finish.


In [9]:
# target: Create a Bunch

import os
import pickle

from sklearn.datasets.base import Bunch
from Tools import readfile

    # Create a Bunch
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)    # Expand the original list with the new list (addlist)
    
    # Add an element to the original list
    for mydir in catelist:
        class_path = seg_path + mydir + "/"
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))

    # Store bunch in wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Bunch Finish")

    # Bunch of train and test set.
if __name__ == "__main__":
    wordbag_path = "train_word_bag/train_set.dat"  # Bunch path
    seg_path = "train_corpus_seg/"  # segmented corpus path
    corpus2Bunch(wordbag_path, seg_path)

    wordbag_path = "test_word_bag/test_set.dat"  # Bunch path
    seg_path = "test_corpus_seg/"  # segmented corpus path
    corpus2Bunch(wordbag_path, seg_path)


Bunch Finish
Bunch Finish


In [10]:
# target: Create a word bag

from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import readfile, readbunchobj, writebunchobj

    # Remove stop words
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = readfile(stopword_path).splitlines()    # stop words file read
    bunch = readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[],
                       vocabulary={})
    
    # Build a word bag by using TF-IDF
    if train_tfidf_path is not None:
        trainbunch = readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                     vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    writebunchobj(space_path, tfidfspace)
    print("TF-IDF finish")

    # word bag of train and test set
if __name__ == '__main__':
    stopword_path = "train_word_bag/hlt_stop_words.txt"    # Stop word file
    bunch_path = "train_word_bag/train_set.dat"    # Bunch path
    space_path = "train_word_bag/tfdifspace.dat"    # TF-IDF path
    vector_space(stopword_path, bunch_path, space_path)

    bunch_path = "test_word_bag/test_set.dat"    # Bunch path
    space_path = "test_word_bag/testspace.dat"    # Word bag path
    train_tfidf_path = "train_word_bag/tfdifspace.dat"    # TF-IDF path
    vector_space(stopword_path, bunch_path, space_path, train_tfidf_path)


TF-IDF finish
TF-IDF finish


In [11]:
# target: caculate the sentiments of news and take model evaluation

from sklearn.naive_bayes import BernoulliNB 
from sklearn import svm
from sklearn import tree
from sklearn import metrics
from Tools import readbunchobj
import csv

    # train set
trainpath = "train_word_bag/tfdifspace.dat"
train_set = readbunchobj(trainpath)

    # test set
testpath = "test_word_bag/testspace.dat"
test_set = readbunchobj(testpath)

    # Training classifier: input word bag vector and classification label
B_NB = BernoulliNB(alpha=0.001).fit(train_set.tdm, train_set.label)
SVC = svm.SVC(C=0.8,kernel='sigmoid',gamma=10,decision_function_shape='ovo',probability=True).fit(train_set.tdm, train_set.label)
Tree = tree.DecisionTreeClassifier(min_samples_leaf=30,min_samples_split=10).fit(train_set.tdm, train_set.label)

    # Predict classification results
pre_B_NB = B_NB.predict(test_set.tdm)
pre_SVC = SVC.predict(test_set.tdm)
pre_Tree = Tree.predict(test_set.tdm)


    # If you want to check the wrong prediction, you can use this.
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, pre_B_NB):    # Please change the pre_B_NB
    if flabel != expct_cate:
        print(file_name, ": Correct category:", flabel, " -->Prediction:", expct_cate)


    # Calculate classification accuracy
def metrics_result(actual, predict):
    print('Precise:{0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted')))
    print('Recall:{0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted')))
    print('F1-Score:{0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted')))
    print()
    
    # evaluation
print('B_NB:')
metrics_result(test_set.label, pre_B_NB)
print('SVC:')
metrics_result(test_set.label, pre_SVC)
print('Tree:')
metrics_result(test_set.label, pre_Tree)

test_corpus_seg/Neg_news/11.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/24.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/25.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/3.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/41.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/48.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/51.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/6.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/60.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/62.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/64.txt : Correct category: Neg_news  -->Prediction: Pos_news
test_corpus_seg/Neg_news/65.txt : Correct category: Neg_