In [17]:
from gensim import corpora, models, similarities

In [5]:
def load_data(data_path, min_len=2000, encoding=None):
    """
    Load tokenised documents and their integer labels from a tab-separated file.

    Every non-blank line is split on tab characters. Field 1 is split on
    whitespace into the document's tokens and field 2 is parsed as the
    integer label; field 0 is not used. Only documents with strictly more
    than ``min_len`` tokens are kept. The token count of the longest
    document seen (kept or not) is printed once the file has been read.

    Args:
        data_path: Path of the tab-separated input file.
        min_len: Length threshold; a document is kept only when its token
            count is strictly greater than this value. Defaults to 2000.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            uses the platform default.

    Returns:
        A ``(data, labels)`` tuple: ``data`` is a list of token lists and
        ``labels`` is the list of matching integer labels, in file order.

    Raises:
        IndexError: If a non-blank line has no token field, or a kept
            line has no label field.
        ValueError: If the label field of a kept line is not an integer.
    """
    data = []
    labels = []
    max_sentence_len = 0
    # The context manager closes the file; no explicit close() is needed.
    with open(data_path, 'r', encoding=encoding) as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the missing fields.
            if not line.strip():
                continue
            line_list = line.split('\t')
            one_data = line_list[1].split()
            tmp_len = len(one_data)

            # Track the longest document regardless of whether it is kept.
            max_sentence_len = max(max_sentence_len, tmp_len)
            if tmp_len > min_len:
                data.append(one_data)
                # int() tolerates the trailing newline on the last field.
                labels.append(int(line_list[2]))
    print("max sentence length: ", max_sentence_len)
    return data, labels

In [6]:
# Load the segmented corpus, then hold out the trailing 20% of documents
# (in file order) for testing.
data_path = "./data/seg_train.txt"
texts_data, labels = load_data(data_path)

train_len = int(0.8 * len(texts_data))
train_labels, test_labels = labels[:train_len], labels[train_len:]

print("train data size:", train_len)
print("Test data size:", len(test_labels))

max sentence length:  34445
train data size: 2304
Test data size: 576


In [57]:
# Build the gensim mapping between tokens and integer ids from every loaded document.
dictionary = corpora.Dictionary(texts_data)

In [12]:
# Sanity check: show the first (id, token) pair held by the dictionary.
first_id = next(iter(dictionary), None)
if first_id is not None:
    print(first_id, dictionary[first_id])

0 原


In [58]:
# Convert each token list into its bag-of-words form: a list of (token_id, count) pairs.
all_data = list(map(dictionary.doc2bow, texts_data))

In [60]:
# Compare the first ten bag-of-words entries of document 0 with its first ten raw tokens.
print(all_data[0][:10], texts_data[0][:10], sep="\n")

[(0, 1), (1, 1), (2, 2), (3, 96), (4, 2), (5, 160), (6, 6), (7, 66), (8, 13), (9, 63)]
['原', '公诉', '机关', '榆阳区', '人民检察院', '。', '上诉人', '（', '原审', '被告人']


In [61]:
# Fit a TF-IDF weighting model on the full bag-of-words corpus.
tfidf = models.TfidfModel(all_data)

In [62]:
# Show the model summary (number of documents and of non-zero entries).
print(tfidf)

TfidfModel(num_docs=2880, num_nnz=2166180)


In [63]:
# Re-weight every bag-of-words document with the fitted TF-IDF model.
all_tfidf = tfidf[all_data]

In [64]:
# Sanity check: show the first ten (token_id, weight) pairs of the first TF-IDF document.
first_doc = next(iter(all_tfidf), None)
if first_doc is not None:
    print(first_doc[:10])

[(0, 0.0010654387171305018), (1, 8.719103299325264e-07), (2, 4.357279997956249e-07), (3, 0.38279391820585335), (4, 4.801350924781117e-06), (6, 0.011072534005127532), (7, 0.0001440154466646668), (8, 0.022935089726860727), (10, 0.00016084525598016738), (11, 0.07519467250843499)]


In [65]:
# Fit an LSI model on the TF-IDF corpus; num_topics=400 is the number of
# latent dimensions requested, so each projected document has up to 400 values.
lsi = models.LsiModel(corpus=all_tfidf, id2word=dictionary, num_topics=400)

In [66]:
# Sanity check: project the first TF-IDF document into LSI space and show
# how many components came back, followed by the components themselves.
for doc_tfidf in all_tfidf:
    projected = lsi[doc_tfidf]
    print(len(projected))
    print(projected)
    break

400
[(0, 0.048935600705997992), (1, -0.00033824766328687547), (2, -0.029810549010627151), (3, -0.00046035841408168833), (4, 0.011183889436095155), (5, 0.014992234619109495), (6, -0.0052395769849194864), (7, 0.018614289908117213), (8, -0.031395988756505087), (9, -0.0065443372547794379), (10, -0.029541753652688536), (11, 0.034932403034077457), (12, 0.009490200770590063), (13, 0.0059885277712774807), (14, -0.0024555548078429514), (15, -0.0026562899507100502), (16, -0.0068812356833876645), (17, -0.018334875353280398), (18, -0.0044528324283064115), (19, 0.019251951650470389), (20, 0.014689196619493637), (21, 0.022912644244577021), (22, -0.0084944003719351208), (23, -0.02579108647915395), (24, 0.0040418352800535772), (25, -0.026290202237577089), (26, 0.015151984228273839), (27, -0.028593425651079307), (28, 0.0011857336944904964), (29, -0.0053969602434991272), (30, 0.013123667247417968), (31, -0.017493569328385552), (32, -0.024873501963115184), (33, 0.022128070390884245), (34, -0.006968940393

In [67]:
# Project every TF-IDF document into LSI space as a fixed-width dense row.
#
# lsi[doc] returns a sparse list of (topic_id, weight) pairs. Components
# that are (near) zero can be omitted from that list, and an empty document
# yields an empty list, so taking only the weights in order may produce rows
# of different lengths or shift weights into the wrong column. Placing each
# weight at its topic_id keeps every row aligned and exactly num_topics wide.
num_topics = lsi.num_topics
all_X = []
for doc_tfidf in all_tfidf:
    dense_row = [0.0] * num_topics
    for topic_id, weight in lsi[doc_tfidf]:
        dense_row[topic_id] = weight
    all_X.append(dense_row)

In [68]:
# Split the feature rows at the same index used for the labels.
X_train, X_test = all_X[:train_len], all_X[train_len:]

In [69]:
from sklearn import svm
# Train a support vector classifier on the LSI features of the training split.
# NOTE(review): the classification report further down shows every test
# document assigned to class 7 (recall 1.00 there, 0.00 for all other
# classes), so this configuration is effectively a majority-class predictor.
# Consider revisiting the kernel, feature scaling, or class weighting — TODO confirm.
clf = svm.SVC(decision_function_shape='ovo')
# Left as the last expression so the notebook displays the fitted estimator.
clf.fit(X_train, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [70]:
# Predict a label for every held-out feature row.
predicted = clf.predict(X_test)

In [71]:
import numpy as np
# Accuracy: fraction of held-out documents whose predicted label equals the true label.
(predicted == np.asarray(test_labels)).mean()

0.61458333333333337

In [72]:
from sklearn import metrics
# Per-class precision, recall, F1 and support on the held-out split.
print(metrics.classification_report(test_labels, predicted))

             precision    recall  f1-score   support

          1       0.00      0.00      0.00         9
          2       0.00      0.00      0.00        19
          3       0.00      0.00      0.00        26
          4       0.00      0.00      0.00        15
          5       0.00      0.00      0.00        47
          6       0.00      0.00      0.00        81
          7       0.61      1.00      0.76       354
          8       0.00      0.00      0.00        25

avg / total       0.38      0.61      0.47       576



  'precision', 'predicted', average, warn_for)
