In [1]:
import pandas as pd
import sys
import numpy as np
import pickle
import os
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

df_file = pd.read_excel('../data/patentData_80000_20180622.xlsx')

df_file['X'] = df_file['标题'] + '。' + df_file['摘要'] + '。' + df_file['首项权利要求']

import jieba
stopwordfile = open('../dict/StopWords_CON.txt', 'r', encoding='utf-8')
def getstopword():
    w = set()
    for line in stopwordfile:
        line.strip().split('\n')
        w.add(line[:len(line)-1].strip())
    return w

stopwordset = getstopword()

def cutWords(sentence):
    word_list = jieba.cut(sentence)
    res = ' '.join(word_list)
    res = res.split(' ')
    tempX = ''
    for i in res:
        if i not in stopwordset:
            tempX+=i
            tempX+=' '
    return tempX.strip()

df_file['X'] = df_file['X'].apply(cutWords)

label2id = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6,
    'H': 7
}

def transy(x):
    return label2id[x[0]]

df_file['y'] = df_file['主IPC分类号-小类'].apply(transy)

document = df_file['X'].values

tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b").fit_transform(document)
y = df_file['y'].values
X_train, X_test, y_train, y_test = train_test_split(tfidf_model, y, test_size=0.25, random_state=42)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.748 seconds.
Prefix dict has been built succesfully.


In [2]:
models = {}
models['LR'] = LogisticRegression()
models['LR'].fit(X_train[:], y_train[:])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [3]:
models['DT'] = DecisionTreeClassifier()
models['DT'].fit(X_train[:], y_train[:])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [4]:
models['RF'] = RandomForestClassifier()
models['RF'].fit(X_train[:], y_train[:])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
from sklearn import metrics

y_pred_LR = models['LR'].predict(X_test[:])
y_pred_DT = models['DT'].predict(X_test[:])
y_pred_RF = models['RF'].predict(X_test[:])




In [16]:
def Result(y_test, y_pred):
    print('precision:')
    print(metrics.precision_score(y_test, y_pred, average='micro'))
    print(metrics.precision_score(y_test, y_pred, average='macro'))

    print('recall:')
    print(metrics.recall_score(y_test, y_pred, average='micro'))
    print(metrics.recall_score(y_test, y_pred, average='macro'))

    print('f1:')
    print(metrics.f1_score(y_test, y_pred, average='micro'))
    print(metrics.f1_score(y_test, y_pred, average='macro'))

In [17]:
Result(y_test[:], y_pred_LR)

precision:
0.7784
0.7805038246164653
recall:
0.7784
0.7784240384751246
f1:
0.7784
0.7786504654631667


In [18]:
Result(y_test[:], y_pred_DT)

precision:
0.57555
0.5766231594612026
recall:
0.57555
0.5753772700275375
f1:
0.57555
0.5757351142636977


In [19]:
Result(y_test[:], y_pred_RF)

precision:
0.7106
0.7151842920683658
recall:
0.7106
0.7109231776331095
f1:
0.7106
0.7071120024947544
