In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import csv
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mutual_info_score
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report

In [2]:
category_names = ['餐饮','餐饮美食','地名','公共设施','公司企业','购物','交通运输','教育培训','教育文化','金融保险','金融银行','景点游玩','酒店住宿','居民服务','科研机构','农林牧渔业','批发零售','其他','汽车','汽车服务','商业设施服务','生活服务','休闲娱乐','医疗','医疗服务','运动休闲','住宿']
sogou_category_names = ['宾馆饭店','餐饮服务','场馆会所','地名','房地产','公司企业','购物场所','交通出行','金融银行','旅游景点','其它','汽车服务','体育场馆','新闻媒体','休闲娱乐','学校科研','医疗卫生','邮政电信','政府机关']

category_name_dict={}
for i,v in enumerate(sogou_category_names):
    category_name_dict[v]=i

In [3]:
def covert_label(category):
    return category_name_dict[category]

In [4]:
def calclate_dscore(docs_values,len_doc=3):    
    dw = len(docs_values)
    pw = dw/len_doc
    max_freq = np.max(docs_values)
    min_freq = np.min(docs_values)
    tw = max_freq/min_freq
    score = (1+pw*tw)*dw
    return score

In [5]:
df = pd.read_csv('D:\\projectlist\\navCategory\\100000segment_nav_standard_poi',sep='\t',names=['query','category','data'],encoding='gb18030')
df = df.dropna()

In [6]:
vectorizer=CountVectorizer()
df['label'] = df['category'].apply(func=covert_label)
train,test = train_test_split(df,train_size  = 0.7)
term_matrix = vectorizer.fit_transform(train['data'])
feature_names=vectorizer.get_feature_names()
transpose_term_matrix = term_matrix.transpose()
transpose_term_lil_data = transpose_term_matrix.tolil().data
doc_len = len(feature_names)

In [7]:
feature_names_dict={}
for i,v in enumerate(feature_names):
    score = calclate_dscore(transpose_term_lil_data[i],doc_len)
    feature_names_dict[v]=score

In [8]:
sort_feature_names_dict = sorted(feature_names_dict.items(),key=lambda d:d[1],reverse=True)

In [1]:
for k in range(2500,20000,2500):
    filter_feature_names = [ v[0] for i,v in enumerate(sort_feature_names_dict) if i<=k]
    
    train_y = train.label.values
    test_y = test.label.values


    step1 = ('count_vectorizer',CountVectorizer(vocabulary=filter_feature_names))
    step2=('tf_transformer',TfidfTransformer())#该类会统计每个词语的tf-idf权值
    step3 =('blb_clf',BernoulliNB())

    pipeline = Pipeline(steps=[step1, step2,step3])

    parameters = {
        'tf_transformer__use_idf': [True, False],
        'blb_clf__alpha': [0.01,0.5],
    }
    
    grid_search = GridSearchCV(pipeline, param_grid=parameters)
    grid_search.fit(train['data'],train_y)
    test_y_pre = grid_search.predict(test['data'])
    test_report = classification_report(test_y, test_y_pre, target_names=category_names)
    
    print('feature_names num:{num},the best score{score}'.format(num=k,score=grid_search.best_score_))
    print('the best param{param}'.format(param=grid_search.best_params_))
    print('test_report:{test_report}'.format(test_report=test_report))

NameError: name 'sort_feature_names_dict' is not defined

In [14]:
train_y = train.label.values
test_y = test.label.values
step0 = ('count_vectorizer',CountVectorizer())
step1 = ('selectKBest',SelectKBest(chi2))
step2=('tf_transformer',TfidfTransformer())#该类会统计每个词语的tf-idf权值
step3 =('blb_clf',BernoulliNB())

pipeline = Pipeline(steps=[step0,step1, step2,step3])

parameters = {
    'selectKBest__k':[2500,5000,7500,10000],
    'tf_transformer__use_idf': [True, False],
    'blb_clf__alpha': [0.01,0.5],
}

grid_search = GridSearchCV(pipeline, param_grid=parameters)
grid_search.fit(train['data'],train_y)
test_y_pre = grid_search.predict(test['data'])
test_report = classification_report(test_y, test_y_pre, target_names=category_names)

print('feature_names num:{num},the best score{score}'.format(num=k,score=grid_search.best_score_))
print('the best param{param}'.format(param=grid_search.best_params_))
print('test_report:{test_report}'.format(test_report=test_report))

feature_names num:17500,the best score0.7841857142857143
the best param{'blb_clf__alpha': 0.01, 'selectKBest__k': 10000, 'tf_transformer__use_idf': True}
test_report:             precision    recall  f1-score   support

         餐饮       0.87      0.78      0.82       651
       餐饮美食       0.87      0.71      0.78      3463
         地名       0.64      0.49      0.56        99
       公共设施       0.61      0.99      0.76      4841
       公司企业       0.81      0.64      0.72       861
         购物       0.84      0.78      0.81      3183
       交通运输       0.88      0.71      0.78      7784
       教育培训       0.87      0.59      0.70       232
       教育文化       0.98      0.97      0.98       677
       金融保险       0.64      0.38      0.48       235
       金融银行       0.88      0.77      0.82      1961
       景点游玩       0.91      0.89      0.90      1757
       酒店住宿       0.67      0.62      0.64       161
       居民服务       0.50      0.58      0.54        12
       科研机构       0.84      0.78      

In [11]:
train

Unnamed: 0,query,category,data,label
28195,龙江村,地名,龙江村,3
66350,御祥,购物场所,御祥,6
84263,细屋,地名,细屋,3
10179,三多庄,地名,三多庄,3
90283,御景园,房地产,御景园,4
97267,天鸿展视界北门,房地产,天鸿 展 视界 北门,4
86183,万机油汽车维修服务中心,汽车服务,万 机油 汽车 维修 服务 中心,11
22922,秦家门,地名,秦家门,3
34393,龙岭村,地名,龙岭村,3
99716,新兴县水台镇奄村卫生站,医疗卫生,新兴县 水台镇 奄村 卫生站,16
