In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import csv
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mutual_info_score
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report

In [2]:
category_names = ['餐饮','餐饮美食','地名','公共设施','公司企业','购物','交通运输','教育培训','教育文化','金融保险','金融银行','景点游玩','酒店住宿','居民服务','科研机构','农林牧渔业','批发零售','其他','汽车','汽车服务','商业设施服务','生活服务','休闲娱乐','医疗','医疗服务','运动休闲','住宿']
sogou_category_names = ['宾馆饭店','餐饮服务','场馆会所','地名','房地产','公司企业','购物场所','交通出行','金融银行','旅游景点','其它','汽车服务','体育场馆','新闻媒体','休闲娱乐','学校科研','医疗卫生','邮政电信','政府机关']

category_name_dict={}
for i,v in enumerate(sogou_category_names):
    category_name_dict[v]=i

In [3]:
def covert_label(category):
    return category_name_dict[category]

In [4]:
def calclate_dscore(docs_values,len_doc=3):    
    dw = len(docs_values)
    pw = dw/len_doc
    max_freq = np.max(docs_values)
    min_freq = np.min(docs_values)
    tw = max_freq/min_freq
    score = (1+pw*tw)*dw
    return score

In [5]:
df = pd.read_csv('D:\\projectlist\\navCategory\\segment_nav_standard_poi',sep='\t',names=['query','category','data'],encoding='gb18030')
df = df.dropna()

In [None]:
df.head()

In [6]:
vectorizer=CountVectorizer()
df['label'] = df['category'].apply(func=covert_label)
train,test = train_test_split(df,train_size  = 0.7)
term_matrix = vectorizer.fit_transform(train['data'])
feature_names=vectorizer.get_feature_names()
transpose_term_matrix = term_matrix.transpose()
transpose_term_lil_data = transpose_term_matrix.tolil().data
doc_len = len(feature_names)

In [7]:
feature_names_dict={}
for i,v in enumerate(feature_names):
    score = calclate_dscore(transpose_term_lil_data[i],doc_len)
    feature_names_dict[v]=score

In [8]:
sort_feature_names_dict = sorted(feature_names_dict.items(),key=lambda d:d[1],reverse=True)

In [20]:
for k in range(50000,60000,2500):
    filter_feature_names = [ v[0] for i,v in enumerate(sort_feature_names_dict) if i<=k]
    
    train_y = train.label.values
    test_y = test.label.values


    step1 = ('count_vectorizer',CountVectorizer(vocabulary=filter_feature_names))
    step2=('tf_transformer',TfidfTransformer())#该类会统计每个词语的tf-idf权值
    step3 =('blb_clf',BernoulliNB())

    pipeline = Pipeline(steps=[step1, step2,step3])

    parameters = {
        'blb_clf__alpha': [0.5],
    }
    
    grid_search = GridSearchCV(pipeline, param_grid=parameters)
    grid_search.fit(train['data'],train_y)
    test_y_pre = grid_search.predict(test['data'])
    test_report = classification_report(test_y, test_y_pre, target_names=category_names)
    
    print('feature_names num:{num},the best score{score}'.format(num=k,score=grid_search.best_score_))
    print('the best param{param}'.format(param=grid_search.best_params_))
    print('test_report:{test_report}'.format(test_report=test_report))

feature_names num:50000,the best score0.8796293101444029
the best param{'blb_clf__alpha': 0.5}
test_report:             precision    recall  f1-score   support

         餐饮       0.88      0.91      0.90    159883
       餐饮美食       0.92      0.86      0.89    848433
         地名       0.70      0.79      0.74     26867
       公共设施       0.82      0.99      0.90   1217769
       公司企业       0.85      0.78      0.81    212439
         购物       0.86      0.82      0.84    806331
       交通运输       0.90      0.85      0.87   1949891
       教育培训       0.94      0.67      0.78     63146
       教育文化       0.99      0.98      0.98    178862
       金融保险       0.66      0.49      0.56     62261
       金融银行       0.91      0.83      0.87    512019
       景点游玩       0.94      0.93      0.93    445851
       酒店住宿       0.76      0.78      0.77     37593
       居民服务       0.62      0.71      0.66      5850
       科研机构       0.87      0.85      0.86    150319
      农林牧渔业       0.90      0.92      0.91  

In [14]:
train_y = train.label.values
test_y = test.label.values
step0 = ('count_vectorizer',CountVectorizer())
step1 = ('selectKBest',SelectKBest(chi2))
step2=('tf_transformer',TfidfTransformer())#该类会统计每个词语的tf-idf权值
step3 =('blb_clf',BernoulliNB())

pipeline = Pipeline(steps=[step0,step1, step2,step3])

parameters = {
    'selectKBest__k':[2500,5000,7500,10000],
    'tf_transformer__use_idf': [True, False],
    'blb_clf__alpha': [0.01,0.5],
}

grid_search = GridSearchCV(pipeline, param_grid=parameters)
grid_search.fit(train['data'],train_y)
test_y_pre = grid_search.predict(test['data'])
test_report = classification_report(test_y, test_y_pre, target_names=category_names)

print('feature_names num:{num},the best score{score}'.format(num=k,score=grid_search.best_score_))
print('the best param{param}'.format(param=grid_search.best_params_))
print('test_report:{test_report}'.format(test_report=test_report))

feature_names num:17500,the best score0.7841857142857143
the best param{'blb_clf__alpha': 0.01, 'selectKBest__k': 10000, 'tf_transformer__use_idf': True}
test_report:             precision    recall  f1-score   support

         餐饮       0.87      0.78      0.82       651
       餐饮美食       0.87      0.71      0.78      3463
         地名       0.64      0.49      0.56        99
       公共设施       0.61      0.99      0.76      4841
       公司企业       0.81      0.64      0.72       861
         购物       0.84      0.78      0.81      3183
       交通运输       0.88      0.71      0.78      7784
       教育培训       0.87      0.59      0.70       232
       教育文化       0.98      0.97      0.98       677
       金融保险       0.64      0.38      0.48       235
       金融银行       0.88      0.77      0.82      1961
       景点游玩       0.91      0.89      0.90      1757
       酒店住宿       0.67      0.62      0.64       161
       居民服务       0.50      0.58      0.54        12
       科研机构       0.84      0.78      

In [16]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17591957 entries, 16284159 to 7576061
Data columns (total 4 columns):
query       object
category    object
data        object
label       int64
dtypes: int64(1), object(3)
memory usage: 928.9 MB


In [17]:
train.memory_usage(deep=True)

Index        140735656
query       1675689119
category    1566118766
data        1745663751
label        140735656
dtype: int64

In [18]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25131368 entries, 0 to 25131372
Data columns (total 4 columns):
query       object
category    object
data        object
label       int64
dtypes: int64(1), object(3)
memory usage: -1062776980.0 bytes


In [19]:
train

Unnamed: 0,query,category,data,label
16284159,毛呼都格,地名,毛 呼都格,3
17923564,凤凰化妆折扣店,购物场所,凤凰 化妆 折扣店,6
2452286,黟县残疾人联合会,公司企业,黟县 残疾人 联合会,5
3222725,精灵幼儿园,学校科研,精灵 幼儿园,15
22255698,正宗福建千里香馄饨蒸饺王,餐饮服务,正宗 福建 千里香 馄饨 蒸饺王,1
7602915,鑫源茶业,购物场所,鑫源 茶业,6
20625363,蚂蚁蚂蚁音乐酒吧红旗街店,餐饮服务,蚂蚁 蚂蚁 音乐 酒吧 红旗街 店,1
18096822,庆海家电城,购物场所,庆海 家电城,6
7038358,荥阳市广武镇第四小学,学校科研,荥阳市 广武镇 第四 小学,15
21164401,日升家具城,购物场所,日升 家具城,6
