In [1]:
import pandas as pd
import jieba as jb
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer 
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
root = 'd:/tempdata/'
file_name = '163_news_data2018_01_08_08_00_00.csv'
file = root + file_name
df = pd.DataFrame.from_csv(file,encoding='gbk', index_col=None)
df = df[['一级目录','二级目录','标题','内容']]
df = df.dropna()

In [3]:
#df.head()
#df.loc(291)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4393 entries, 0 to 4604
Data columns (total 4 columns):
一级目录    4393 non-null object
二级目录    4393 non-null object
标题      4393 non-null object
内容      4393 non-null object
dtypes: object(4)
memory usage: 171.6+ KB


In [4]:
df.groupby(['一级目录','二级目录']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,标题,内容
一级目录,二级目录,Unnamed: 2_level_1,Unnamed: 3_level_1
体育,CBA,293,293
体育,NBA,244,244
体育,中国足球,295,295
体育,国际足球,289,289
娱乐,明星,33,33
娱乐,电影,248,248
娱乐,电视,290,290
娱乐,综艺,114,114
娱乐,音乐,139,139
时事新闻,国内,557,557


In [3]:
y1, y2, title, content = [df.一级目录, df.二级目录, df.标题, df.内容]

In [4]:
#加载停用词
f_stop_words = open(root + 'stop_words.txt', encoding='utf-8')
stop_words = f_stop_words.readline().split('\t')
f_stop_words.close()

In [5]:
title_cuted = [' '.join(jb.cut(x)) for x in title.values ]
content_cuted = [' '.join(jb.cut(x)) for x in content.values ]

#tfidf title
X1 = TfidfVectorizer(stop_words=stop_words).fit_transform(title_cuted)
#count title
X2 = CountVectorizer(stop_words=stop_words).fit_transform(title_cuted)

#tiidf content
X3 = TfidfVectorizer(stop_words=stop_words).fit_transform(content_cuted)
#count content
X4 = CountVectorizer(stop_words=stop_words).fit_transform(content_cuted)

#binary
X5 = X4>0

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\GANGZ\AppData\Local\Temp\jieba.cache
Loading model cost 2.282 seconds.
Prefix dict has been built succesfully.


In [13]:
#使用标题训练一级目录
X_train, X_val, y_train, y_val = train_test_split(X2,y1,test_size=0.3,random_state=42)

#
clf = MultinomialNB()
clf.fit(X_train, y_train)
score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
print('MultinomialNB, title, y1:', score)

#
clf = LogisticRegression()
clf.fit(X_train, y_train)
score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
print('LogisticRegression, title, y1:', score)

MultinomialNB, title, class one: [ 0.90938511  0.8961039   0.91544715  0.9233279   0.91353997]
LogisticRegression, title, class one: [ 0.9012945   0.87012987  0.88292683  0.86949429  0.88091354]


In [14]:
#使用标题训练二级目录
X_train, X_val, y_train, y_val = train_test_split(X2,y2,test_size=0.3,random_state=42)

#
clf = MultinomialNB()
clf.fit(X_train, y_train)
score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
print('MultinomialNB, title, y2:', score)

#
clf = LogisticRegression()
clf.fit(X_train, y_train)
score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
print('LogisticRegression, title, y2:', score)

MultinomialNB, title, class one: [ 0.72873194  0.73344103  0.747557    0.73278689  0.73563218]
LogisticRegression, title, class one: [ 0.70626003  0.73344103  0.73778502  0.7295082   0.74220033]


In [12]:
#使用内容训练一级目录
X_train, X_val, y_train, y_val = train_test_split(X4,y1,test_size=0.3,random_state=42)

#
clf = MultinomialNB()
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('MultinomialNB, content, y1:', cv_score, pre_score)

#
clf = LogisticRegression()
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('LogisticRegression, content, y1:', cv_score, pre_score)

MultinomialNB, title, class one: [ 0.92071197  0.93668831  0.93658537  0.93311582  0.94290375] 0.927921092564
LogisticRegression, title, class one: [ 0.94012945  0.94805195  0.93495935  0.93474715  0.93474715] 0.933990895296


In [27]:
#使用内容训练二级目录
X_train, X_val, y_train, y_val = train_test_split(X4,y2,test_size=0.3,random_state=42)

#
clf = MultinomialNB()
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('MultinomialNB, content, y2:', cv_score, pre_score)

#
clf = LogisticRegression()
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('LogisticRegression, content, y2:', cv_score, pre_score)

 


MultinomialNB, title, class one: [ 0.79133226  0.80775444  0.79967427  0.79508197  0.81116585] 0.808801213961
LogisticRegression, title, class one: [ 0.82985554  0.84006462  0.83061889  0.8147541   0.81116585] 0.829286798179
max_df=0.3,LogisticRegression, content, class one: [ 0.82664526  0.84491115  0.82899023  0.81639344  0.8226601 ] 0.825493171472


In [28]:
#限制max_df 优化
for max_df in [0.1,0.2,0.3,0.5,0.8,1.0]:
#
    print(max_df)
    X4 = CountVectorizer(stop_words=stop_words, max_df=max_df).fit_transform(content_cuted)
    X_train, X_val, y_train, y_val = train_test_split(X4,y2,test_size=0.3,random_state=42)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
    pre_score = clf.score(X=X_val, y=y_val)
    print('LogisticRegression, content, class one:', cv_score, pre_score)
      

0.1
LogisticRegression, content, class one: [ 0.82182986  0.84168013  0.82410423  0.81967213  0.81444992] 0.820940819423
0.2
LogisticRegression, content, class one: [ 0.81861958  0.84491115  0.82084691  0.81803279  0.81609195] 0.827010622155
0.3
LogisticRegression, content, class one: [ 0.82664526  0.84491115  0.82899023  0.81639344  0.8226601 ] 0.825493171472
0.5
LogisticRegression, content, class one: [ 0.82504013  0.84329564  0.8257329   0.81147541  0.81773399] 0.830804248862
0.8
LogisticRegression, content, class one: [ 0.83306581  0.83844911  0.82899023  0.81311475  0.81280788] 0.828528072838
1.0
LogisticRegression, content, class one: [ 0.82985554  0.84006462  0.83061889  0.8147541   0.81116585] 0.829286798179


In [17]:
#使用内容训练一级目录  X5，binary
X_train, X_val, y_train, y_val = train_test_split(X5,y1,test_size=0.3,random_state=42)

#
clf = MultinomialNB()
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('MultinomialNB, title, class one:', cv_score, pre_score)

#
clf = LogisticRegression()
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('LogisticRegression, title, class one:', cv_score, pre_score)


MultinomialNB, title, class one: [ 0.92718447  0.91720779  0.91869919  0.92169657  0.92822186] 0.926403641882
LogisticRegression, title, class one: [ 0.94983819  0.95292208  0.95934959  0.95106036  0.95921697] 0.959028831563


In [18]:
#使用内容训练二级目录 X5，binary
X_train, X_val, y_train, y_val = train_test_split(X5,y2,test_size=0.3,random_state=42)

#
clf = MultinomialNB()
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('MultinomialNB, title, class one:', cv_score, pre_score)

#
clf = LogisticRegression()
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('LogisticRegression, title, class one:', cv_score, pre_score)


MultinomialNB, title, class one: [ 0.75441413  0.75282714  0.747557    0.75081967  0.75369458] 0.764795144158
LogisticRegression, title, class one: [ 0.85393258  0.85137318  0.85504886  0.84590164  0.8456486 ] 0.852807283763


In [6]:
#使用内容训练二级目录 X5，binary, multinomial
X_train, X_val, y_train, y_val = train_test_split(X5,y2,test_size=0.3,random_state=42)


clf = LogisticRegression(multi_class='multinomial',solver='lbfgs', C=10)
clf.fit(X_train, y_train)
cv_score = cross_val_score(cv=5, estimator=clf, X=X_train, y=y_train)
pre_score = clf.score(X=X_val, y=y_val)
print('LogisticRegression, title, class one:', cv_score, pre_score)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB, title, class one: [ 0.75441413  0.75282714  0.747557    0.75081967  0.75369458] 0.764795144158


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

LogisticRegression, title, class one: [ 0.84911717  0.84652666  0.86482085  0.83442623  0.85714286] 0.858877086495
