In [13]:
import pandas as pd
import numpy as np
import jieba
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets

In [7]:
fname = 'C:/Users/MWHREVO/Desktop/人工智能与NLP/jupyters_and_slides/2019-summer/sqlResult_1558435.csv'

In [9]:
data = pd.read_csv(fname, encoding='gb18030')

In [10]:
data['source'] = data['source'].fillna('')
data_pos = data[data['source'].str.contains('新华')]
data_neg = data[data['source'].str.contains('新华') == 0]

N_sample = 6000
data_neg = data_neg.sample(n = (int)(N_sample/2))
data_pos = data_pos.sample(n = (int)(N_sample/2))

data_sampled = data_pos.append(data_neg)
print(len(data_sampled))


6000


In [15]:
attributes = ['content','feature', 'title','author','url']
attr_choosen = ['content']
def cut(string):
    return ' '.join(jieba.cut(re.sub(r'[^\w\s]',' ',string.strip())))
def preprocess(data):
    for attr in attributes:
        data[attr] = data[attr].fillna('').apply(cut)
    data['is_xinhua'] = np.where(data['source'].str.contains('新华'),1,0)
    #print(data)
    x_inputs = data.loc[:,attr_choosen]
    #print(x_inputs)
    y_inputs = data['is_xinhua'].values
    #print(y_inputs)
    return x_inputs,y_inputs
x_inputs, y_inputs = preprocess(data_sampled)       
#print(data_processed)

In [16]:
def vectorize(x_inputs, attr_choosen):
    vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r"(?u)\b\w+\b", max_df = 1.0,
                                 stop_words = None,vocabulary = None )# (?u)进入re.u，unicode匹配模式，https://stackoverflow.com/questions/35043085/what-does-u-do-in-a-regex
    vec = None
    for attr in attr_choosen:
        vec_fea = vectorizer.fit_transform(x_inputs[attr].values)
        print("vocabulary\n",vectorizer.vocabulary_, len(vectorizer.vocabulary_))
        print("stop_words\n",vectorizer.stop_words_, len(vectorizer.stop_words_))
        #typeof(vec_fea)
        #np_vec = np.array(vec_fea)
        np_vec = vec_fea.toarray()
        #print(np_vec)
        if vec is None:
            vec = np_vec
        else:
            vec = np.hstack((vec,np_vec))
        #print(attr, vec_fea.shape, np_vec.shape)
    return vec

x_vecs = vectorize(x_inputs, attr_choosen)
print(x_vecs.shape)

vocabulary
 {'新华社': 2818, '照片': 3409, '外代': 1763, '2017': 59, '年': 2199, '6': 113, '月': 2969, '2': 44, '日': 2867, 'n': 176, '二线': 587, '网球': 3945, '法网': 3271, '晋级': 2923, 'n6': 181, '1': 9, '比赛': 3197, '中': 436, '回球': 1606, '当日': 2312, '在': 1661, '法国巴黎': 3263, '举行': 520, '的': 3589, '法国': 3262, '公开赛': 967, '第二轮': 3804, '选手': 4609, '比': 3189, '0': 0, '战胜': 2487, '德国': 2358, '第三轮': 3798, '欧新': 3148, '４': 4977, '２': 4975, '３': 4976, '日电': 2876, '记者': 4275, '汉语': 3230, '桥': 3119, '世界': 385, '大学生': 1803, '中文': 462, '中国': 447, '驻': 4925, '大使馆': 1791, '本次': 3030, '主题': 515, '为': 487, '梦想': 3123, '未来': 3014, '分为': 1088, '演讲': 3374, '知识': 3662, '和': 1547, '表演': 4185, '个': 430, '环节': 3478, '来自': 3076, '大学': 1802, '学院': 1927, '语言': 4329, '等': 3807, '机构': 3046, '０': 4973, '多名': 1776, '参加': 1349, '了': 560, '们': 713, '从': 685, '各自': 1469, '生活': 3525, '讲述': 4280, '自己': 4046, '与': 369, '观众': 4218, '分享': 1089, '喜欢': 1582, '通过': 4627, '学习': 1919, '故事': 2757, '则': 1117, '希望': 2166, '成为': 2452, '一名': 209, 

In [17]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(
    x_vecs , y_inputs, train_size = 0.8, test_size=0.2)

In [19]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

def get_performance(model, x_, y_):
    y_hat = model.predict(x_)
    print('f1_score is: {}'.format(f1_score(y_, y_hat)))
    print('accuracy is: {}'.format(accuracy_score(y_, y_hat)))
    print('percision is: {}'.format(precision_score(y_, y_hat)))
    print('recall is: {}'.format(recall_score(y_, y_hat)))
    #print('roc_auc is: {}'.format(roc_auc_score(y_, y_hat)))
    #print('confusion matrix: \n{}'.format(confusion_matrix(y_, y_hat, labels=[0, 1])))

In [20]:
def train_model(model):
    model.fit(x_train, y_train) # fit model
    return get_performance(model, x_test, y_test)

In [21]:
model = LogisticRegression()
res_score = train_model(model)



f1_score is: 0.9822926374650512
accuracy is: 0.9841666666666666
percision is: 1.0
recall is: 0.9652014652014652


In [22]:
model = LogisticRegression()
res_score = train_model(model)

f1_score is: 0.9822926374650512
accuracy is: 0.9841666666666666
percision is: 1.0
recall is: 0.9652014652014652




In [23]:
for i in np.linspace(0.1, 2, 5):
    model = LogisticRegression(C=i)
    res_score = train_model(model)
    print('*'*100)



f1_score is: 0.9590865842055185
accuracy is: 0.9641666666666666
percision is: 0.998019801980198
recall is: 0.9230769230769231
****************************************************************************************************
f1_score is: 0.9785247432306257
accuracy is: 0.9808333333333333
percision is: 0.9980952380952381
recall is: 0.9597069597069597
****************************************************************************************************




f1_score is: 0.9822926374650512
accuracy is: 0.9841666666666666
percision is: 1.0
recall is: 0.9652014652014652
****************************************************************************************************
f1_score is: 0.9832402234636871
accuracy is: 0.985
percision is: 1.0
recall is: 0.967032967032967
****************************************************************************************************
f1_score is: 0.9841860465116279
accuracy is: 0.9858333333333333
percision is: 1.0
recall is: 0.9688644688644689
****************************************************************************************************




In [24]:
model = MultinomialNB()
res_score = train_model(model)

f1_score is: 0.8118811881188119
accuracy is: 0.81
percision is: 0.7387387387387387
recall is: 0.9010989010989011


In [25]:
model = DecisionTreeClassifier(max_depth=50, max_features=3000)
res_score = train_model(model)

f1_score is: 0.995433789954338
accuracy is: 0.9958333333333333
percision is: 0.9927140255009107
recall is: 0.9981684981684982
