In [19]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack

# Path of the training data file, read later as tab-separated text.
# NOTE(review): despite the "dir_" prefix this is a file path, not a directory.
dir_all_data = "data/train.tsv"

In [2]:
# Load the full labelled dataset from the tab-separated training file.
data_all = pd.read_csv(dir_all_data, sep='\t')

# Sanity checks: overall shape, column labels, and the first two rows.
# A previous run reported shape (156060, 4) and the columns
# ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'].
print("data_all.shape: ", data_all.shape)
print("data_all.keys: ", data_all.columns)
print(data_all.head(2))

data_all.shape:  (156060, 4)
data_all.keys:  Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')
   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   

   Sentiment  
0          1  
1          2  


In [3]:
# Pull out the two columns used for modelling:
# the phrase text (model input) and its sentiment label (target).
x_all, y_all = data_all['Phrase'], data_all['Sentiment']

# One shape per line; a previous run printed (156060,) for both.
print(x_all.shape, y_all.shape, sep='\n')

(156060,)
(156060,)


In [4]:
# Split the data into train / validation / test sets (60% / 20% / 20% of all rows).
# `train_test_split` is already imported in the notebook's import cell.
#
# A fixed `random_state` makes the split reproducible across kernel restarts.
# Without it, the vocabulary sizes and the accuracy reported in later cells
# change on every run.

# Step 1: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0)

# Step 2: take 25% of the remaining 80% (i.e. 20% overall) as the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=0)

print(x_train.shape, x_val.shape, x_test.shape)   # (93636,) (31212,) (31212,) for the 156060-row dataset

(93636,) (31212,) (31212,)


In [13]:
# Bag-of-words count features: one row per phrase, one column per vocabulary term,
# each cell holding how many times that term occurs in that phrase.
# CountVectorizer is used with its default settings here (no stop-word list is passed).
count_vect = CountVectorizer()

# Learn the vocabulary from the training phrases only, then reuse it for the test phrases
# so both matrices share the same columns.
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)

# Prints both (rows, vocabulary size) shapes on one line. The vocabulary size depends on
# which phrases land in the training split (earlier runs showed 15188 and 15202 columns).
print(*(matrix.shape for matrix in (x_train_counts, x_test_counts)))

# `vocabulary_` is a dict mapping each term to its column index in the count matrix
# (not to a frequency); `.get` yields None if the term is absent.
print(count_vect.vocabulary_.get('good'))

# To inspect dense counts, call `.toarray()` on a small slice only; the full matrix is very large when densified:
# x_train_counts.toarray()

(93636, 15202) (31212, 15202)
5808


array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# Word-level TF-IDF features, with the vocabulary capped at 50000 terms.
# NOTE(review): the name `tfidf_transformer` is reused by the n-gram cell that follows,
# which overwrites this fitted vectorizer.
tfidf_transformer = TfidfVectorizer(analyzer='word', max_features=50000)

# Fit on the training phrases and vectorize them in a single pass,
# then apply the fitted vectorizer to the test phrases.
x_train_tfidf_word = tfidf_transformer.fit_transform(x_train)
x_test_tfidf_word = tfidf_transformer.transform(x_test)

# Both shapes on one line: (rows, number of word features).
print(*(matrix.shape for matrix in (x_train_tfidf_word, x_test_tfidf_word)))

(93636, 15202) (31212, 15202)


In [17]:
# N-gram-level TF-IDF features: word bigrams and trigrams (ngram_range=(2, 3)),
# with the vocabulary capped at 50000 n-grams.
# Conceptually, term frequency (tf) is a term's count in a document relative to the
# document's length; TF-IDF additionally down-weights terms that occur in many documents.
tfidf_transformer = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=50000)

# Fit on the training phrases and vectorize them in a single pass,
# then apply the fitted vectorizer to the test phrases.
x_train_tfidf_ngram = tfidf_transformer.fit_transform(x_train)
x_test_tfidf_ngram = tfidf_transformer.transform(x_test)

# Both shapes on one line: (rows, number of n-gram features).
print(*(matrix.shape for matrix in (x_train_tfidf_ngram, x_test_tfidf_ngram)))

(93636, 50000) (31212, 50000)


In [18]:
# Combine the three feature families column-wise into one sparse design matrix:
#   raw counts | word-level TF-IDF | 2-3-gram TF-IDF
# Train and test are stacked in the same order so their columns line up.
#
# format='csr' is requested explicitly: scipy's hstack otherwise returns COO, which
# does not support row indexing; CSR can be row-shuffled and fed to the estimator
# without a further format conversion.
train_features = hstack(
    [x_train_counts, x_train_tfidf_word, x_train_tfidf_ngram], format='csr')
test_features = hstack(
    [x_test_counts, x_test_tfidf_word, x_test_tfidf_ngram], format='csr')

print(train_features.shape)   # final feature dimensionality: (training rows, total columns)

(93636, 80404)


In [30]:
# Choose the classifier to train.

# Logistic regression (the active choice).
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(
    multi_class='multinomial',  # multi-class scheme: 'multinomial' or 'ovr'
    solver='sag',               # optimizer: liblinear, lbfgs, newton-cg or sag
    random_state=0,
)

# Alternative: multinomial naive Bayes.
#from sklearn.naive_bayes import MultinomialNB
#clf = MultinomialNB().fit(train_features, y_train)


# Alternative: SGDClassifier, a family of linear models fitted by stochastic gradient
# descent; its default loss corresponds to a linear SVM.
# from sklearn.linear_model import SGDClassifier
# clf = SGDClassifier(alpha=0.001,
#                     loss='log',    # 'hinge' gives an SVM, 'log' gives logistic regression
#                     early_stopping=True,
#                     eta0=0.001,
#                     learning_rate='adaptive', # constant, optimal, invscaling or adaptive
#                     max_iter=100
#                    )

In [31]:
# Shuffle the training rows, then fit the classifier.
from sklearn.utils import shuffle

# Features and labels are passed together so they receive the same permutation and
# stay aligned. The shuffle is seeded so the training order, and therefore the
# fitted model and the reported accuracy, is reproducible.
# Note: this rebinds `train_features` and `y_train` to their shuffled versions.
train_features, y_train = shuffle(train_features, y_train, random_state=0)

# Left as the cell's last expression so the fitted estimator's repr is displayed.
clf.fit(train_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [32]:
# Evaluate on the test set: predict a sentiment label for every test row.
predict = clf.predict(test_features)

# Accuracy = fraction of test rows whose predicted label equals the true label
# (mean of the element-wise boolean comparison).
accuracy = np.mean(predict == y_test)
print(accuracy)

0.6570549788542868
