# 文本分类-情感分析



In [1]:
from matplotlib import pyplot as plt
import jieba # 分词
import re # 正则
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np



In [8]:
def read_data(path, is_pos=None, encoding="utf-8"):
    """
    Read a review file and return its texts together with their labels.

    The parser looks for blocks of the form::

        <review ... label="1">
        one or more lines of review text
        </review>

    Blank lines are skipped, text outside a review block is ignored, and the
    lines belonging to one review are joined with a single space.

    path: path to the data file.
    is_pos: True labels every review 1, any other non-None value labels
        every review 0, and None (the default) keeps the labels parsed from
        the opening tags.
    encoding: text encoding of the file. It is passed explicitly so that
        reading Chinese text does not depend on the platform default.
    return: (list of review texts, list of labels)
    """
    reviews, labels = [], []
    with open(path, 'r', encoding=encoding) as file:
        in_review = False
        review_lines = []
        for line in file:
            line = line.strip()
            if not line:
                continue
            if not in_review:
                # Only an opening tag is meaningful outside a review block.
                if line.startswith("<review"):
                    in_review = True
                    if "label" in line:
                        # The label is the last double-quoted value on the
                        # opening-tag line.
                        labels.append(int(line.split('"')[-2]))
                continue
            if line == "</review>":
                in_review = False
                reviews.append(" ".join(review_lines))
                review_lines = []
            else:
                review_lines.append(line)
    if is_pos is not None:
        # The caller states the polarity of the whole file; this overrides
        # any labels parsed from the tags.
        labels = [1 if is_pos else 0] * len(reviews)
    return reviews, labels


def process_file(data_dir="data_sentiment"):
    """
    Load the training and test reviews found in ``data_dir``.

    The positive and negative training files are read separately, labelled
    1 and 0 respectively, and concatenated (positives first). The combined
    test file keeps the labels parsed from its own review tags. No text
    cleaning happens here.

    data_dir: directory holding train.positive.txt, train.negative.txt and
        test.combined.txt. Defaults to the original hard-coded location.
    return: (train_comments, train_labels, test_comments, test_labels)
    """
    import os  # local import so this block does not rely on file-level imports

    train_pos_file = os.path.join(data_dir, "train.positive.txt")
    train_neg_file = os.path.join(data_dir, "train.negative.txt")
    test_comb_file = os.path.join(data_dir, "test.combined.txt")

    # Training files carry a single polarity each, so the label is forced.
    train_pos_cmts, train_pos_lbs = read_data(train_pos_file, True)
    train_neg_cmts, train_neg_lbs = read_data(train_neg_file, False)
    train_comments = train_pos_cmts + train_neg_cmts
    train_labels = train_pos_lbs + train_neg_lbs

    # The test file mixes both polarities; labels come from the tags.
    test_comments, test_labels = read_data(test_comb_file)
    return train_comments, train_labels, test_comments, test_labels
# Load the raw reviews and labels once; later cells read these module-level names.
train_comments, train_labels, test_comments, test_labels = process_file()

In [9]:
# Sizes of the training and test sets
print(len(train_comments),len(test_comments))
# Show one training review together with its label
print(train_comments[1],train_labels[1])

8064 2500
手感超好，而且黑色相比白色在转得时候不容易眼花，找童年的记忆啦。 1


In [11]:
def load_stopwords(path, encoding="utf-8"):
    """
    Load stop words from an external file, one entry per line.

    Each line is stripped of surrounding whitespace before being stored, so
    a blank line contributes the empty string to the result.

    path: path to the stop-word file.
    encoding: text encoding of the file. It is passed explicitly so that
        reading does not depend on the platform default.
    return: set of stop words
    """
    with open(path, 'r', encoding=encoding) as in_file:
        return {line.strip() for line in in_file}


def clean_non_chinese_symbols(text):
    """
    Normalize punctuation and replace non-Chinese symbols with a placeholder.

    Steps, applied in order:
      1. a run of "!" / "！" becomes a single "!"
      2. a run of "?" / "？" becomes a single "?"
      3. a run of ASCII letters and the listed ASCII / full-width
         punctuation becomes " UNK "
      4. a run of whitespace becomes a single space

    Digits are not touched by this function.

    text: the raw string to clean.
    return: the cleaned string
    """
    # Raw strings keep regex escapes such as \s from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'[!！]+', "!", text)
    text = re.sub(r'[?？]+', "?", text)
    # "-" is escaped so it is matched literally instead of forming a range.
    text = re.sub(r"[a-zA-Z#$%&'()*+,\-./:;：<=>@，。★、…【】《》“”‘’[\]^_`{|}~]+", " UNK ", text)
    return re.sub(r"\s+", " ", text)

def clean_numbers(text):
    """
    Replace every run of digits with the placeholder " NUM ".

    For example "128" and "190" each become " NUM " (with the surrounding
    spaces).

    text: the string to process.
    return: the string with digit runs replaced
    """
    # Raw string so that \d reaches the regex engine instead of being
    # treated as an invalid Python string escape.
    return re.sub(r"\d+", ' NUM ', text)

def preprocess_text(text, stopwords):
    """
    Run the full cleaning pipeline on one piece of text.

    The text is first passed through clean_non_chinese_symbols and
    clean_numbers, then segmented with jieba. Empty tokens and tokens found
    in ``stopwords`` are dropped; the remaining tokens are joined with a
    single space.

    text: the raw string to process.
    stopwords: collection of tokens to discard.
    return: the space-joined string of kept tokens
    """
    cleaned = clean_numbers(clean_non_chinese_symbols(text))
    kept_tokens = []
    for token in jieba.cut(cleaned):
        if not token or token in stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [12]:
# Load the stop-word set that is passed to preprocess_text in the next cell.
path_stopwords = "./data_sentiment/stopwords.txt"
stopwords = load_stopwords(path_stopwords)

In [13]:
# Clean the strings in train_comments and test_comments. Points considered:
#   1. Stop-word filtering
#   2. Removing special symbols
#   3. Removing numbers (e.g. prices)
#   Note: the reviews themselves are short, so removing too much may well
#   leave a string of length 0.
train_comments_new = [preprocess_text(comment,stopwords)for comment in train_comments]
test_comments_new = [preprocess_text(comment,stopwords)for comment in test_comments]
# Spot-check one processed review from each set
print(train_comments_new[1],test_comments_new[2])

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/fq/0c740bxn4cv9b75d33p_zb800000gn/T/jieba.cache
Loading model cost 1.288 seconds.
Prefix dict has been built succesfully.


手感 超好   UNK   黑色 相比 白色 转得 不 容易 眼花   UNK   找 童年 记忆   UNK   袁阔成 先生 当今 评书 界 泰斗   UNK   十二 金钱 镖 代表作


In [23]:
# Extract TF-IDF features from the text and convert them to vector form.
# The vectorizer is fitted on the training reviews only; the test reviews
# are transformed with that same fitted vectorizer.
# NOTE(review): TfidfVectorizer's default token_pattern presumably keeps only
# tokens of two or more word characters, which would drop single-character
# Chinese words — confirm whether that is intended.
tfidf = TfidfVectorizer()
x_train = tfidf.fit_transform(train_comments_new) # features of the training data
y_train = train_labels # labels of the training data
x_test = tfidf.transform(test_comments_new)
y_test = test_labels
print(x_train)

  (0, 5200)	0.3012395527334206
  (0, 15250)	0.20661027286123507
  (0, 11999)	0.23051897504645855
  (0, 18281)	0.32722494951810027
  (0, 8602)	0.5013023568192346
  (0, 9574)	0.2219978911106752
  (0, 20294)	0.20211546467356176
  (0, 18550)	0.24678500238628445
  (0, 1)	0.047949259530463625
  (0, 10618)	0.3845315241159845
  (0, 4011)	0.30715504903861346
  (0, 22983)	0.24393297813553635
  (1, 1)	0.14922217841421276
  (1, 10582)	0.2842243363370056
  (1, 20321)	0.32579457084637553
  (1, 23040)	0.294164548754424
  (1, 16216)	0.29058894421029297
  (1, 16002)	0.2889070654128061
  (1, 20540)	0.3815109596433837
  (1, 8122)	0.23071460346703676
  (1, 16422)	0.3988982816378282
  (1, 16937)	0.28728866910276246
  (1, 19500)	0.30972681181022943
  (3, 2279)	0.6372913295541118
  (3, 2873)	0.7706229696000194
  :	:
  (8062, 3091)	0.41008521989983115
  (8062, 1398)	0.3359737607019133
  (8062, 12774)	0.35849538595302105
  (8063, 1)	0.16879153341921777
  (8063, 3520)	0.09382100806643663
  (8063, 12790)	0.13438

In [26]:
# Train a naive Bayes model on the training data
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# Fit on the TF-IDF training features, then score predictions on the test set.
clf = MultinomialNB()
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print("accuracy on test data:", accuracy_score(y_test, y_pred))

accuracy on test data: 0.6368


In [29]:
# Nearest-neighbour classifier (k=1) on the unscaled TF-IDF features.
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("accuracy on test data: ", accuracy_score(y_test, y_pred))

accuracy on test data:  0.524


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Standardize each feature before the distance-based model. The TF-IDF
# matrices are converted with toarray() first, so the data is no longer
# sparse from here on.
normalizer = StandardScaler()
x_train_normalized = normalizer.fit_transform(x_train.toarray())
x_test_normalized = normalizer.transform(x_test.toarray())

# The targets are discrete class labels, so a classifier is required: a
# KNeighborsRegressor with k=3 returns averaged, fractional values that
# accuracy_score cannot compare against the binary test labels.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train_normalized, y_train)

# Predict the sentiment label of each test review.
y_pred = knn.predict(x_test_normalized)
print("accuracy on test data: ", accuracy_score(y_test, y_pred))

In [None]:
# Logistic regression on the sparse TF-IDF features.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='liblinear')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("accuracy on test data: ", accuracy_score(y_test, y_pred))