In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
import platform
import datetime,pytz

root_ = '/content/drive/My Drive/colab/' if platform.system() == 'Linux' else '/Users/love/Test/'

WeiboSentiment_ = os.path.join(root_, 'WeiboSentiment')
if not os.path.exists(WeiboSentiment_):
    os.makedirs(WeiboSentiment_)

model_ = os.path.join(WeiboSentiment_, 'model')
if not os.path.exists(model_):
    os.makedirs(model_)


import jieba
import re
import numpy as np

def tokenize(text):
    """
    带有语料清洗功能的分词函数, 包含数据预处理, 可以根据自己的需求重载
    """
    text = re.sub("\{%.+?%\}", " ", text)           # 去除 {%xxx%} (地理定位, 微博话题等)
    text = re.sub("@.+?( |$)", " ", text)           # 去除 @xxx (用户名)
    text = re.sub("【.+?】", " ", text)              # 去除 【xx】 (里面的内容通常都不是用户自己写的)
    icons = re.findall("\[.+?\]", text)             # 提取出所有表情图标
    text = re.sub("\[.+?\]", "IconMark", text)      # 将文本中的图标替换为`IconMark`

    tokens = []
    for k, w in enumerate(jieba.lcut(text)):
        w = w.strip()
        if "IconMark" in w:                         # 将IconMark替换为原图标
            for i in range(w.count("IconMark")):
                tokens.append(icons.pop(0))
        elif w and w != '\u200b' and w.isalpha():   # 只保留有效文本
                tokens.append(w)
    return tokens


def load_curpus(path):
    """
    加载语料库
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            [_, seniment, content] = line.split(",", 2)
            content = tokenize(content)             # 分词
            data.append((content, int(seniment)))
    return data


Mounted at /content/drive


#### 加载数据

In [2]:
import pandas as pd
train_data = load_curpus(os.path.join(WeiboSentiment_, 'train.txt'))
test_data = load_curpus(os.path.join(WeiboSentiment_, 'test.txt'))
train_df = pd.DataFrame(train_data, columns=["content", "sentiment"])
test_df = pd.DataFrame(test_data, columns=["content", "sentiment"])

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.057 seconds.
Prefix dict has been built successfully.


加载停用词

In [3]:
stopwords = []
with open(os.path.join(WeiboSentiment_, 'stopwords.txt'), "r", encoding="utf8") as f:
    for w in f:
        stopwords.append(w.strip())

TfIdf

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
data_str = [" ".join(content) for content, sentiment in train_data] + \
            [" ".join(content) for content, sentiment in test_data]
tfidf = TfidfVectorizer(token_pattern='\[?\w+\]?', stop_words=stopwords)
tfidf_fit = tfidf.fit_transform(data_str)

  'stop_words.' % sorted(inconsistent))


加载之前训练好的FastText模型

In [5]:
!pip install gensim



In [6]:
from gensim.models import FastText
model = FastText.load(os.path.join(model_, 'model_100.txt'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


最多只保留Tf-Idf最高的前多少个词

In [7]:
key_words = 30

#### 用每个词的Tfidf作为权重, 对FastText词向量进行加权, 得到表征每个句子的向量

In [8]:
X_train, y_train = [], []
for content, sentiment in train_data:
    X, y = [], sentiment
    X_tfidf = tfidf.transform([" ".join(content)]).toarray()
    keywords_index = np.argsort(-X_tfidf)[0, :key_words]
    for w in content:
        if w in model and w in tfidf.vocabulary_ and tfidf.vocabulary_[w] in keywords_index:
            X.append(np.expand_dims(model[w], 0) * X_tfidf[0, tfidf.vocabulary_[w]])
    if X:
        X = np.concatenate(X)
        X = np.mean(X, axis=0)
        X_train.append(X)
        y_train.append(y)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  impor

In [9]:
X_test, y_test = [], []
for content, sentiment in test_data:
    X, y = [], sentiment
    X_tfidf = tfidf.transform([" ".join(content)]).toarray()
    keywords_index = np.argsort(-X_tfidf)[0, :key_words]
    for w in content:
        if w in model and w in tfidf.vocabulary_ and tfidf.vocabulary_[w] in keywords_index:
            X.append(np.expand_dims(model[w], 0) * X_tfidf[0, tfidf.vocabulary_[w]])
    if X:
        X = np.concatenate(X)
        X = np.mean(X, axis=0)
        X_test.append(X)
        y_test.append(y)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  import sys
  
  impor

### SVM

In [10]:
from sklearn import svm
clf = svm.SVC(C=1, class_weight={1: .95, 0: 1.})
clf.fit(X_train, y_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight={0: 1.0, 1: 0.95},
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [11]:
result = clf.predict(X_test)

In [12]:
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

print('acc:',accuracy_score(y_test, result))
print('pc',precision_score(y_test, result))
print('rc:',recall_score(y_test, result))
print('f1:',f1_score(y_test, result))

acc: 0.7672947967065614
pc 0.7694309216048346
rc: 0.77014198101319
f1: 0.7697862871058488
