In [3]:
import pyprind
import pandas as pd
import os 
pbar = pyprind.ProgBar(50000) # 重复50，000次，也就是读文件的次数
labels = {'pos':1,'neg':0}
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path ='./aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path):
            with open(os.path.join(path,file),'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review','sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:25


In [4]:
# 由于classlabel是按照顺序来组合排列的，所以在这里我们打乱这个顺序
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv',index=False)

In [5]:
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,I was fortunate enough to catch a midnight scr...,1
1,"Oh wow, the character shares my name first nam...",0
2,Just a few words.... This movie really sucks. ...,0


### bag-of-words model

In [10]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer() # 参数 ngram_range,当n_gram_range=(2,2)时候，表明的是2-gram
docs = np.array(['The sun is shining',\
                 'The weather is sweet',\
                 'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)
print(count.vocabulary_)
print(bag.toarray()) # 特征的0位置代表的是'and'，显示的是原始的词频。1-gram

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


#### tf-idf(term frequency-inverse document frequency)
tf-idf = term frequency * inverse document frequency=$tf(t,d)*idf(t,d)$  
$idf(t,d)=log\frac{n_{d}}{1+df(d,t)}$  
$n_{d}$代表的是文件的总数，$df(d,t)$是指包含t的文件d的个数，分母1是可选择项，这里允许df=0存在，log是为了让低频的文件不会有太大的权重。
scikit-learn里面的实现不一样：  
$idf(t,d)=log\frac{1+ n_{d}}{1+df(d,t)}$    
$tf-idf(t,d) = tf(t,d)*(idf(t,d)+1)$  
对最终的tf-idf进行范数为2的归一化处理

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


In [12]:
df.loc[0,'review'][-50:]

' />Take it as you want to....<br /><br />- the fed'

In [30]:
# 移除HTML符号，只保留表情字符
import re
def preprocessor(text):
    text = re.sub('<[^>]*>','',text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    #if len(emoticons)>0:
    #    print(emoticons)
    text = re.sub('[\W]+',' ',text.lower())+''.join(emoticons).replace('-','')
    return text

In [31]:
df['review'] = df['review'].apply(preprocessor)

In [35]:
# 将documents变换成tokens
def tokenizer(text):
    return text.split()

# 利用次跟方法（word stemming),也就是（Porter stemmer算法）来完成
# 将所有的词返回它的词根
# snowball stemmer 和Lancaster stemmer比porter stemmer速度更快，在nltk里面都有实现。
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

# stop-words：平时常见的词汇，基本上不带有任何有效新消息。
# 在这里移除stop-words
# nltk包里面有127个stop-words。
import nltk
nltk.download('stopwords')

# 下载完成stop-words之后，选用英语库的stop-word set
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

[nltk_data] Downloading package stopwords to /home/lily/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

In [38]:
# 利用logsitic regression model做文件情感分类
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)],\
              'vect__stop_words':[stop,None],\
               'vect__tokenizer': [tokenizer,tokenizer_porter],\
               'clf__penalty': ['l1', 'l2'],\
               'clf__C': [1.0, 10.0, 100.0]},\
              {'vect__ngram_range': [(1,1)],\
               'vect__stop_words': [stop, None],\
               'vect__tokenizer': [tokenizer, tokenizer_porter],\
               'vect__use_idf':[False],\
               'vect__norm':[None],\
               'clf__penalty': ['l1', 'l2'],\
               'clf__C': [1.0, 10.0, 100.0]}
             ]
lr_tfidf = Pipeline([('vect',tfidf),('clf',LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid, scoring='accuracy',cv=5,verbose=1,n_jobs=-1)
gs_lr_tfidf.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 39.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 53.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='ac

In [40]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7fe144762f28>} 
CV Accuracy: 0.894
Test Accuracy: 0.898


在文本分类领域，一个一直比较受欢迎的方法是朴素贝叶斯方法，在垃圾邮件分类领域的受欢迎程度很高。  
朴素贝叶斯很容易执行，计算也比较有效，比起其他的方法，在小数据集上面更加有效。  
详情查看论问Naive Bayes and Text Classification I-nitroduction and Theory.

### 下面讨论大数据的情况，在线算法以及out-of-core learning
out-of core learning: 
在计算机资源有限的情况下面，处理大数据的方法。  
类似于SGD,这里用partial_fit(位于SGDClassifier)让logistic regression模型直接从磁盘中读取文件用minibatches文件来训练。

In [62]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    # 用来清洁没有处理的原始文件
    text = re.sub('<[^>]*>','',text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower())
    text = re.sub('[\W]+',' ',text.lower())+''.join(emoticons).replace('-','')
    tokenizer = [w for w in text.split() if w not in stop]
    return tokenizer

def stream_docs(path):
    # 生成函数，一次读取一个文件
    with open(path, 'r') as csv:
        next(csv) # skip header， next属于迭代函数，取能够迭代的对象的下一个值
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

# 检查stream_docs函数
# print(next(stream_docs(path='./movie_data.csv')))

def get_minibatch(doc_stream,size):
    # 从stream_docs中获取一个文件流， 通过size参数返回特定数目的文件
    docs, y =[],[]
    try:
        for _ in range(size):
            text,label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

# 在这里 CountVectorizer和TfidfVectorizer都不能用，因为需要把所有的数据都存储到内存当中
# 这里采用 HashingVectorizer， 这个word embedding的方法是不依赖于数据的
# 通过32为 MurmurHash3算法来利用 Hashing trick
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore', n_features=2**21,\
                        preprocessor = None, tokenizer = tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

import pyprind
pbar = pyprind.ProgBar(45) # initialized the progress bar object with 45 iterations
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar.update()
    



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:22


In [63]:
X_test,y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f'%clf.score(X_test,y_test))
clf = clf.partial_fit(X_test,y_test) # 利用最后的test数据来改进模型。

Accuracy: 0.869




前面展示的是词袋模型， 虽然说它在文本分类领域用得还是比较频繁，但是它并没有考虑句子的结构和语法。  
词袋模型的一个比较受欢迎的延伸是 Latent Dirichlet allocation，考虑了句子的潜在词义。  

现在一个比较受欢迎的方法是google在2013年提出的word2vec模型。 这是一个基于NN的非监督学习：  
理论： 将相似词汇的意思放到同一个类别当中， 通过有效的矢量空间， 一个模型可以重新产生特定的词汇基于简单的矢量数学运算，例如：  
king-man+women=queen

###  下面讨论将Machine learning与Web application结合

In [66]:
# 序列化拟合scikit-learn estimators
import pickle
import os
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop, open(os.path.join(dest,'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)# protocol=4 4是最近并且最有效的pickle protocol
# 实际中， logistic regression model中包含NumPy arrays， 最能够有效的序列化NumPy arrays 的是joblib 库

In [None]:
# 因为HashingVectorizer不需要拟合，所以不需要存储它，
# 相应的，我们创建一个Python的脚本文件，在这里，我们可以将vectorizer引进现在的Python session
# 将以下的code存储到movieclassifier文件中命名为vectorizer.py

from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(os.join(cur_dir,'pkl_objects','stopwords.pkl'), 'rb'))

def tokenizer(text):
    text = re.sub('<[^>]*>','',text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower())
    text = re.sub('[\W]+',' ',text.lower())+''.join(emoticons).replace('-','')
    tokenizer = [w for w in text.split() if w not in stop]
    return tokenizer

vect = HashingVectorizer(decode_error='ignore', n_features=2**21,\
                        preprocessor = None, tokenizer = tokenizer)

In [None]:
# 以下代码运行需要注意文件的位置！！！需要在movieclassifier文件下运行
# test if we can use the Python object

import pickle
import re
import os
from vectorizer import vect
clf = pickle.load(open(os.path.join('pkl_objects','classifier.pkl'),'rb'))

import numpy as np
label = {0:'negative',1:'positive'}
example = ['I love this movie']
X = vect.transform(example)
print('Prediction:%s\nProbability:%.2f%%' %\
     (label[clf.predict(X)[0]], np.max(clf.predict_proba(X)*100)))

In [70]:
path = os.path.join('1st_flask_app_1','templates')
if not os.path.exists(path):
    os.makedirs(path)

In [1]:
import os
directory = os.path.join('1st_flask_app_2','templates')
if not os.path.exists(directory):
    os.makedirs(directory)

In [2]:
direc = os.path.join('1st_flask_app_2','static')
if not os.path.exists(direc):
    os.makedirs(direc)

In [4]:
direc = os.path.join('movieclassifier','templates')
if not os.path.exists(direc):
    os.makedirs(direc)