In [1]:
import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1,'neg':0}
df = pd.DataFrame()
###############
#文件结构
#├── test
#│   ├── neg
#│   └── pos
#└── train
#    ├── neg
#    ├── pos
#    └── unsup
################
for s in ('test','train'):
    for l in ('pos','neg'):
        path = '/home/lzjqsdd/Documents/aclImdb/%s/%s' % (s,l)
        for file in os.listdir(path):
            with open(os.path.join(path,file),'r') as infile:
                txt = infile.read()
            df = df.append([[txt,labels[l]]],ignore_index=True)
            pbar.update()
df.columns = ['review','sentiment']

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:06:46


In [2]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('/home/lzjqsdd/Documents/aclImdb/movie_data.csv')

In [3]:
df = pd.read_csv('/home/lzjqsdd/Documents/aclImdb/movie_data.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,11841,"As a French, i found it very pleasant to be ab...",1
1,19602,I watched this movie and all I can say is this...,0
2,45519,"In this forgettable trifle, the 40-ish Norma S...",0


In [4]:
# clean text data
# 文本中包含html标记
# 使用正则表达式进行提取
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    text = re.sub('[\W]+', ' ', text.lower())+ ''.join(emoticons).replace('-', '')
    return text

preprocessor(df.loc[0,'review'][-100:])

'lieve in the characters etc esthetically and musically it s a success too go see it if you can '

In [5]:
df['review'] = df['review'].apply(preprocessor) #对每个记录均做了处理。

In [6]:
def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [7]:
# 把单词转换为原始形式
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [8]:
# 去除停用词
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lzjqsdd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]


['runner', 'like', 'run', 'run', 'lot']

In [10]:
X_train = df.loc[:25000,'review'].values
y_train = df.loc[:25000,'sentiment'].values
X_test = df.loc[25000:,'review'].values
y_test = df.loc[25000:,'sentiment'].values
X_test.shape

(25000,)

In [11]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
param_grid = [
    {
        'vect__ngram_range':[(1,1)],
        'vect__tokenizer':[tokenizer,tokenizer_porter],
        'clf__penalty':['l1','l2'],
        'clf__C':[1.0,10.0,100.0]
    },
    {'vect__ngram_range': [(1,1)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer,tokenizer_porter],
     'vect__use_idf':[False],
     'vect__norm':[None],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [1.0, 10.0, 100.0]
    }
]

lr_tfidf = Pipeline([
        ('vect',tfidf),
        ('clf',LogisticRegression(random_state=0))
    ])

gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid,scoring='accuracy',cv=5,
                          verbose=1,n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 39.7min finished


Best parameter set: {'vect__ngram_range': (1, 1), 'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__tokenizer': <function tokenizer at 0x7feb28651620>} 


In [12]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

CV Accuracy: 0.893


In [13]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test,y_test))

Test Accuracy: 0.898


## On-Line算法

In [19]:
import numpy as np
import re
from nltk.corpus import stopwords
stop= stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower())
    text = re.sub('[\W]+',' ',text.lower())+' '.join(emoticons).replace('-','')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with open(path,'r') as csv:
        next(csv) #跳过头
        for line in csv:
            text,label = line[:-3],int(line[-2]) #因为最后有个换行符
            yield text,label

In [20]:
#next(stream_docs(path='/home/lzjqsdd/Documents/aclImdb/movie_data.csv'))

def get_minibatch(doc_stream,size):
    docs,y = [],[]
    try:
        for _ in range(size):
            text,label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None,None
    return docs,y

In [21]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                        n_features=2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)

clf = SGDClassifier(loss='log',random_state=1,n_iter=1)
doc_stream  = stream_docs(path='/home/lzjqsdd/Documents/aclImdb/movie_data.csv')

In [22]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train,y_train = get_minibatch(doc_stream,size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:38


In [23]:
X_test,y_test = get_minibatch(doc_stream,size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test,y_test))

Accuracy: 0.868


### Chapter9 嵌入到web中

In [25]:
import pickle
import os
dest = os.path.join('/home/lzjqsdd/Documents/aclImdb/movieclassifier','pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop,open(os.path.join(dest,'stopwords.pkl'),'wb'),protocol=4)
pickle.dump(clf,open(os.path.join(dest,'classifier.pkl'),'wb'),protocol=4)
