In [17]:
import pyprind
import pandas as pd
import os
import codecs


# Root of the extracted IMDb archive. Built with os.path.join so the
# notebook is not tied to Windows path separators.
basepath = os.path.join('aclImdb_v1', 'aclImdb')

# Sub-directory name -> numeric sentiment label.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

# Collect the rows in a plain list and build the DataFrame once at the end:
# concatenating inside the loop re-copies the whole frame for every file.
records = []
for split in ('test', 'train'):
    for sentiment_dir in ('pos', 'neg'):
        path = os.path.join(basepath, split, sentiment_dir)
        # sorted() keeps the row order deterministic across file systems.
        for fname in sorted(os.listdir(path)):
            with codecs.open(os.path.join(path, fname), 'r', 'utf_8_sig') as infile:
                txt = infile.read()
            records.append((txt, labels[sentiment_dir]))
            pbar.update()

df = pd.DataFrame(records, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:04:25


In [20]:
import numpy as np


# Shuffle the rows reproducibly (fixed NumPy seed), then persist the
# shuffled frame as a UTF-8 CSV without the index column.
np.random.seed(42)
df = df.reindex(index=np.random.permutation(df.index))
df.to_csv(
    'movie_data.csv',
    index=False,
    encoding='utf-8',
)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer


# Fit a bag-of-words model on three toy sentences and show the learned
# term -> column-index mapping.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])
count = CountVectorizer()
bag = count.fit_transform(docs)

print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [23]:
# Show the term counts as a dense array: one row per document, one column
# per index in count.vocabulary_.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [24]:
from sklearn.feature_extraction.text import TfidfTransformer


# Re-weight the raw counts with smoothed, L2-normalised tf-idf and print the
# result with two decimals of precision.
np.set_printoptions(precision=2)
tfidf = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)
print(tfidf.fit_transform(bag).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [27]:
def tokenizer(text):
    """Split ``text`` on whitespace and return the lower-cased tokens."""
    return [word.lower() for word in text.split()]

tokenizer('The sun is shining')

['the', 'sun', 'is', 'shining']

In [28]:
from nltk.stem.porter import PorterStemmer


# Shared stemmer instance used by tokenizer_porter.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Tokenize ``text`` with ``tokenizer`` and Porter-stem every token."""
    stems = []
    for word in tokenizer(text):
        stems.append(porter.stem(word))
    return stems

tokenizer_porter('The sun is shining')

['the', 'sun', 'is', 'shine']

In [29]:
import nltk


# Download the NLTK 'stopwords' corpus so nltk.corpus.stopwords can be loaded.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\My
[nltk_data]     Computer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [30]:
from nltk.corpus import stopwords


# English stop-word list (also reused by the grid search as a candidate value).
stop = stopwords.words('english')
# Stem the sample sentence, keep at most its last ten tokens and drop stop words.
list(filter(lambda token: token not in stop,
            tokenizer_porter('The sun is shining')[-10:]))

['sun', 'shine']

In [31]:
# Positional 50/50 split. The frame was shuffled with reindex(), so its index
# labels are in random order: a label-based `.loc[:25000]` would cut wherever
# label 25000 happened to land and would put that row in both halves.
# `.iloc` slices by position, giving exactly 25000 training rows and a
# disjoint test set.
X_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values
X_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer # комбинация CountVectorizer и TfidfTransformer


# TfidfVectorizer combines counting and tf-idf weighting in one step.
# Lower-casing is disabled here because both custom tokenizers already
# lower-case their input.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Two sub-grids: the first uses tf-idf weighting with the vectorizer's default
# settings, the second switches idf and normalisation off (raw term counts).
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

# The grid searches both 'l1' and 'l2' penalties. The 'lbfgs' solver cannot
# fit an l1 penalty, so every l1 candidate would fail; 'liblinear' supports
# both.
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=42, solver='liblinear'))])
# verbose prints progress while the search runs.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, cv=5, scoring='accuracy', verbose=21, n_jobs=1)
# This takes a very long time: large vocabulary and many grid combinations.
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameters and their cross-validated accuracy,
# then score the best pipeline found by the search on the held-out reviews.
best_params = gs_lr_tfidf.best_params_
best_cv_score = gs_lr_tfidf.best_score_
print('Best parameter set: %s ' % best_params)
print('CV Accuracy: %.3f' % best_cv_score)

clf = gs_lr_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)
print('Test Accuracy: %.3f' % test_accuracy)

In [48]:
def tokenizer(text, stop_words=None):
    """Lower-case ``text``, split it on whitespace and drop stop words.

    Args:
        text: Raw document string.
        stop_words: Optional collection of tokens to remove. When omitted,
            the NLTK English stop-word list is used; it is loaded once on
            first use and cached on the function object.

    Returns:
        The lower-cased tokens that are not stop words, in original order.
    """
    if stop_words is None:
        stop_words = getattr(tokenizer, '_stop_words', None)
        if stop_words is None:
            # Reading the corpus on every call is needlessly slow; cache it
            # as a frozenset for fast membership tests.
            stop_words = frozenset(stopwords.words('english'))
            tokenizer._stop_words = stop_words
    # The filtered list is the return value (previously it was computed and
    # then discarded, so stop words were never removed).
    return [token for token in (word.lower() for word in text.split())
            if token not in stop_words]

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV, one per line.

    The header line is skipped. Each remaining line is expected to end with
    ``,<digit>`` where the digit is the label. The text is returned exactly
    as stored in the file (CSV quoting is not undone).
    NOTE(review): assumes every record occupies a single physical line and
    the label is a single character — confirm against the writer.

    Args:
        path: Path of the CSV file to read.

    Yields:
        Tuple of the raw review text and its integer label.
    """
    # Built-in open() applies universal-newline translation, so files written
    # with '\r\n' and with '\n' line endings parse identically.
    with open(path, 'r', encoding='utf_8_sig') as csv_file:
        next(csv_file)  # skip the header
        for line in csv_file:
            row = line.rstrip('\r\n')
            if not row:
                continue  # ignore blank lines
            # Last character is the label; the comma before it is dropped so
            # the label does not leak into the text.
            text, label = row[:-2], int(row[-1])
            yield text, label


# Peek at the first (text, label) pair to sanity-check the stream.
next(stream_docs(path='movie_data.csv'))

('"Perspective is a good thing. Since the release of ""Star Wars Episode I: The Phantom Menace"", claims and counter-claims of just how Episode\'s II and III will eventuate has taken the spotlight off the \'original\' Star Wars films, making them part of a cohesive whole, rather than segregating the older and new films into separate trilogies. What the new films have done is allow fresh perspectives to be placed on the older films. This new outlook allows us to greater appreciate what has often been viewed as the weakest of the original trilogy: ""Return of the Jedi"". Often derided for its overly \'cute\' factor, ROTJ is in a sense as strong as the original and only slightly less impressive than the nearly perfect ""The Empire Strikes Back"". Indeed the \'cute\' element of ROTJ, namely the Ewoks, remains a weak link in the entire series. Did George Lucas place the furry midgets in the film purely for the merchandising possibilities? Only he can answer that question.<br /><br />This cu

In [49]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Args:
        doc_stream: Iterator yielding ``(text, label)`` pairs.
        size: Maximum number of documents to collect.

    Returns:
        ``(docs, y)`` lists of texts and labels. If the stream runs out
        mid-batch, the documents collected so far are returned rather than
        discarded. ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be collected.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        if not docs:
            return None, None
    return docs, y

In [50]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


# Out-of-core setup: a fresh document stream, a hashing vectorizer with
# 2**21 feature buckets and an SGD-trained classifier.
doc_stream = stream_docs(path='movie_data.csv')
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version before upgrading.
clf = SGDClassifier(loss='log', random_state=42)

In [76]:
import pyprind


N_BATCHES = 45      # number of mini-batches used for training
BATCH_SIZE = 1000   # documents per mini-batch

# The bar is advanced once per mini-batch, so it must be sized to the number
# of batches (sizing it to 50000 left it stuck near 0%).
pbar = pyprind.ProgBar(N_BATCHES)
# partial_fit is given the full set of class labels on each call.
classes = np.array([0, 1])
for _ in range(N_BATCHES):
    X_train, y_train = get_minibatch(doc_stream, size=BATCH_SIZE)
    if not X_train:
        # Stream exhausted before all batches were consumed.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
# Evaluate on the next 5000 documents pulled from the stream.
X_text, y_test = get_minibatch(doc_stream, size=5000)
X_text = vect.transform(X_text)
test_accuracy = clf.score(X_text, y_test)
print('Test Accuracy: %.3f' % test_accuracy)

In [None]:
clf = clf.partial_fit(X_text, y_test) # additionally train on the test samples