In [1]:
pip install pyprind

Note: you may need to restart the kernel to use updated packages.




In [20]:
import os
import re
import nltk
import tarfile
import pyprind
import pandas as pd
import numpy as np

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer

In [21]:
# Fetch NLTK's stop-word corpus; stopwords.words('english') is called on it later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\epdls\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [22]:
# with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
#     tar.extractall()

In [23]:
# Root folder of the data set; the loading loop joins it with the split
# ('test'/'train') and label ('pos'/'neg') sub-folder names.
basedir = 'aclImdb'

In [24]:
# Map each label sub-folder name to its integer sentiment class.
labels = {'pos':1, 'neg':0}

In [25]:
# Progress bar sized to the whole corpus: the later cells consume
# 45 x 1000 + 5000 = 50,000 documents, so a total of 5000 would report 100%
# after only a tenth of the files had been read.
pbar = pyprind.ProgBar(50000)

In [26]:
# Placeholder frame; the loading loop that follows populates `df` with one
# row per review file.
df = pd.DataFrame()

In [27]:
# Read every review file under <basedir>/<split>/<label>/ and collect one
# [text, sentiment] row per file. Rows are gathered in a plain list and the
# frame is built once at the end: DataFrame.append copied the whole frame on
# every call (quadratic overall) and no longer exists in pandas 2.x.
records = []

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basedir, s, l)

        # sorted() makes the row order independent of the OS listing order
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()

            records.append([txt, labels[l]])
            pbar.update()

# Columns keep their default positional names (0 and 1) at this point.
df = pd.DataFrame(records)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


In [28]:
# Name the two columns: the review text and its integer sentiment label.
df.columns = ['review', 'sentiment']

In [29]:
# Seed NumPy's global RNG so the row shuffle is reproducible.
np.random.seed(0)

In [30]:
# Shuffle the rows. Each row keeps its original index label, so after this
# step the index labels are no longer in positional order.
df = df.reindex(np.random.permutation(df.index))

In [31]:
# Persist the shuffled data; index=False leaves the (shuffled) index labels
# out of the file so only the review and sentiment columns are written.
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [32]:
# First 25,000 shuffled rows for training, everything after that for testing.
# Positional slicing (.iloc) is required: the frame's index labels are in
# shuffled order, so a label-based `.loc[:25000]` would cut wherever the label
# 25000 happens to sit, and because .loc slices are end-inclusive that row
# would also land in both splits.
X_train = df.iloc[:25000]['review'].values
y_train = df.iloc[:25000]['sentiment'].values
X_test = df.iloc[25000:]['review'].values
y_test = df.iloc[25000:]['sentiment'].values

In [33]:
# TF-IDF vectorizer explicitly configured with no accent stripping, no
# lower-casing and no custom preprocessor; the grid search sets the remaining
# `vect__*` options per candidate.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

In [36]:
# Single Porter stemmer instance shared by tokenizer_porter.
porter = PorterStemmer()

def preprocessor(text):
    """Strip HTML markup and punctuation from ``text`` while keeping emoticons.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The lower-cased text with every run of non-word characters replaced
        by a single space, followed by the emoticons found in the input
        (space-separated, with any '-' removed).
    """
    # Drop HTML tags first so markup characters are not mistaken for emoticons.
    text = re.sub(r'<[^>]*>', '', text)
    # Eyes (: ; =), optional nose (-), mouth: ) ( D P. Searched on the
    # original-case text so the upper-case 'D' and 'P' alternatives can match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # `text.lower()` is the str method; a bare `text_lower()` is an undefined name.
    cleaned = re.sub(r'[\W]+', ' ', text.lower())

    # Emoticons are appended directly after the cleaned text.
    return cleaned + ' '.join(emoticons).replace('-', '')

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the stem of every token.

    Stemming is delegated to the module-level ``porter`` object's ``stem``
    method, called once per token in order.
    """
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [37]:
# English stop-word list from NLTK; offered to the grid search as a
# `vect__stop_words` option and used by `tokenize` to filter tokens.
stop = stopwords.words('english')

In [39]:
# Hyper-parameter search space: two families of 24 candidates each
# (2 stop-word settings x 2 tokenizers x 2 penalties x 3 values of C).
# First family: the vectorizer's idf/norm settings are left untouched.
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words':[stop, None],
               'vect__tokenizer':[tokenizer, tokenizer_porter],
               'clf__penalty':['l1', 'l2'],
               'clf__C':[1.0, 10.0, 100.0]},
              # Second family: same options, but with idf weighting and
              # normalisation switched off in the vectorizer.
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words':[stop, None],
               'vect__tokenizer':[tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty':['l1', 'l2'],
               'clf__C':[1.0, 10.0, 100.0]}
             ]

# Vectorizer -> logistic regression; the step names 'vect' and 'clf' are the
# prefixes used by the keys in param_grid.
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(solver='liblinear', random_state=0))])
# 5-fold cross-validated search scored by accuracy, run in a single process.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=1)

In [40]:
# Run the grid search on the training split (48 candidates x 5 folds = 240 fits).
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits








GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0,
                                                           solver='liblinear'))]),
             n_jobs=1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've"...
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
 

In [41]:
def tokenize(text):
    """Clean ``text`` and split it into tokens with stop words removed.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    list of str
        Lower-cased word tokens followed by the emoticons found in the text
        (with any '-' removed), excluding every token that appears in the
        module-level ``stop`` collection.
    """
    # Drop HTML tags first so markup characters are not mistaken for emoticons.
    text = re.sub(r'<[^>]*>', '', text)
    lowered = text.lower()
    # NOTE(review): the search runs on lower-cased text, so the pattern's
    # upper-case 'D' and 'P' alternatives cannot match here -- confirm whether
    # emoticons such as ':D' are meant to be kept.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', lowered)

    # `text.lower()` is the str method; a bare `text_lower()` is an undefined name.
    text = re.sub(r'[\W]+', ' ', lowered) + ' '.join(emoticons).replace('-', '')

    return [w for w in text.split() if w not in stop]

In [42]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from the text file at ``path``.

    The first line is skipped. Every following line is split by position:
    everything except its last three characters becomes the text, and its
    second-to-last character is converted to an ``int`` label.
    (Assumes each line ends with a separator, a one-digit label and a
    newline -- TODO confirm against the file writer.)
    """
    with open(path, 'r', encoding='utf-8') as handle:
        next(handle)  # skip the first line

        for row in handle:
            review = row[:-3]
            sentiment = int(row[-2])
            # `yield` makes this function a generator: each pair is handed to
            # the caller one at a time instead of building a full list.
            yield review, sentiment

In [49]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` ``(text, label)`` pairs from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; it is advanced in place.
    size : int
        Maximum number of pairs to take.

    Returns
    -------
    tuple of (list, list)
        The texts and the labels, in stream order. Both lists are shorter
        than ``size`` (possibly empty) when the stream runs out first.
    """
    texts, targets = [], []
    exhausted = object()  # unique marker returned once the stream is empty

    for _ in range(size):
        item = next(doc_stream, exhausted)
        if item is exhausted:
            break
        text, label = item
        texts.append(text)
        targets.append(label)

    return texts, targets

In [50]:
# Hashing vectorizer with 2**21 feature columns.
# NOTE(review): `tokenizer` is the plain whitespace splitter, not the
# stop-word-removing `tokenize` function -- confirm this is intended.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
# SGD-trained linear classifier, updated incrementally via partial_fit.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' --
# confirm against the installed version.
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
# Generator over the CSV file; it is consumed progressively by the cells below.
doc_stream = stream_docs(path='movie_data.csv')

In [51]:
# The complete set of class labels, passed to every partial_fit call.
classes = np.array([0, 1])

# Incremental training: up to 45 mini-batches of up to 1000 documents each.
# Note that X_train / y_train are reassigned here, replacing the arrays of
# the same name used for the grid search.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    
    # An empty batch means the stream is exhausted.
    if not X_train:
        break
    
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

In [52]:
# Take up to 5000 further documents from the same stream (those not consumed
# by the training loop) and vectorize them with the same hashing vectorizer.
# X_test / y_test replace the earlier arrays of the same name.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)

In [53]:
import pickle

In [64]:
# Directory that receives the serialized objects.
drt = os.path.join('movie-classifier','pkl_objects')

# exist_ok=True keeps the cell safe to re-run and avoids a separate
# check-then-create step.
os.makedirs(drt, exist_ok=True)

# Context managers guarantee each file is flushed and closed once written;
# a bare open(...) passed to pickle.dump leaves the handle open.
with open(os.path.join(drt, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)

with open(os.path.join(drt, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)

In [65]:
# NOTE(review): '__file__' is a quoted string here, so realpath resolves it
# against the current working directory and cur_dir is simply the working
# directory. The file is then looked up in <cwd>/pkl_objects, whereas the
# dump cell writes to movie-classifier/pkl_objects -- confirm the intended
# location.
cur_dir = os.path.dirname(os.path.realpath('__file__'))

# pickle.load can execute arbitrary code; only load files this project wrote.
# The context manager closes the file handle once the object is loaded.
with open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb') as stop_file:
    stop = pickle.load(stop_file)

In [66]:
def toknizer(text):
    """Clean ``text`` and split it into tokens with stop words removed.

    Same processing steps as ``tokenize``.
    NOTE(review): the name looks like a typo of ``tokenizer``; it is kept
    unchanged because renaming it would replace the existing ``tokenizer``
    function that the vectorizer cells pass in.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    list of str
        Lower-cased word tokens followed by the emoticons found in the text
        (with any '-' removed), excluding every token that appears in the
        module-level ``stop`` collection.
    """
    # Drop HTML tags first so markup characters are not mistaken for emoticons.
    text = re.sub(r'<[^>]*>', '', text)
    lowered = text.lower()
    # NOTE(review): the search runs on lower-cased text, so the pattern's
    # upper-case 'D' and 'P' alternatives cannot match here.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', lowered)

    # `text.lower()` is the str method; a bare `text_lower()` is an undefined name.
    text = re.sub(r'[\W]+', ' ', lowered) + ' '.join(emoticons).replace('-', '')

    return [w for w in text.split() if w not in stop]

In [67]:
# Re-create the hashing vectorizer with the same arguments used for training.
# NOTE(review): this passes `tokenizer` (the whitespace splitter), not the
# `toknizer` function defined in the preceding cell -- confirm which one is
# intended.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)