In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus: three short sentences, the third being the first two joined by "and".
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet',
])

# Learn the vocabulary from the corpus and build the raw term-count matrix
# in a single fit_transform call.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [5]:
# Show the term -> column-index mapping learned by the vectorizer.
print(count.vocabulary_)

{'and': 0, 'is': 1, 'shining': 2, 'weather': 6, 'sun': 3, 'sweet': 4, 'the': 5}


In [6]:
# Convert the count matrix to a dense array for display:
# one row per document, one column per vocabulary term.
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


#### term frequency-inverse document frequency (tf-idf)

Scikit-learn implements yet another transformer, the TfidfTransformer, that takes the raw term frequencies from CountVectorizer as input and transforms them into tf-idfs:

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
# Limit printed floats to two decimals so the tf-idf matrix stays readable.
np.set_printoptions(precision=2)

# Re-count the terms in the toy corpus, then re-weight those counts as tf-idfs.
tfidf = TfidfTransformer()
print(
    tfidf.fit_transform(
        count.fit_transform(docs)
    ).toarray()
)

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [27]:
%%time
# Load the movie-review dataset; the first CSV column is used as the row index.
df = pd.read_csv('./movie_data.csv', index_col=0)
# Report the (rows, columns) shape of the loaded frame.
print(df.shape)

(50000, 2)
CPU times: user 700 ms, sys: 104 ms, total: 804 ms
Wall time: 859 ms


In [28]:
# Last 50 characters of the review stored at row label 0.
df.loc[0, 'review'][-50:]

'ore talented people. Rene Russo as always was hot.'

In [29]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,review,sentiment
0,Al Pacino was once an actor capable of making ...,0
1,"If you read the book by Carl Hiaasen, the movi...",1
2,This movie is sort of a Carrie meets Heavy Met...,1
3,This movie was like a bad indie with A-list ta...,0
4,"In the '70s, Charlton Heston starred in sci-fi...",1


* We will now remove all punctuation marks, keeping only emoticon characters such as ":)".

In [32]:
import re
def preprocessor(text):
    """Clean one raw review string for bag-of-words tokenization.

    Steps, in order:
      1. Remove HTML-like tags (anything between ``<`` and ``>``).
      2. Collect emoticons from the tag-free text: eyes ``:``, ``;`` or ``=``,
         an optional ``-`` nose, and a mouth ``)``, ``(``, ``D`` or ``P``.
         This happens before lowercasing, so ``:D`` / ``:P`` keep their case.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, with the ``-`` nose removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lowercased text followed by the emoticons found in it.
    """
    # Raw string literals are required here: '\)', '\(' and '\W' are invalid
    # escape sequences in ordinary strings and trigger a warning on modern
    # Python versions.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # NOTE(review): the emoticons are joined with no separator, so several of
    # them end up as one whitespace-delimited token (e.g. ':):(:)') and the
    # first one is glued to the last word when the text ends in a word
    # character. Kept as-is to preserve existing results -- confirm intent.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')
    return text

In [33]:
# Run the preprocessor on the last 50 characters of the review at row label 0.
preprocessor(df.loc[0, 'review'][-50:])

'ore talented people rene russo as always was hot '

In [34]:
# Made-up example containing an HTML tag, punctuation and three emoticons.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :):(:)'

* Apply the preprocessor to all reviews

In [35]:
# Clean every review, overwriting the raw text in the 'review' column.
df['review'] = df['review'].apply(preprocessor)

In [36]:
# Preview the first rows after preprocessing.
df.head()

Unnamed: 0,review,sentiment
0,al pacino was once an actor capable of making ...,0
1,if you read the book by carl hiaasen the movie...,1
2,this movie is sort of a carrie meets heavy met...,1
3,this movie was like a bad indie with a list ta...,0
4,in the 70s charlton heston starred in sci fi f...,1


## Processing documents into tokens

In [37]:
def tokenizer(text):
    """Split ``text`` on whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens


# Quick demonstration on a sample sentence.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

#### word stemming:
Stemming reduces a word to its root form, e.g. running → run.

NLTK provides the Porter stemmer used below.

In [38]:
from nltk.stem.porter import PorterStemmer
# One shared stemmer instance, reused by every call to tokenizer_porter.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    return list(map(porter.stem, text.split()))


# Same sample sentence as for the plain tokenizer, now stemmed.
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

### Remove stop words
* Stop-words are simply those words that are extremely common in all sorts of texts and likely bear no (or only little) useful information that can be used to distinguish between different classes of documents. 
* Examples of stop-words are is, and, has, and the like. 

In [40]:
import nltk
# Download NLTK's 'stopwords' package so the stop-word corpus can be loaded.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/Luke/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [41]:
from nltk.corpus import stopwords
# English stop-word list from NLTK's stopwords corpus.
stop = stopwords.words('english')

# Stem the sample sentence, keep (at most) its last 10 tokens,
# and drop every token that appears in the stop-word list.
list(filter(lambda w: w not in stop,
            tokenizer_porter('a runner likes running and runs a lot')[-10:]))

['runner', 'like', 'run', 'run', 'lot']

### Training a logistic regression model for document classification

In [42]:
# Split the reviews into a training half (first 25,000 rows) and a test half
# (the remaining rows).
# Half-open positional slices are used on purpose: `.loc` slicing is
# label-based and includes BOTH endpoints, so `df.loc[:25000]` together with
# `df.loc[25000:]` would place row 25000 in the training set and in the test
# set at the same time.
X_train = df['review'].values[:25000]
y_train = df['sentiment'].values[:25000]
X_test = df['review'].values[25000:]
y_test = df['sentiment'].values[25000:]

In [47]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed in scikit-learn 0.20. The fallback keeps the cell
# runnable on legacy installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The reviews were already lowercased and cleaned by `preprocessor`, so the
# vectorizer's own lowercasing / preprocessing is switched off.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Two sub-grids over the same tokenizer / stop-word / regularisation options:
#   1. tf-idf features with the vectorizer's default idf weighting and norm;
#   2. raw term frequencies (idf weighting and normalisation disabled).
param_grid = [
    {'vect__ngram_range': [(1, 1)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer, tokenizer_porter],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [1.0, 10.0, 100.0]},
    {'vect__ngram_range': [(1, 1)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer, tokenizer_porter],
     'vect__use_idf': [False],
     'vect__norm': [None],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [1.0, 10.0, 100.0]},
]

# The grid searches both 'l1' and 'l2' penalties, so the solver is pinned to
# 'liblinear', which supports both; relying on the library default would fail
# for 'l1' on versions whose default solver only handles 'l2'.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='liblinear')),
])

# Exhaustive search scored by accuracy with 3-fold cross-validation,
# using all available cores (n_jobs=-1).
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=3, verbose=1,
                           n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 42.4min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', '...x130b9cd90>], 'vect__use_idf': [False], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1

In [48]:
# Parameter combination selected by the grid search (scored by accuracy).
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'vect__stop_words': None, 'vect__ngram_range': (1, 1), 'vect__tokenizer': <function tokenizer at 0x130b9c9d8>, 'clf__penalty': 'l2'} 


In [49]:
# Pipeline configured with the best parameter set found by the grid search.
clf = gs_lr_tfidf.best_estimator_

# Cross-validated accuracy of the best candidate, then its accuracy on the
# held-out test split.
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

CV Accuracy: 0.892
Test Accuracy: 0.899
