## Working with Text Data

In [3]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import numpy as np

In [4]:
amazon_df = pd.read_csv('https://raw.githubusercontent.com/nealcaren/CSSS-CABD/master/files/amazon_reviews.csv')

In [5]:
amazon_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Positive Review
0,52147,B000EUT8EU,A1RUIFCRZSAQB2,missmoni4x4,0,0,5,1258156800,The best!!!,"These are the best ""sugar free"" (non-enriched ...",1
1,155382,B000GAT6NG,A32HOM5BOKGXWB,"Erica Gott ""foodie""",0,0,5,1233792000,Virgin Coconut Oil,I buy this regularly at Whole Foods for about ...,1
2,273760,B000LKTY7Y,A3PRV5LSGOGZRC,Ray A. Van Ostran,7,9,5,1204934400,Mori-Nu Tofu Lite,"Mori-Nu Tofu, Lite, Silken, Firm, 12.3-Ounce B...",1
3,204752,B001EPPOHO,A3LAYCTGSO1IQR,"Purrrfectcat ""purrfectcat""",0,0,5,1328313600,"light, soft, stylishly beautiful, delicious!",I usually don't particularly like shortbread c...,1
4,203651,B004OQ257M,A1B6O7SAIYG2N0,"Jacx ""J.C.""",0,0,5,1316304000,If your already using Splenda but want the B v...,This is good for people that have to be on a s...,1


In [6]:
sample_text = amazon_df['Text'][15]
print(sample_text)

I bought a Wolfgang Puck sampler and this was certainly one of my favorites, so I ordered a bigger box and have truly been enjoying it.  This is a classic, medium-roast cup of coffee.  It's very smooth, with no bitterness.  I would say that if you enjoy Donut Shop, Tully's Kona, and Caribou, you will like this blend.  The label is adorable and makes me smile when I pop it in the Keurig each morning.


In [7]:
amazon_df['Score'].value_counts()

5    31933
4     7134
1     4638
3     3742
2     2553
Name: Score, dtype: int64

In [8]:
amazon_df['Positive Review'].value_counts()

1    31933
0    18067
Name: Positive Review, dtype: int64

### Representing text data as Bag of Words

![bag_of_words](https://raw.githubusercontent.com/nealcaren/CSSS-CABD/master/images/bag_of_words.png)

#### Applying bag-of-words to a toy dataset

In [9]:
bards_words =["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
vect = CountVectorizer()

vect.fit(bards_words)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
len(vect.vocabulary_)


13

In [13]:
vect.get_feature_names()

[u'be',
 u'but',
 u'doth',
 u'fool',
 u'he',
 u'himself',
 u'is',
 u'knows',
 u'man',
 u'the',
 u'think',
 u'to',
 u'wise']

In [14]:
bag_of_words = vect.transform(bards_words)


In [15]:
# You won't ever due this

pd.DataFrame( bag_of_words.toarray(), columns=list(vect.get_feature_names()))


Unnamed: 0,be,but,doth,fool,he,himself,is,knows,man,the,think,to,wise
0,0,0,1,1,1,0,1,0,0,1,1,0,1
1,1,1,0,1,0,1,0,1,1,1,0,1,1


### Bag-of-word for product reviews

In [38]:
tf_vectorizer = CountVectorizer(max_features = 1000)

tf_vectorizer.fit(amazon_df['Text'])


CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [39]:
tf_vectorizer.get_feature_names()[1501:1510]

[]

In [40]:
len(list(tf_vectorizer.get_feature_names()))

1000

In [41]:
X_train = tf_vectorizer.transform(amazon_df['Text'])


In [42]:
list(tf_vectorizer.get_feature_names())[:100]

[u'00',
 u'10',
 u'100',
 u'11',
 u'12',
 u'15',
 u'16',
 u'20',
 u'24',
 u'25',
 u'30',
 u'50',
 u'able',
 u'about',
 u'absolutely',
 u'acid',
 u'actually',
 u'add',
 u'added',
 u'adding',
 u'addition',
 u'after',
 u'afternoon',
 u'aftertaste',
 u'again',
 u'ago',
 u'agree',
 u'all',
 u'allergies',
 u'almond',
 u'almonds',
 u'almost',
 u'along',
 u'already',
 u'also',
 u'alternative',
 u'although',
 u'always',
 u'am',
 u'amazing',
 u'amazon',
 u'amount',
 u'an',
 u'and',
 u'another',
 u'any',
 u'anymore',
 u'anyone',
 u'anything',
 u'anyway',
 u'anywhere',
 u'apple',
 u'are',
 u'area',
 u'aren',
 u'aroma',
 u'around',
 u'arrived',
 u'artificial',
 u'as',
 u'at',
 u'ate',
 u'available',
 u'avoid',
 u'away',
 u'awesome',
 u'awful',
 u'baby',
 u'back',
 u'bad',
 u'bag',
 u'bags',
 u'baked',
 u'baking',
 u'balance',
 u'bar',
 u'bars',
 u'based',
 u'batch',
 u'be',
 u'bean',
 u'beans',
 u'beat',
 u'because',
 u'become',
 u'beef',
 u'been',
 u'before',
 u'being',
 u'believe',
 u'benefits',


In [43]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


In [44]:
lr = LogisticRegression()

lr.fit(X_train, amazon_df['Positive Review'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
cross_val_score(lr, X_train, amazon_df['Positive Review'])

array([ 0.79673626,  0.80055202,  0.79545182])

In [46]:
r = pd.DataFrame(lr.coef_[0])
len(r)

1000

In [47]:
coef_df = pd.DataFrame(lr.coef_).T
coef_df['word'] = list(tf_vectorizer.get_feature_names())
coef_df.sort_values(0, ascending=False).head(20)

Unnamed: 0,0,word
65,1.146513,awesome
39,1.134461,amazing
409,1.132305,highly
979,1.057622,wonderful
236,0.941916,delicious
998,0.904465,yum
292,0.891726,excellent
364,0.886924,glad
866,0.881809,thank
307,0.85835,fantastic


In [49]:
coef_df.sort_values(0, ascending=True).head(20)

Unnamed: 0,0,word
722,-1.386423,return
917,-1.217758,unfortunately
863,-1.199489,terrible
66,-1.137127,awful
419,-1.131232,horrible
418,-1.106331,hoping
247,-0.949102,disappointed
595,-0.939736,okay
99,-0.918957,bland
425,-0.887868,however


### Stop-words

In [250]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))

print("Every 10th stopword:\n{}".format(list(ENGLISH_STOP_WORDS)[::10]))

Number of stop words: 318
Every 10th stopword:
['all', 'not', 'one', 'should', 'latterly', 'cannot', 'name', 'each', 'ten', 'beyond', 'mine', 'between', 'full', 'found', 'anything', 'became', 'formerly', 'everyone', 'three', 'anyone', 'was', 'becoming', 'he', 'besides', 'something', 'herein', 'any', 'meanwhile', 'which', 'most', 'whereby', 'rather']


In [50]:
# Specifying stop_words="english" uses the built-in list.
# We could also augment it and pass our own.
tf_vectorizer = CountVectorizer(min_df=.01, 
                                stop_words="english").fit(amazon_df['Text'])

X_train = tf_vectorizer.transform(amazon_df['Text'])


In [51]:
len(tf_vectorizer.get_feature_names())

586

In [52]:
lr.fit(X_train, amazon_df['Positive Review'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [53]:
print accuracy_score(amazon_df['Positive Review'], lr.predict(X_train))
confusion_matrix(amazon_df['Positive Review'], lr.predict(X_train))

0.7804


array([[10550,  7517],
       [ 3463, 28470]])

In [54]:
coef_df = pd.DataFrame(lr.coef_).T
coef_df['word'] = list(tf_vectorizer.get_feature_names())
coef_df.sort_values(0, ascending=False).head(20)

Unnamed: 0,0,word
257,1.244312,highly
29,1.16985,awesome
18,1.111986,amazing
573,1.083443,wonderful
229,1.009629,glad
520,0.964354,thank
147,0.959266,delicious
183,0.907099,excellent
322,0.836119,loves
194,0.824724,fantastic


In [55]:
coef_df.sort_values(0, ascending=True).head(20)

Unnamed: 0,0,word
543,-1.271479,unfortunately
157,-1.026424,disappointed
488,-0.956496,stars
485,-0.891954,stale
363,-0.883145,ok
144,-0.83092,decent
486,-0.795837,star
332,-0.721214,maybe
267,-0.694989,http
563,-0.688406,weak


### your turn

Go back and spit our dataset into a training and test set. Run and test a model with a large vocabulary one one with a smaller vocabulary. How does fit on the test/train sets compare?

In [57]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(amazon_df['Text'], amazon_df['Positive Review'], train_size = .8)


In [61]:
# Big Vocab
big_vector = CountVectorizer(max_features  = 5000)

big_vector.fit(X_train)
tf_train = big_vector.transform(X_train)



In [64]:
lr.fit(tf_train, y_train)

print accuracy_score(y_train, lr.predict(tf_train))
confusion_matrix(y_train, lr.predict(tf_train))

0.865275


array([[10965,  3515],
       [ 1874, 23646]])

In [65]:
tf_test = big_vector.transform(X_test)

print accuracy_score(y_test, lr.predict(tf_test))
confusion_matrix(y_test, lr.predict(tf_test))

0.8055


array([[2395, 1192],
       [ 753, 5660]])

In [66]:
# Small Vocab
small_vector = CountVectorizer(stop_words='english',
                            min_df=.01)

small_vector.fit(X_train)
tf_train = small_vector.transform(X_train)

lr.fit(tf_train, y_train)

print accuracy_score(y_train, lr.predict(tf_train))
confusion_matrix(y_train, lr.predict(tf_train))

tf_test = small_vector.transform(X_test)

print accuracy_score(y_test, lr.predict(tf_test))
confusion_matrix(y_test, lr.predict(tf_test))

0.7809
0.7707


array([[2034, 1553],
       [ 740, 5673]])

### Rescaling the Data with tf-idf
\begin{equation*}
\text{tfidf}(w, d) = \text{tf} \log\big(\frac{N + 1}{N_w + 1}\big) + 1
\end{equation*}

In [257]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [258]:
bards_words =["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

In [259]:
vect = TfidfVectorizer()
vect.fit(bards_words)
bag_of_words = vect.transform(bards_words)


In [260]:
pd.DataFrame( bag_of_words.toarray(), columns=list(vect.get_feature_names()))


Unnamed: 0,be,but,doth,fool,he,himself,is,knows,man,the,think,to,wise
0,0.0,0.0,0.425677,0.302873,0.425677,0.0,0.425677,0.0,0.0,0.302873,0.425677,0.0,0.302873
1,0.364693,0.364693,0.0,0.259482,0.0,0.364693,0.0,0.364693,0.364693,0.259482,0.0,0.364693,0.259482


In [261]:
tfidf_vect = TfidfVectorizer(min_df=.01,
                stop_words="english")



In [262]:
tfidf_vect.fit(amazon_df['Text'])

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [263]:
tfidf = tfidf_vect.transform(amazon_df['Text'])

In [273]:
lr.fit(tfidf, amazon_df['Positive Review'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [274]:
print accuracy_score(amazon_df['Positive Review'], lr.predict(tfidf))
confusion_matrix(amazon_df['Positive Review'], lr.predict(tfidf))

0.78094


array([[11047,  7020],
       [ 3933, 28000]])

In [266]:
coef_df = pd.DataFrame(lr.coef_).T
coef_df['word'] = list(tf_vectorizer.get_feature_names())
coef_df.sort_values(0, ascending=False).head(20)

Unnamed: 0,0,word
257,5.056468,highly
41,4.999664,best
147,4.663251,delicious
573,4.626625,wonderful
239,4.544307,great
320,4.377871,love
18,4.240002,amazing
183,3.840702,excellent
393,3.773116,perfect
322,3.7078,loves


#### Bag of words with more than one word (n-grams)

In [267]:
print("bards_words:\n{}".format(bards_words))

bards_words:
['The fool doth think he is wise,', 'but the wise man knows himself to be a fool']


In [268]:
vect = CountVectorizer(ngram_range=(1,2))


In [269]:
vect.fit(bards_words)
vect.get_feature_names()

[u'be',
 u'be fool',
 u'but',
 u'but the',
 u'doth',
 u'doth think',
 u'fool',
 u'fool doth',
 u'he',
 u'he is',
 u'himself',
 u'himself to',
 u'is',
 u'is wise',
 u'knows',
 u'knows himself',
 u'man',
 u'man knows',
 u'the',
 u'the fool',
 u'the wise',
 u'think',
 u'think he',
 u'to',
 u'to be',
 u'wise',
 u'wise man']

In [270]:
bag_of_words = vect.transform(bards_words)

pd.DataFrame( bag_of_words.toarray(), columns=list(vect.get_feature_names()))

Unnamed: 0,be,be fool,but,but the,doth,doth think,fool,fool doth,he,he is,...,man knows,the,the fool,the wise,think,think he,to,to be,wise,wise man
0,0,0,0,0,1,1,1,1,1,1,...,0,1,1,0,1,1,0,0,1,0
1,1,1,1,1,0,0,1,0,0,0,...,1,1,0,1,0,0,1,1,1,1


In [136]:
vect = CountVectorizer(ngram_range=(1,3))
vect.fit(bards_words)
vect.get_feature_names()

[u'be',
 u'be fool',
 u'but',
 u'but the',
 u'but the wise',
 u'doth',
 u'doth think',
 u'doth think he',
 u'fool',
 u'fool doth',
 u'fool doth think',
 u'he',
 u'he is',
 u'he is wise',
 u'himself',
 u'himself to',
 u'himself to be',
 u'is',
 u'is wise',
 u'knows',
 u'knows himself',
 u'knows himself to',
 u'man',
 u'man knows',
 u'man knows himself',
 u'the',
 u'the fool',
 u'the fool doth',
 u'the wise',
 u'the wise man',
 u'think',
 u'think he',
 u'think he is',
 u'to',
 u'to be',
 u'to be fool',
 u'wise',
 u'wise man',
 u'wise man knows']

In [271]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [272]:
pipe = make_pipeline(TfidfVectorizer(min_df=.01,
                                    stop_words = 'english'), 
                     LogisticRegression())
#
param_grid = {"tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)]}

grid = GridSearchCV(pipe, param_grid, cv=3)

grid.fit(amazon_df['Text'], amazon_df['Positive Review'])
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters:\n{}".format(grid.best_params_))

Best cross-validation score: 0.77
Best parameters:
{'tfidfvectorizer__ngram_range': (1, 1)}


In [148]:
pd.DataFrame(grid.cv_results_)[['param_tfidfvectorizer__ngram_range','mean_test_score']]

Unnamed: 0,param_tfidfvectorizer__ngram_range,mean_test_score
0,"(1, 1)",0.77248
1,"(1, 2)",0.7723
2,"(1, 3)",0.77218


### Cleaning Text on your own

In [291]:
string = '<br> Some random crap.'

In [292]:
string.replace('<br>', '')

' Some random crap.'

In [293]:
def clean_string(string):
    clean_string = string.replace('<br>', '')
    return clean_string 

In [294]:
clean_string('<br> Some random crap.')

' Some random crap.'

In [298]:
amazon_df['clean_text'] = amazon_df['Text'].apply(clean_string)

In [299]:
amazon_df['clean_text'].head()

0    These are the best "sugar free" (non-enriched ...
1    I buy this regularly at Whole Foods for about ...
2    Mori-Nu Tofu, Lite, Silken, Firm, 12.3-Ounce B...
3    I usually don't particularly like shortbread c...
4    This is good for people that have to be on a s...
Name: clean_text, dtype: object

In [303]:
def clean_string(string):
    clean_string = string.replace('<br>', '')
    clean_string = clean_string.replace("n't", " not")
    return clean_string 

In [309]:
def clean_string(string):
    clean_string = string.replace('<br>', '')
    clean_string = clean_string.replace("n't", " not")
    
    clean_sentence = ''
    for word in clean_string.split():
        word = word.strip('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
        clean_sentence =clean_sentence + ' ' + word
    return clean_sentence 

In [310]:
clean_string('Did! that work?<br>')

' Did that work'

In [311]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

In [315]:
stemmer.stem('dogs')

u'dog'

In [316]:
stemmer.stem('dog')

u'dog'

In [321]:
stemmer.stem('barking')

u'bark'

In [330]:
def clean_string(string):
    clean_string = string.replace('<br>', '')
    clean_string = clean_string.replace("n't", " not")
    
    clean_sentence = u''
    for word in clean_string.split():
        word = word.strip('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
        try:
            word = stemmer.stem(word)
        except:
            word = word
        try:
            clean_sentence =clean_sentence + ' ' + word
        except:
            print word
    return clean_sentence 

In [331]:
clean_string('The dog went running with other dogs.')

u' the dog went run with other dog'

In [332]:
amazon_df['clean_text'] = amazon_df['Text'].apply(clean_string)

amazon_df['clean_text'].head(10)

Organic®
185°F
185°F
½
105°F
ENERGY®
Ty·phoo
Salt®
brûl&eacute;e
brûl&eacute;e
Quaker®
href="http://www.amazon.com/gp/product/B004KJXSHO">Keurig®
¾
Brand®
Brand®
Marzano®
pât&eacute
pât&eacute
Çaykur
90º
2¼
7½-year-old
7½-year-old
2½
1½
ENERGY®
60¢/k-cup
AvoDerm®
AvoDerm®
AvoDerm®
AvoDerm®
AvoDerm®
58¢
ENERGY®
ENERGY®
Eatin'®
½
99¢
¾
Mountain®
VIA®
96¢
12¢
6¢
8¢
1.37½
bar­becue
thinkThin®
thinkThin®
Eatin'®
104°
400°
200°
180°
103°
®
pât&eacute
Rêverie
så
DecoBros®</a
Kelloggs®
href="http://www.amazon.com/gp/product/B000G72D70">swissgold®
500°
350°
Organic®
¼
½
1½
8½-ounce
½
¼
½
½
400°.<br
Water®
Jel®
Jel®
Jel®
Jel®
Jel®
Jel®
¾
¼
Own®Organics
Noël
Foods,®
AUTOSPOUT®
ENERGY®
VIA®
§
§
113°F
113°F
50ºF.<br
Niçoise
VIA®
ENERGY®
Own®
½
ENERGY®
brûl&eacute;e
½
©2010
Almond®
NutraSweet®/aspartame
175°
½
½
ENERGY®
Gold®
pât&eacute
WellPet®
WELLNESS®
WELLNESS®
Program
ENERGY®
href="http://www.amazon.com/gp/product/B002WWVDK0">Sexergy®
Starbucks®
Caribou®
Coffee®
House®
Folgers®
O'Clock®
Starbu

0     these are the best sugar free non-enrich whea...
1     i buy this regular at whole food for about 30...
2     mori-nu tofu lite silken firm 12.3-ounc box p...
3     i usual do not particular like shortbread coo...
4     this is good for peopl that have to be on a s...
5     my uncl bilbo alway woke up to a hearti bowl ...
6     this is the herbal remedi of choic for those ...
7     just receiv these today and was stun to see t...
8     best coffe ever after tast these fresh ground...
9     i have been buy ella sinc my son was 6 month ...
Name: clean_text, dtype: object

In [334]:
vect = CountVectorizer(ngram_range=(1,1), min_df=.01, max_df=.5, stop_words = 'english')
vect.fit(amazon_df['clean_text'])

clean_tf = vect.transform(amazon_df['clean_text'])


In [335]:
lr.fit(clean_tf, amazon_df['Positive Review'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [336]:
print accuracy_score(amazon_df['Positive Review'], lr.predict(clean_tf))
confusion_matrix(amazon_df['Positive Review'], lr.predict(clean_tf))

0.7876


array([[10755,  7312],
       [ 3308, 28625]])

### Naive Bayes

$$
\frac{P(L_1~|~{\rm features})}{P(L_2~|~{\rm features})} = \frac{P({\rm features}~|~L_1)}{P({\rm features}~|~L_2)}\frac{P(L_1)}{P(L_2)}
$$


In [68]:
from sklearn.naive_bayes import MultinomialNB


nb = MultinomialNB()

In [284]:
tf_vectorizer = CountVectorizer(min_df=.01, 
                                stop_words="english").fit(amazon_df['Text'])

X_train_tf = tf_vectorizer.transform(amazon_df['Text'])


In [278]:
nb.fit(X_train_tf, amazon_df['Positive Review'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [285]:
print accuracy_score(amazon_df['Positive Review'], nb.predict(tfidf))
confusion_matrix(amazon_df['Positive Review'], nb.predict(tfidf))

0.7339


array([[ 5908, 12159],
       [ 1146, 30787]])

In [289]:
nb_probs = pd.DataFrame(nb.feature_log_prob_).T
nb_probs['word'] = tf_vectorizer.get_feature_names()
nb_probs.sort_values(1, ascending=False).head(10)

Unnamed: 0,0,1,word
58,-3.739278,-3.992257,br
239,-5.162631,-4.347369,great
320,-5.434581,-4.52661,love
235,-4.507674,-4.589347,good
303,-4.335316,-4.624931,like
515,-4.862358,-4.640573,tea
101,-4.489168,-4.655693,coffee
415,-4.565865,-4.735544,product
286,-4.744796,-4.811386,just
510,-4.513256,-4.847411,taste


In [290]:
nb_probs.sort_values(0, ascending=False).head(10)

Unnamed: 0,0,1,word
58,-3.739278,-3.992257,br
303,-4.335316,-4.624931,like
101,-4.489168,-4.655693,coffee
235,-4.507674,-4.589347,good
510,-4.513256,-4.847411,taste
415,-4.565865,-4.735544,product
208,-4.695297,-4.858587,flavor
286,-4.744796,-4.811386,just
515,-4.862358,-4.640573,tea
163,-5.069015,-5.375158,don


### Topic Modeling and Document Clustering
#### Latent Dirichlet Allocation

In [209]:
vect = CountVectorizer(max_features=5000, 
                       max_df=.5,
                       stop_words = 'english', 
                      min_df = .01)

X_train = vect.fit_transform(amazon_df['Text'])

In [210]:
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_topics=10,
                                      learning_method = 'batch',
                                      max_iter=25)


In [211]:
lda_model.fit(X_train)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=25, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [212]:
document_topics = lda_model.transform(X_train)

In [213]:
print("lda.components_.shape: {}".format(lda_model.components_.shape))

lda.components_.shape: (10, 586)


In [214]:
tm_df = pd.DataFrame(lda_model.components_).T
tm_df['words'] = vect.get_feature_names()
for k in tm_df.keys():
    print ', '.join(tm_df.sort_values(by=k, ascending=False)['words'].tolist()[:10])

bag, treats, chips, dog, like, treat, dogs, just, size, small
br, product, amazon, com, www, http, gp, href, pack, ounce
amazon, product, price, order, store, great, buy, box, good, ordered
like, flavor, taste, just, good, drink, really, sweet, flavors, sugar
food, cat, dog, cats, eat, old, like, chicken, foods, loves
use, water, product, just, like, great, make, oil, salt, butter
great, free, snack, good, gluten, love, bars, healthy, fat, eat
tea, green, teas, flavor, black, like, drink, taste, ginger, good
coffee, cup, like, cups, flavor, good, taste, strong, blend, beans
chocolate, like, taste, cookies, good, milk, hot, sauce, flavor, just
yummy, yes, years, year, www, wrong, wouldn, worth, world, works


In [218]:
lr = LogisticRegression()

lr.fit(document_topics, amazon_df['Positive Review'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [219]:
print accuracy_score(amazon_df['Positive Review'], lr.predict(document_topics))
confusion_matrix(amazon_df['Positive Review'], lr.predict(document_topics))

0.64106


array([[ 1682, 16385],
       [ 1562, 30371]])

In [221]:
lda_model_50 = LatentDirichletAllocation(n_topics=50,
                                      learning_method = 'batch',
                                      max_iter=25)
lda_model_50.fit(X_train)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=25, mean_change_tol=0.001,
             n_jobs=1, n_topics=50, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [222]:
document_topics_50 = lda_model_50.transform(X_train)


lr = LogisticRegression()

lr.fit(document_topics_50, amazon_df['Positive Review'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [224]:
print accuracy_score(amazon_df['Positive Review'], lr.predict(document_topics_50))
confusion_matrix(amazon_df['Positive Review'], lr.predict(document_topics_50))

0.7001


array([[ 7002, 11065],
       [ 3930, 28003]])

In [225]:
tm_df = pd.DataFrame(lda_model_50.components_).T
tm_df['words'] = vect.get_feature_names()
for k in tm_df.keys():
    print ', '.join(tm_df.sort_values(by=k, ascending=False)['words'].tolist()[:10])

popcorn, corn, packs, pop, bowl, just, great, like, taste, use
bottle, energy, red, powder, bottles, drinks, gives, oz, drink, caffeine
food, dog, foods, dry, feed, ingredients, pet, dogs, quality, diet
eat, food, cats, cat, chicken, like, picky, ate, eats, just
product, recommend, highly, great, excellent, recommended, good, taste, love, definitely
products, quality, company, fast, product, items, service, high, shipping, delivery
tea, teas, drink, like, good, cup, taste, iced, flavor, love
bought, try, reviews, great, thought, decided, read, thanks, fantastic, did
use, oil, coconut, cooking, cook, great, recipe, used, love, taste
vanilla, beans, ground, starbucks, french, coffee, aroma, taste, flavored, smell
chocolate, dark, cocoa, hot, milk, rich, taste, sweet, like, best
green, white, black, caffeine, color, decaf, mild, high, fresh, compared
low, diet, pasta, version, regular, alternative, jar, calorie, difference, great
coffee, blend, strong, roast, bold, like, bitter, smooth, t

In [67]:
document_topics_50

NameError: name 'document_topics_50' is not defined