In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [2]:
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [4]:
positive_reviews = BeautifulSoup(open('electronics/positive.review', encoding='utf-8').read(), features="html5lib")
positive_reviews = positive_reviews.findAll('review_text')
positive_reviews[:1]

[<review_text>
 I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.
 
 I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.
 
 As always, Amazon had it to me in &lt;2 business days
 </review_text>]

In [5]:
negative_reviews = BeautifulSoup(open('electronics/negative.review', encoding='utf-8').read(), features="html5lib")
negative_reviews = negative_reviews.findAll('review_text')
negative_reviews[:1]

[<review_text>
 cons
 tips extremely easy on carpet and if you have a lot of cds stacked at the top
 
 poorly designed, it is a vertical cd rack that doesnt have individual slots for cds, so if you want a cd from the bottom of a stack you have basically pull the whole stack to get to it
 
 putting it together was a pain, the one i bought i had to break a piece of metal just to fit it in its guide holes.
 
 again..poorly designed... doesnt even fit cds that well, there are gaps, and the cd casses are loose fitting
 
 pros
 ..........
 i guess it can hold a lot of cds....
 </review_text>]

In [6]:
def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s) # 將字串改為tokens
    tokens = [t for t in tokens if len(t)>2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # 去除大小寫
    tokens = [t for t in tokens if t not in stopwords] # 去除 stopwords
    return tokens

In [7]:
# 先產生 word-to-index map 再產生 word-frequency vectors
# 同時儲存 tokenized 版本未來不需再做 tokenization
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

In [8]:
for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)#取出resultset中的文字
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)#取出resultset中的文字
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
print("len(word_index_map):", len(word_index_map))

len(word_index_map): 11082


In [9]:
def token_to_vector(tokens,label):
    x = np.zeros(len(word_index_map) + 1) # 最後一個元素是標記
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # 正規化數據提升未來準確度
    x[-1] = label
    return x

In [10]:
N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1) 矩陣 - 擺在一塊將來便於shuffle
data = np.zeros((N, len(word_index_map) + 1))
data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
i = 0
for tokens in positive_tokenized:
    xy = token_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = token_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [12]:
data

array([[0.02272727, 0.06818182, 0.02272727, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.08333333, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04545455, 0.        ,
        0.        ],
       [0.        , 0.05769231, 0.        , ..., 0.        , 0.01923077,
        0.        ]])

In [13]:
# shuffle data 創造 train/test splits
# 多次嘗試!
orig_reviews, data = shuffle(orig_reviews, data)

In [14]:
X = data[:,:-1]
Y = data[:,-1]
print('X: ',X)
print('Y: ',Y)

X:  [[0.         0.22222222 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.01724138 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.03529412 0.         ... 0.         0.         0.        ]
 [0.         0.03225806 0.         ... 0.         0.         0.        ]
 [0.         0.0625     0.         ... 0.         0.         0.        ]]
Y:  [1. 0. 0. ... 0. 0. 0.]


In [15]:
# 最後 100 列是測試用
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [16]:
from sklearn.ensemble import AdaBoostClassifier
Ada_model = AdaBoostClassifier()

In [17]:
Ada_model.fit(Xtrain,Ytrain)
print("Train accuracy:", Ada_model.score(Xtrain, Ytrain))
print("Test accuracy:", Ada_model.score(Xtest, Ytest))

Train accuracy: 0.8121052631578948
Test accuracy: 0.79


In [18]:
Log_model = LogisticRegression()
Log_model.fit(Xtrain, Ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
print("Train accuracy:", Log_model.score(Xtrain, Ytrain))
print("Test accuracy:", Log_model.score(Xtest, Ytest))

Train accuracy: 0.7757894736842105
Test accuracy: 0.7


In [20]:
from sklearn.naive_bayes import MultinomialNB
Mul_model = MultinomialNB()
Mul_model.fit(Xtrain,Ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
print("Train accuracy:", Mul_model.score(Xtrain, Ytrain))
print("Test accuracy:", Mul_model.score(Xtest, Ytest))

Train accuracy: 0.8647368421052631
Test accuracy: 0.82


In [22]:
# 列出每個字的正負 weight
# 用不同的 threshold values!
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = Mul_model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

purchased -8.294017043736527
this -5.857525284105261
unit -7.960842037842118
due -9.122293443530829
frequent -9.301534090077686
blackout -9.338776751824039
power -8.319179278667328
supply -9.16029475671369
bad -8.685466696150858
run -8.857648784198764
cable -7.644412090817295
modem -9.080716958392227
router -8.821463070065247
lcd -9.071899317241407
monitor -8.742793036645066
minute -8.781013835561367
time -7.518427917746379
save -8.922849765433705
shut -9.35622333158784
equally -9.27919904247734
electronics -9.222190995369642
receiving -9.324325130513365
clean -8.927486015915028
feel -8.787873072230278
investment -9.020629913271467
minor -9.045286532119524
compared -8.950674561194536
loss -9.306695060743138
valuable -9.373101494365114
data -8.793058861498848
failure -9.27889077429001
equipment -9.051639398005795
spike -9.36749781184628
irregular -9.373101494365114
amazon -7.887570659874919
business -9.15089352329933
day -8.139528560548106
apc -9.28762236772578
back-ups -9.3648026915504

admitt -9.32146637806345
humming -9.32146637806345
hissing -9.312661715698301
idle -9.302693263555526
ohtherwise -9.32146637806345
seen -8.909549151798528
try -8.878361700847963
energy -9.369598863813911
source -9.233154265298277
hope -9.2418969433497
gon -9.369598863813911
fry -9.350430295686285
usb -8.23271074488882
port -8.899414882846221
wattage -9.357927969670804
sprare -9.369598863813911
pci -9.289676135547108
slot -9.137802126640022
damn -9.369598863813911
7900 -9.369598863813911
gtx -9.369598863813911
graphic -9.330751115244627
trade -9.36170845742858
offs -9.369598863813911
reality -9.350992872164095
awfully -9.361644038529409
college -9.283044029197109
dorm -9.353734691874264
traditional -9.306600858252036
sharp -9.265262689217993
audio -8.71581485466258
play -8.5905984634369
final -9.375771722920993
fantasy -9.373776988357081
life -8.83842362101359
playing -9.029086521571006
playstation -9.375771722920993
console -9.335548258965039
downside -9.263285468504064
switch -8.83722

potret -9.32146637806345
ariba -9.32146637806345
hopi -9.357834022234325
bon -9.357834022234325
facil -9.357834022234325
pone -9.306600466787621
saca -9.357834022234325
via -9.019619122991678
systema -9.357834022234325
aki -9.357834022234325
complex -9.191243827010892
capable -9.22418725373658
expected -8.541399332462786
expecting -9.24050323968432
random -9.124124581712577
quirk -9.305016298544112
continued -9.27153417200779
none -8.991206610661617
whatsoever -9.12289979569948
earbuds -9.054202152322084
alswys -9.36890610313501
headphone -8.13162140928971
pair -8.714399925474174
bud -9.117567719316941
impressed -9.051987719389217
rounded -9.363684896043397
bass -8.555826051789298
treble -9.291798350834428
flexible -9.275951320082783
'case -9.36890610313501
tough -9.299848134681113
purse -9.29523339416285
backpack -8.93912039174495
kid -8.891933831614931
nascar -9.296879356519057
racing -9.30238838162023
reprogramable -9.308562973227543
ready -9.143207463798207
immediately -9.190543283

overlook -9.381551763121504
zoo -9.389225122538514
unzipping -9.389225122538514
safer -9.389225122538514
built -8.903467698802421
rain/snow -9.389225122538514
destroyed -9.361315217782343
yard -9.342514148192391
hopelessly -9.389225122538514
searching -9.31199082844704
ipo -9.213252793423218
16x -9.155039056191093
reccommend -9.344281055829622
medium -8.891682458669816
manufactured -9.347808776301308
soft -9.282709293775854
gray -9.379045048265962
material -9.207098369664806
leather -9.227764434457486
string -9.352822198183075
closure -9.379045048265962
dust -9.29201265426306
element -9.306125463475958
precious -9.350331373425613
meant -9.304119368569848
16-35mm -9.379045048265962
f/2.8l -9.379045048265962
usm -9.362784527394181
17-40mm -9.379045048265962
f/4l -9.379045048265962
10-22mm -9.379045048265962
f/3.5-4.5 -9.379045048265962
ef/efs -9.379045048265962
suggests -9.379045048265962
replacing -9.315014646280343
glass -9.326416135468348
e.g -9.319329927618256
hi-res -9.3388830177640

shop -9.33676206464286
wide -9.131171940839241
rock -9.137800524116864
techno/electronic -9.375771722920993
classical -9.316436849976725
locate -9.272411241725354
apprehensive -9.375771722920993
tight -9.144517273648804
boomy -9.346919660229787
soundstage -9.37012071330776
expansive -9.375771722920993
imaging -9.34577907170528
scan -9.255070437229364
local -9.12592846246316
floor -9.204352075132473
living -9.223872716605293
sep -9.374955063014436
region-free -9.374955063014436
code -9.241092869544568
dvd-224m -9.334949728400737
dvd-224 -9.354752355696917
hack -9.374955063014436
tray -9.346123672561419
title -9.351506994671134
appear -9.160572719465556
region -9.346977778149787
germany -9.374955063014436
arrow -9.351728540869287
raise -9.314482566850026
enter -9.212185017150285
planed -9.10789227776539
carr -9.10789227776539
compare -9.168573287966886
nine -9.281990855378915
tine -9.290213834559346
sturdily -9.290213834559346
execution -9.362784527394181
flawed -9.357957486645866
gaming

play/pause/next/prev -9.381389715225216
griffin -9.362539625352525
neither -9.308371987826035
charged -9.232040498793285
lasted -9.276345272528758
pump -9.364436303428569
consistant -9.384013527816096
1-2-3 -9.384013527816096
clearer -9.377219379386935
happier -9.22840381920232
view -9.214664655998787
320 -9.373595443498397
240 -9.354219164535259
archive -9.373595443498397
playable -9.373595443498397
opposed -9.323821064539292
tivo -9.23039053496739
acceptable -9.229696958182434
outlet -8.963631988074651
pixma -9.22452185958731
sol -9.277791314560789
marketed -9.387135481571308
spectacular -9.282615655328275
toslink -9.387135481571308
dts -9.361008305607674
'boomier -9.387135481571308
tad -9.36365257377247
crisper -9.387135481571308
dolby -9.361008305607674
gladiator -9.383428515778217
honestly -9.387135481571308
1010 -9.387135481571308
watt -9.371990978232667
add -8.894117228349543
closer -9.387135481571308
physically -9.373597720499987
macbook -9.346395042894951
incredible -9.1816067

dry -9.272138660779703
instantly -9.268972487710178
r200 -9.386685402799927
edge -9.364869292267544
superdrive -9.386685402799927
firewire -9.363631999435954
writer -9.386685402799927
dmr-e55 -9.386685402799927
recommends -9.386685402799927
encountered -9.361960195992774
sanyo -9.386685402799927
apex -9.386685402799927
ps2 -9.283115140433539
134 -9.386685402799927
webcam -9.28615402275177
cancellation -9.378479916857872
interchangeability -9.378479916857872
fusion -9.378479916857872
technical -9.31457900833564
gieco -9.376156264360072
alien -9.369415111345305
champion -9.376156264360072
bored -9.376156264360072
uncle -9.376156264360072
admit -9.300110817890891
parent -9.376156264360072
tiny -9.238951568735185
developed -9.326262550326943
carpel -9.351122587646339
tunnel -9.326262550326943
activity -9.229641158266258
gone -9.174493954147316
movable -9.351122587646339
action -9.301188357539175
splice -9.376156264360072
a/v -9.340635007244716
retaining -9.376156264360072
i.e. -9.376156264

include -9.367949996577442
customize -9.391836028106566
his/her -9.391836028106566
and/or -9.314590171442944
aesthetically -9.391836028106566
pleasing -9.391836028106566
rectangular -9.375635692002039
attractiveness -9.391836028106566
reproducing -9.391836028106566
sound-granted -9.391836028106566
paired -9.362955106724645
competent -9.382390654776353
qualm -9.391836028106566
hidden -9.385888071743404
marking -9.391836028106566
belong -9.381065234778049
to-a -9.391836028106566
hide -9.391836028106566
serious -9.341437919379283
onkyo -9.391836028106566
related -9.391836028106566
presently -9.391836028106566
mesh -9.391836028106566
tag-its -9.391836028106566
target -9.368034988432806
audience -9.391836028106566
affluent -9.391836028106566
unable -9.281824858171385
soundtrack -9.389809023468601
matrix -9.391836028106566
iii -9.391836028106566
snatch -9.391836028106566
ranging -9.391836028106566
techno -9.384529956779884
differentiating -9.391836028106566
drowning -9.391836028106566
mind -

shake -9.307340327724043
concern -9.346589549344296
subjective -9.36684416184688
sexy -9.392096085840848
*large* -9.392096085840848
adjustability -9.392096085840848
notch -9.332286096572041
fooled -9.392096085840848
sane -9.392096085840848
facing -9.360752408242924
dialogue -9.366326317532089
confident -9.385188449661689
cd-player -9.387206100546656
universal -9.368684653484193
seperate -9.382260019264898
seperately -9.387206100546656
describe -9.17144764841182
reciever -9.22072509995167
integrated -9.363370368555486
punchier -9.374068144996208
pandora -9.353014735798377
wondered -9.364251879088132
explorer -9.35057856732283
comforable -9.380536472852633
shame -9.369404632483787
glowing -9.344702019843416
chameleon -9.363978141700116
paint -9.35606772697338
improvment -9.380536472852633
skill -9.380536472852633
cheesy -9.380536472852633
gui -9.380536472852633
homemade -9.380536472852633
spends -9.380536472852633
razor -9.22700034342881
diamondback -9.21278209442653
mousing -9.378767231

cute -9.275515917887754
telling -9.32304800426365
sanso -9.384879061100424
outweighed -9.390212407075786
hardly -9.352763926780973
pressesd -9.390212407075786
3am -9.390212407075786
tuffwrap -9.390212407075786
accent -9.370317638918406
xtrememac- -9.390212407075786
shack -9.361034834411889
picky -9.320228561309376
lining -9.390212407075786
tunebase -9.236825961141285
bleed -9.390212407075786
nasty -9.390212407075786
surf -9.379796812859844
fingertip -9.390212407075786
gut -9.390212407075786
frustration -9.390212407075786
beaten -9.374068144996208
on-line -9.374068144996208
hewlett -9.374068144996208
packard -9.374068144996208
c3906a -9.374068144996208
payed -9.374068144996208
cartrige -9.33239544859564
twords -9.374068144996208
letting -9.374068144996208
allmost -9.374068144996208
reorder -9.374068144996208
arrives -9.367564516813108
bear -9.383609554832322
konk -9.385624019364004
tedious -9.385624019364004
involving -9.385624019364004
copied -9.36117292349984
removeable -9.38562401936

independant -9.384524514030588
deduction -9.384524514030588
transitioning -9.384524514030588
zone -9.384524514030588
explanation -9.357422584252797
grounded -9.384524514030588
envision -9.361672798541491
en-7100 -9.361672798541491
resisted -9.361672798541491
spending -9.207496952755731
dvp-s560d -9.37688221720502
higher-res -9.37688221720502
surpassed -9.34998386880959
gotton -9.341507128946896
flooring -9.341507128946896
700p -9.341507128946896
n70 -9.357834022234325
favourite -9.34895721420149
str-de898/b -9.380306878086385
plated -9.380306878086385
miscalculated -9.380306878086385
needle -9.37720491659267
hulk -9.380306878086385
tank -9.380306878086385
private -9.34573816084776
ryan -9.380306878086385
gunfight -9.380306878086385
distinctly -9.380306878086385
carefully -9.355053152811534
codecs -9.378182607505304
converted -9.359459774502666
archos -9.352649305500139
330 -9.138239434154684
progam -9.36480269155042
emergenies -9.36480269155042
hung -9.36480269155042
abroad -9.38897366

assisted -9.376526155246479
training -9.376526155246479
aggravating -9.376526155246479
woe -9.376526155246479
unpowered -9.376526155246479
unamplified -9.376526155246479
on-off -9.372584831992473
heartbeat -9.372584831992473
weighing -9.376156264360072
gram -9.376156264360072
liek -9.376156264360072
ringtone -9.376156264360072
ringtones -9.376156264360072
phillips -9.373101494365114
co -9.373101494365114
fallen -9.373101494365114
sliding -9.3185602271035
glue -9.373101494365114
readjusting -9.373101494365114
changin -9.338415936377224
lenmar -9.32146637806345
faulty -9.34556392964251
refund -9.267399156793175
10in -9.370256542232882
2hr -9.370256542232882
***nice -9.370256542232882
battery*** -9.370256542232882
likethe -9.376526155246479
transformer -9.365311890644863
amps.. -9.376526155246479
shoulld -9.376526155246479
plug.. -9.376526155246479
restraint -9.376526155246479
wedge -9.376526155246479
backin -9.376526155246479
tie -9.376526155246479
brick-style -9.38414565439355
digging -

hitting -9.392424741314276
'space -9.392424741314276
tap -9.38928502130961
godsend -9.392424741314276
d-pads -9.392424741314276
ne -9.392424741314276
atari -9.392424741314276
accustom -9.392424741314276
wasd -9.392424741314276
nostromo -9.38928502130961
immerse -9.392424741314276
great..and -9.351122587646339
sweat -9.308562973227543
9-12 -9.351122587646339
pearl -9.32658147873022
8100 -9.32658147873022
esmartbuy -9.32658147873022
re-image -9.372043852806978
norton -9.372043852806978
ghost -9.372043852806978
gx620 -9.349054334582279
cat6 -9.372043852806978
obtains -9.374955063014436
maintain -9.34243091654146
satilite -9.359206706046297
warms -9.359206706046297
camcorder/digital -9.368175376029058
7050 -9.368175376029058
swicth -9.341507128946896
di-704 -9.341507128946896
isp -9.368175376029058
slow-response -9.368175376029058
10d -9.32146637806345
30d -9.2524735065765
s40 -9.32146637806345
1600 -9.32146637806345
jpgs -9.32146637806345
1270 -9.32658147873022
1900 -9.379574008870732
var

8825. -9.395574350217172
unsatisfactory -9.395574350217172
kx-tg6700b -9.395574350217172
chintzy -9.395574350217172
responsibility -9.395574350217172
consolidating -9.395574350217172
890 -9.395574350217172
disappointment -9.395574350217172
fooling -9.395574350217172
juggle -9.395574350217172
cypress -9.395574350217172
at2lp -9.395574350217172
rc42 -9.395574350217172
drive- -9.395574350217172
pickup -9.395574350217172
cs50 -9.395574350217172
invariably -9.395574350217172
exposed -9.395574350217172
window/door -9.395574350217172
cubicle -9.395574350217172
deteriorates -9.395574350217172
ntsc -9.395574350217172
30-ohm -9.395574350217172
ineffective -9.395574350217172
shielding -9.395574350217172
low-conductance -9.395574350217172
wreck -9.395574350217172
chromatic -9.395574350217172
aberration -9.395574350217172
off-color -9.395574350217172
blotching -9.395574350217172
highlight -9.395574350217172
murky -9.395574350217172
shadow -9.395574350217172
mismatching -9.395574350217172
bleeding -

insistent -9.395574350217172
refurbish -9.395574350217172
thereof -9.395574350217172
jumpdrives -9.395574350217172
jumpdrive -9.395574350217172
keychain -9.395574350217172
multiswitch -9.395574350217172
154.00 -9.395574350217172
crapped -9.395574350217172
wor -9.395574350217172
caramel -9.395574350217172
veiled -9.395574350217172
indistinct -9.395574350217172
abundant -9.395574350217172
stax -9.395574350217172
superclean -9.395574350217172
clinical -9.395574350217172
mdr-w20g -9.395574350217172
auditioning -9.395574350217172
-small -9.395574350217172
-recording -9.395574350217172
-hold -9.395574350217172
-bugs -9.395574350217172
corrupt -9.395574350217172
f***ed -9.395574350217172
-software -9.395574350217172
-mic -9.395574350217172
-device -9.395574350217172
ds-2200 -9.395574350217172
smoke -9.395574350217172
ironic -9.395574350217172
4.99 -9.395574350217172
3.99 -9.395574350217172
blackberry -9.395574350217172
roadtrip -9.395574350217172
drama -9.395574350217172
million -9.3955743502

radi -9.395574350217172
shutoff -9.395574350217172
bel -9.395574350217172
rx65 -9.395574350217172
selfcal -9.395574350217172
requiered -9.395574350217172
valentine -9.395574350217172
//www.radarbusters.com/ -9.395574350217172
www.007radardetectors.com -9.395574350217172
www.consumersearch.com -9.395574350217172
surrounding -9.395574350217172
fat -9.395574350217172
us.you -9.395574350217172
spinning -9.395574350217172
369.00 -9.395574350217172
7/19/05 -9.395574350217172
birth -9.395574350217172
93.00 -9.395574350217172
pissed -9.395574350217172
neet -9.395574350217172
especiatlly -9.395574350217172
1:1 -9.395574350217172
sunday -9.395574350217172
monday -9.395574350217172
blaming -9.395574350217172
shipping/warehouse/etc -9.395574350217172
skypephone -9.395574350217172
postponement -9.395574350217172
p525 -9.395574350217172
gsm -9.395574350217172
ce-based -9.395574350217172
alergic -9.395574350217172
symbian -9.395574350217172
sonyericsson -9.395574350217172
pre -9.395574350217172
proti

2.09. -9.395574350217172
v2.09 -9.395574350217172
non-microsoft -9.395574350217172
halfway -9.395574350217172
sector -9.395574350217172
unformattable -9.395574350217172
ornament -9.395574350217172
embroidered -9.395574350217172
'magellan -9.395574350217172
royal -9.395574350217172
sew -9.395574350217172
fleece -9.395574350217172
tuck -9.395574350217172
eye-catching -9.395574350217172
steal-me -9.395574350217172
monogrammed -9.395574350217172
millenium -9.395574350217172
replacement/trade -9.395574350217172
hl-1440 -9.395574350217172
realtime -9.395574350217172
2-500 -9.395574350217172
crisscrossing -9.395574350217172
50-mile -9.395574350217172
automotive -9.395574350217172
boiled -9.395574350217172
xm-radio -9.395574350217172
2730 -9.395574350217172
unacceptably -9.395574350217172
lookup -9.395574350217172
stick-on -9.395574350217172
compounding -9.395574350217172
positioning -9.395574350217172
beanbag -9.395574350217172
clamp -9.395574350217172
fixed-base -9.395574350217172
fabricate 

In [23]:
# 找出歸類錯誤的例子
preds = Mul_model.predict(X)
P = Mul_model.predict_proba(X)[:,1] # p(y = 1 | x)

In [24]:
# 只列出最糟的
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)


Most wrong positive review (prob = 0.45163005589367905, pred = 0.0):

The Sandisk 512 MB Secure Digitial Ultra II (SDSDH-512-901) sent to me was not the item I ordered.  I returned the item, unopened

Most wrong negative review (prob = 0.5614391528442423, pred = 1.0):

I like the HP 96 because it can be used in many HP printers that use the HP 98,  but its a better deal than the HP 98 because it contains almost twice as much ink. 

HP's ink cartridges provide excellent performance,  but I think they are over priced 

