### Load necessary library and data

In [238]:
import nltk
import numpy as np

In [239]:
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [240]:
wordnet_lemmatizer = WordNetLemmatizer()

In [241]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [242]:
# Load positive reviews
positive_reviews = BeautifulSoup(open('electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


In [243]:
# Load negative reviews
negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


### Preprocess the data

In [244]:
# Preprocess string before training
# Lower character
# Remove any character shorter than 2 
# Lemmatizer all the character: dogs -> dog, cats -> cat, goes -> go
def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

In [245]:
# Create a dictionary of all the words in the data
word_index_map = {}
current_index = 0

In [246]:
# Store all the token of all the comment
positive_tokenized = []
negative_tokenized = []

In [247]:
# Store all the words appear in the data into a list 
# and tokenized all the reviews in positive data  
for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [248]:
# Store all the words appear in the data into a list 
# and tokenized all the reviews in negative data 
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [249]:
# Apply word propotion method words-frequency
# Calculate the propotion words appear in data
# devide by the size of data 
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1       # Increase counter i
    x = x / x.sum()
    x[-1] = label
    return x

In [250]:
# N is the size of numpy array which contents all the vector of each review
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))

In [251]:
i = 0 # Set counter to 0
# Convent each group of token from each reviews
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

In [252]:
data.shape

(2000, 11093)

In [253]:
# Convert each group of token from each reviews
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [254]:
# Shuffle data for training 
np.random.shuffle(data)

In [255]:
print("Original size", data.shape)
print("Training size", data[:, :-1].shape)

Original size (2000, 11093)
Training size (2000, 11092)


In [256]:
X = data[:, :-1]    # all the row and the first 11093 column
Y = data[:, -1]     # all the row The last column

In [257]:
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

### Machine Learning Model

#### Logistic Regression

In [258]:
# Logistic Regression
LR = LogisticRegression()
LR.fit(Xtrain, Ytrain)
print("Logistic Regression classification rate:", LR.score(Xtest, Ytest))

Logistic Regression classification rate: 0.65


In [259]:
# Threshold value decide the Sentiment rate 
threshold = 0.5
for word, index in word_index_map.items():
    weight = LR.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

warranty -0.577320384044
feature 0.503702385763
fast 0.771839857476
've 0.67337619002
terrible -0.529478338588
month -0.805190715226
price 2.6637129792
comfortable 0.631453084567
ha 0.799258526513
lot 0.655052121207
sound 1.13800133476
home 0.525865732247
waste -0.919747631143
poor -0.759891452804
happy 0.660890632225
return -1.20421731491
love 1.21005478241
you 1.12447554264
customer -0.706681118369
bad -0.751199072748
quality 1.56718645934
look 0.585492569654
value 0.506814737099
space 0.53768361092
highly 1.03120148399
company -0.520721391629
memory 1.01231271583
pro 0.502628118489
item -0.9917285519
size 0.518584040261
week -0.713301599986
video 0.561755803343
recommend 0.717107897174
returned -0.812924300571
then -1.0764183491
bit 0.636573847156
tried -0.743941366671
time -0.659286765074
junk -0.555538268232
n't -1.95968693354
buy -0.78999630309
refund -0.622865568012
excellent 1.38544264181
perfect 0.964921260731
doe -1.18071397875
expected 0.579667816927
software -0.527650000256

In [260]:
LR.predict([Xtest[0]])

array([ 0.])

#### Naive Bayes

In [261]:
from sklearn.naive_bayes import MultinomialNB

In [262]:
NB = MultinomialNB()
NB.fit(Xtrain, Ytrain)
print("Naive Bayes classification rate:", NB.score(Xtest, Ytest))

Naive Bayes classification rate: 0.77


In [263]:
# Threshold value decide the Sentiment rate 
threshold = 0.5
for word, index in word_index_map.items():
    weight = NB.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

wdth -9.37245922145
rocked -9.39598971886
di-52 -9.39598971886
elite -9.39598971886
flexibe -9.39598971886
specification -9.31855166382
flying -9.32392186982
infrared -9.39598971886
t4800 -9.37493630966
puncture -9.38642026785
dlink -9.39598971886
browser -9.38930073071
configurator -9.38782640822
ruggedness -9.38755085022
imitation -9.39598971886
alwayz -9.39598971886
ten -9.35166509379
japan -9.34178149676
2500a -9.32188174671
warrnty -9.38506064833
cardstock -9.39598971886
equalizer -9.31777756075
trapped -9.38442889646
e3c -9.35509816154
automagically -9.37189216728
_maximum_ -9.39598971886
diagonally -9.39598971886
co-channel -9.39022601415
dangerous -9.3881465414
justice -9.37660249158
'break -9.39001955188
instant -9.36689539553
aac -9.38892255164
equipo -9.32757117495
leather -9.25676840862
1100 -9.39598971886
pres -9.39598971886
crazy -9.36310877572
course -9.16956964197
blurb -9.39149532927
nightly -9.39008999674
cd-r -9.21184257527
puppy -9.3881465414
s40 -9.32188174671
effe

closing -9.39598971886
tighted -9.39598971886
assign -9.38153366246
wasnt -9.27292962611
antiseptic -9.39288894118
smell -9.35153795629
copying -9.36911278212
glaring -9.39598971886
distinctly -9.38072224673
screen -8.30399691285
torn -9.36432355697
*lot* -9.36821015476
wonderfully -9.3495946215
soundblaster -9.23649619463
memorex -8.78850329417
'round -9.39395926131
site -9.08058193472
f***ed -9.39598971886
mdr-ex71 -9.39598971886
soft -9.28939090608
1st -9.21346146612
strength -9.35842708434
hotter -9.39598971886
escalated -9.39598971886
.company -9.39598971886
325 -9.39598971886
themselves -9.28887873761
controlling -9.39191639348
pink -9.24133761912
restocking -9.39598971886
flawed -9.35837285529
luck -9.28299192143
nice -8.30377855506
clone -9.38916375379
web -9.01733537637
auto-configuration -9.38930073071
excessive -9.39598971886
dismal -9.39598971886
complicated -9.34099614072
bordered -9.39598971886
earphone -8.77476825026
flawlessly -9.051926874
1gb -9.26262648584
differentia

toughskins -9.39598971886
stretched -9.26502932216
appears -9.31901672109
quick -8.96549204484
369.00 -9.39598971886
godsend -9.39284010996
router -8.78703163605
sennheiser -9.29220044877
e3s -9.35250460692
belive -9.38506064833
ilink -9.37189216728
pentium -9.38567784314
handle -9.23340676717
npr -9.38983585329
horribly -9.39117043243
product-wise -9.33536509705
'boomier -9.38755085022
sunny -9.3788952855
non-tech -9.39598971886
36.95 -9.38782640822
linkysys -9.36700218199
fullest -9.37067191088
price -7.06551924316
dependable -9.34719955469
1.25 -9.3842936791
administrator -9.39598971886
experiencing -9.39598971886
benefit -9.37617946571
day..radio -9.35962207469
lapdesk -9.35824939088
regarding -9.28333625175
120dpi -9.38623354392
radio.the -9.37859797615
list -9.08887057097
sixth -9.39598971886
series2 -9.39598971886
r200/r300/r320 -9.37972919799
italy -9.37578701155
freaked -9.39598971886
neatest -9.37493630966
rep -9.39188293691
foobar2000/lame -9.36932147178
rebranded -9.3865999

won`t -9.38882122938
utter -9.38755085022
adorama -9.36424102055
formed -9.35516772434
e20s -9.34946970323
resoldered -9.39598971886
asleep -9.3822908745
aug -9.39598971886
retainer -9.39598971886
understand -9.19483436758
3rd -9.28729311359
stuttering -9.39598971886
usage -9.19475634508
economical -9.38072224673
roadtrip -9.39598971886
pavilion -9.36328684878
kid -9.00740460444
overly -9.34869078588
rage -9.39598971886
ferret -9.38642026785
mx400 -9.39598971886
sarcasm -9.39598971886
it`s -9.38170376162
.surprise -9.39598971886
mach -9.39059887023
loc=3 -9.39076777488
smoker -9.39395926131
16mb -9.33302476217
reset -9.33087470821
palmtop -9.39598971886
tweaked -9.38975916911
gentle -9.39598971886
technology -9.28290071298
confidently -9.39598971886
firebird -9.39395926131
furniture -9.39598971886
broadside -9.38256669853
jiggling/adjusting -9.39598971886
kong -9.39202931765
happy -8.23019551904
cushioning -9.38506064833
desire -9.35851397614
win98se -9.39598971886
enthusiast -9.371701

surely -9.32142358454
rack -9.38143916235
orbit -9.37159826574
beneath -9.39598971886
weekly -9.39598971886
sleeve -9.30505053359
t-style -9.39598971886
drum-n-bass -9.39598971886
white/violet -9.39598971886
night -9.08966340178
crumbled -9.39598971886
pip -9.36594964264
writing -9.33492281863
month-old -9.39598971886
forums/blogs -9.39598971886
redial -9.35676900571
stale -9.39100217735
9-5pm -9.39598971886
4.0 -9.38659997851
mp3-cd -9.37797121336
overcome -9.38456102304
exhausted -9.39598971886
feasible -9.2906292032
aisle -9.39598971886
gilsson -9.3788952855
flawlessy -9.39598971886
decrease -9.35918185023
rivaling -9.37618709157
rush -9.39598971886
consummes -9.38356719886
engaging -9.39598971886
sweat -9.30897834187
hard-drives -9.39598971886
day-to-day -9.39598971886
hired -9.37618709157
miniaturized -9.39598971886
mid-sized -9.37189216728
annoying -9.17036149397
manually -9.33635390157
isqueez -9.36932147178
sportapro/portapro -9.39251145449
revolution -9.37830862861
reflector -

a** -9.39191639348
unanswered -9.39598971886
pessimistic -9.39598971886
value -8.56435095271
3/4 -9.37112584625
extension -9.10574523344
metro -9.39598971886
internet -8.97756808505
key-symbol -9.39598971886
bump -9.35811800659
gmrs -9.39598971886
hire -9.39598971886
expectation -8.98157443357
calculate -9.39598971886
extremely -8.9065855413
at -9.39598971886
gui -9.3809518415
plug/jack -9.39598971886
fan -8.99849536338
shoreline -9.39598971886
arrgghh -9.3499919526
dialtone -9.38274449211
uploading -9.39598971886
cry -9.39598971886
10,000 -9.38792186631
inconsistent -9.39598971886
windy -9.35962207469
6500b -9.37990358111
pretend -9.36089839905
'recently -9.39598971886
1.2.0 -9.39598971886
retracts -9.33536509705
beer -9.36013624225
failure -9.27930614294
dealbreaker -9.39598971886
data -8.72574236615
condenser -9.38415526122
inevitable -9.39288894118
stunned -9.39598971886
arrange -9.33801539473
magmount -9.39598971886
relavant -9.39598971886
hd-457 -9.38882122938
engine -9.35031536

differnt -9.39598971886
harley -9.39598971886
ohm -9.29229726031
resting -9.37300020064
verry -9.31594701119
dull -9.36322364168
saca -9.35824939088
mine -8.91799648896
realized -9.36041182179
video -8.2486398053
chose -9.09110269296
party -9.2819028683
curt -9.39598971886
moreover -9.39598971886
bind -9.39598971886
vague -9.39598971886
1100mah -9.38975916911
served -9.39598971886
cant -9.27845280676
2mg -9.38356719886
baffle -9.39598971886
half-second -9.39598971886
booming -9.39117043243
movie -8.80674666154
ourselves -9.38782640822
visiting -9.3842936791
hardware/software -9.39598971886
unfotunately -9.37998937752
500- -9.38938903483
into.xm -9.37859797615
recent -9.33999244217
2-3x -9.37829014176
incompatible -9.37288072435
fantasy -9.35828364937
kos -9.29162817749
generaly -9.36089839905
simply -9.09361614659
256 -9.26157088679
wpc54gs -9.38139091944
trasnfer -9.39117043243
toddler -9.32109598028
w/answering -9.36319989604
wasting -9.37744267796
overcast -9.39598971886
knock-offs 

chunk -9.39598971886
supportive -9.37351686301
irritant -9.39598971886
feed -9.35961104842
nuance -9.38308631403
pace -9.38200721815
ahold -9.39598971886
blew -9.35303909346
removeable -9.38603938801
710ul -9.39598971886
grinding -9.33145119773
charting -9.39598971886
interuppted -9.39598971886
skip -9.32475078129
slim -9.16857254152
supprised -9.38401352782
doing -8.90905659331
earth-shaking -9.38782640822
winzip -9.39598971886
swinging -9.39598971886
boo -9.39598971886
pixima -9.39598971886
shopped -9.3515786671
exterior -9.34135743302
attractiveness -9.39225139675
sychronized -9.39598971886
auction -9.38341093666
chart -9.39598971886
turrble -9.39598971886
keychain -9.39598971886
disable -9.39598971886
author -9.39598971886
goflight -9.37245922145
anyways -9.35227775476
pickup -9.39598971886
surprisingly -9.20740580434
notepad -9.32699684738
axim -9.23551670266
glowing -9.34511738849
hella -9.36859074467
36xbr800 -9.35962207469
belkin.com -9.38274449211
-cds -9.38642026785
steelbar 

sdsdh-1024-901 -9.34192249759
sunrocket -9.32699684738
delorme -9.38180508387
woowho -9.26245832624
canon -8.64494347459
lasting -9.29795790125
well-conceived -9.39598971886
speck -9.38356719886
can`t -9.38882122938
exorbitant -9.38855474038
razr -9.32794968213
tvhd -9.39598971886
padded..i -9.39598971886
tunedok -9.35078428209
w/a -9.38238406681
carted -9.38916375379
4750 -9.39598971886
maufacturer -9.39598971886
ss-mf650h -9.38681534259
vivaldi -9.39202931765
serving -9.37351686301
tight -9.18281111736
fianlly -9.39598971886
recomend -9.14636021095
dragging -9.39598971886
gal -9.39598971886
reliable..well -9.17284616755
tail -9.39598971886
guitar -9.30341622525
rock-bottom -9.39598971886
smashed -9.39598971886
strongly -9.18342118293
2003 -9.37133077952
~15 -9.39598971886
anyhow -9.35620722548
carr -9.10830764641
54mbs -9.39598971886
deserved -9.39598971886
ilove -9.39598971886
tunejuice -9.3546045027
s041727 -9.38072224673
undisclosed -9.39598971886
x-wing -9.38356719886
canopy -9.3

invalidate -9.39598971886
langsing -9.39251145449
jose -9.39598971886
domestic -9.39598971886
ipod -7.80605903226
v3.0_20 -9.38200347689
requires -9.30224318838
fvs338 -9.39598971886
unresponsive -9.39598971886
calender -9.37401081214
ashamed -9.38755085022
minor -9.04570190077
refresh -9.39255918377
sdsdqu-1024-e10m -9.32699684738
puede -9.36319989604
easiest -9.30067953906
scratched -9.28548026858
set-up -9.09229875289
touchpoints -9.39598971886
doorstop -9.39598971886
50mb -9.39598971886
distinct -9.39598971886
set -8.22325990302
flush -9.38341093666
include -9.36836536522
wheel -9.15945278318
3.11 -9.38755085022
impact -9.3695940606
misnamed -9.39598971886
asus -9.38882122938
undone -9.39598971886
m105 -9.37797121336
frankly -9.36428065857
phone/pda -9.39598971886
entry -9.32186746897
received -8.82592482836
cx2610 -9.37694152389
exercising -9.28544037634
caught -9.33766096145
documentation -9.38782481174
auto-routing -9.38493988268
flutuates -9.39598971886
complaint..very -9.37657

tray -9.34653904121
btw -9.34582734872
advice -9.33075715188
ccradio -9.38755085022
roller -9.37680161581
earpeice -9.39598971886
knob -9.08832621184
shortcut -9.39598971886
inform -9.39598971886
encouraged -9.39598971886
hotas -9.32188174671
loss -9.30711042939
slight -9.28024445661
hn110 -9.39598971886
consolidate -9.39598971886
medialife -9.38849904713
9-12 -9.35153795629
ignition -9.39598971886
intutive -9.39598971886
com -9.39598971886
obtaibed -9.36424102055
reformating -9.39598971886
reporting -9.39598971886
alows -9.39117043243
dialog-301 -9.39598971886
announced -9.37797121336
brookstone -9.39598971886
proven -9.33256789243
fooling -9.39598971886
brian -9.37522772741
omen -9.38983585329
popular -9.32112868098
intruder -9.39598971886
full- -9.385518419
inconsistently -9.39598971886
esactly -9.31594701119
legal -9.39598971886
hear -8.74697500912
safe -9.26829957972
recently -9.03716462191
insistent -9.39598971886
planed -9.10830764641
mission -9.39598971886
amozon -9.39598971886

except -9.03770215901
decreased -9.38651097491
affect -9.3734663834
naivete -9.39598971886
sufficiently -9.34946970323
based -9.14471556964
hide -9.39225139675
failure/malfunctioning -9.39598971886
regrettably -9.39598971886
475. -9.36150354279
soundcards -9.36859074467
p100 -9.39598971886
hard-nosed -9.39598971886
'intelligent -9.39598971886
mmorpg -9.39598971886
email-friendly -9.39598971886
cite -9.38892255164
pitched -9.39598971886
animated -9.39177918633
opt -9.37578701155
operation -9.17166877114
wrist -9.13517915884
visually -9.38341093666
hardtime -9.38469016361
total -9.26870910841
cleaner -9.33644237018
softly -9.39598971886
locally -9.20304106949
adjustment -9.20403865696
fly -9.34584567385
faster -9.06115139818
occuring -9.39598971886
altough -9.33844676358
8.4 -9.39139200961
bomb -9.37797121336
mazatlan -9.39598971886
stellar -9.36319989604
xtrememac -9.39598971886
rhinoskin -9.39598971886
emptying -9.36319989604
chip-hardly -9.39598971886
1.11 -9.38755085022
misfeeds -9.3

#### Support Vector Machine

In [59]:
from sklearn import svm

In [60]:
SVM = svm.LinearSVC()
SVM.fit(Xtrain, Ytrain)
print("SVN classification rate:", SVM.score(Xtest, Ytest))

SVN classification rate: 0.76


In [61]:
# Threshold value decide the Sentiment rate 
threshold = 0.5
for word, index in word_index_map.items():
    weight = SVM.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

delivery 0.707575576141
plastic -0.526746993601
else -0.974301564702
warranty -1.45977819018
coming -0.563607482212
microphone -0.644405746512
pleased 0.977389268192
feature 1.26962060285
fast 2.27555729567
cheap -1.31394102315
poorly -0.640271194616
coolpix -0.561564752039
've 1.7625169274
avoid -0.618396555619
traveling 0.636372596205
output 0.638732150808
read -0.590440272441
screen 0.733328547806
bass 0.545538630408
carry 0.581303843087
satisfied 0.949789031497
terrible -1.63880808656
died -1.07947731139
freeze -0.504868785201
completely -0.513969648592
response -0.602881862156
awesome 1.03392809279
nicely 0.539644581335
car 0.69400667755
handy 0.992275092792
plus 0.808180620336
sent -1.20170773064
gps 0.560876919498
apple -0.793990110334
replace -0.581509302097
pay -1.02886561128
jack -0.645161209363
month -1.45372449112
router -0.572199935589
handle -0.546633277542
price 4.35464170334
port 0.646460360134
expensive 0.781970193723
disappointed -1.48170848434
card -0.797153352425
po

#### Decision Tree

In [63]:
from sklearn import tree

In [64]:
DT = tree.DecisionTreeClassifier()
DT.fit(Xtrain, Ytrain)
print("Decision Tree classification rate:", DT.score(Xtest, Ytest))

Decision Tree classification rate: 0.69


### Test with real data
The prediction is 1 mean happy, 0 mean not

In [224]:
string_text = "It is and does exactly what the description said it would be and would do. Couldn't be happier with it."

In [225]:
string_token = my_tokenizer(string_text)

In [149]:
string_token

['doe', 'exactly', 'description', "n't", 'happier']

In [150]:
word_index_map['able']

432

In [151]:
vector_test = tokens_to_vector(string_token, 1)

In [152]:
vector_test

array([ 0.,  0.,  0., ...,  0.,  0.,  1.])

In [153]:
array_string_test = np.zeros((N, len(word_index_map) + 1))

In [154]:
array_string_test = vector_test

In [155]:
array_string_test = array_string_test[:-1]

In [156]:
array_string_test.size

11092

In [157]:
DT.predict([array_string_test])

array([ 1.])

### Apply Gensim Word2vec

In [161]:
import gensim
from gensim import corpora, models, similarities

In [215]:
sum_tokenized = positive_tokenized + negative_tokenized

In [234]:
model = gensim.models.Word2Vec(sum_tokenized, min_count=5, size= 50)

In [232]:
model['eat']

array([-0.03081443,  0.01060508,  0.07299113,  0.17690854, -0.1239821 ,
        0.04745823, -0.14856771, -0.0002048 ,  0.0836578 , -0.07962631,
        0.15526682,  0.02971914, -0.16316576, -0.01894819, -0.04032685,
        0.09709319,  0.01426258, -0.07769202, -0.01097766,  0.14594196,
        0.06261004, -0.14477016, -0.06880755, -0.0781102 , -0.10719097,
       -0.12754311, -0.01752081,  0.15823597,  0.05587537,  0.13424858,
       -0.05810081,  0.0308728 ], dtype=float32)

In [199]:
N

2000

In [200]:
## New Word2vec
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] = model[t]
    x[-1] = label
    return x

In [190]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        print(t," value :", i)
        x[i] += 1
        print("value array", x[i])
    print('HI')
    x = x / x.sum()
    x[-1] = label
    return x

In [191]:
tokens_to_vector(positive_tokenized[1], 1)

apc  value : 37
value array 1.0
back-ups  value : 38
value array 1.0
500  value : 39
value array 1.0
recommendation  value : 40
value array 1.0
employee  value : 41
value array 1.0
mine  value : 42
value array 1.0
apc  value : 37
value array 2.0
've  value : 43
value array 1.0
month  value : 44
value array 1.0
've  value : 43
value array 2.0
functioned  value : 45
value array 1.0
properly  value : 46
value array 1.0
unexpected  value : 47
value array 1.0
power  value : 6
value array 1.0
interruption  value : 48
value array 1.0
'll  value : 49
value array 1.0
gladly  value : 50
value array 1.0
arises  value : 51
value array 1.0
pro  value : 52
value array 1.0
plug  value : 53
value array 1.0
spacing  value : 54
value array 1.0
power  value : 6
value array 2.0
adapter  value : 55
value array 1.0
simple  value : 56
value array 1.0
design  value : 57
value array 1.0
cord  value : 58
value array 1.0
con  value : 59
value array 1.0
line  value : 60
value array 1.0
conditioning  value : 61
va

array([ 0.,  0.,  0., ...,  0.,  0.,  1.])

In [229]:
model.most_similar('happy')

[('manual', 0.9998918771743774),
 ('pretty', 0.9998819828033447),
 ('return', 0.999875545501709),
 ('try', 0.9998737573623657),
 ('free', 0.9998694658279419),
 ('purchase', 0.9998689889907837),
 ('le', 0.9998672008514404),
 ('week', 0.9998651146888733),
 ('light', 0.999855637550354),
 ('amazon', 0.9998555183410645)]

In [235]:
model.most_similar('happy')

[('pretty', 0.9998424053192139),
 ('free', 0.99983811378479),
 ('overall', 0.9998345971107483),
 ('return', 0.9998335838317871),
 ('try', 0.9998320937156677),
 ('course', 0.9998219609260559),
 ('time', 0.9998193979263306),
 ('le', 0.9998171925544739),
 ('purchase', 0.9998167157173157),
 ('own', 0.9998062252998352)]