In [63]:
import pandas as pd
from tokenize import tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

In [25]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michalpurtak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
stop_words = set(stopwords.words('english'))

In [64]:
books_df = pd.read_csv("books.csv")

In [56]:
def remove_stop_words(row):
    return [w for w in row['abstract'] if not w in stop_words and not w in string.punctuation]

In [27]:
def tokenize(row):
    return word_tokenize(row['abstract'])

In [28]:
def stemWords(row):
    porter = SnowballStemmer("english", ignore_stopwords=True)
    words = []
    for word in row['abstract']:
        words.append(porter.stem(word))
    return words

In [29]:
tokenized_abstracts = books_df.apply(tokenize, axis=1)

In [30]:
books_tokenized = list(zip(books_df['name'], tokenized_abstracts))

In [31]:
books_tokenized_df = pd.DataFrame({'name':books_df['name'], 'abstract':tokenized_abstracts})

In [12]:
books_tokenized_df.to_csv("books_tokenized.csv")

In [32]:
stemmed_abstracts = books_tokenized_df.apply(stemWords, axis = 1)

In [34]:
stemmed_abstracts[:5]

0    [1066, and, all, that, :, a, memor, histori, o...
1    [1066, and, all, that, :, a, memor, histori, o...
2    [2010, :, odyssey, two, is, a, 1982, scienc, f...
3    [a, crown, of, sword, (, abbrevi, as, aco, by,...
4    [publish, on, 15, april, 1755, and, written, b...
dtype: object

In [43]:
books_tokenized_stemmed_df = pd.DataFrame({'name':books_tokenized_df['name'], 'abstract':stemmed_abstracts})

In [44]:
books_tokenized_stemmed_df[:5]

Unnamed: 0,name,abstract
0,1066 and All That,"[1066, and, all, that, :, a, memor, histori, o..."
1,1066 and All That: A Memorable History of Eng...,"[1066, and, all, that, :, a, memor, histori, o..."
2,2010: Odyssey Two,"[2010, :, odyssey, two, is, a, 1982, scienc, f..."
3,A Crown of Swords,"[a, crown, of, sword, (, abbrevi, as, aco, by,..."
4,A Dictionary of the English Language,"[publish, on, 15, april, 1755, and, written, b..."


In [57]:
stopwords_removed_abstracts = books_tokenized_stemmed_df.apply(remove_stop_words, axis=1)

In [58]:
stopwords_removed_abstracts[:5]

0    [1066, memor, histori, england, compris, part,...
1    [1066, memor, histori, england, compris, part,...
2    [2010, odyssey, two, 1982, scienc, fiction, no...
3    [crown, sword, abbrevi, aco, fan, seventh, boo...
4    [publish, 15, april, 1755, written, samuel, jo...
dtype: object

In [59]:
books_tokenized_stemmed_df = pd.DataFrame({'name':books_tokenized_df['name'], 'abstract':stopwords_removed_abstracts})

In [62]:
books_tokenized_stemmed_df.to_csv("books_tokenized_stemmed.csv")

In [92]:
search_index = dict()

In [69]:
book_names = books_tokenized_stemmed_df['name']

In [70]:
def fillIndexWithWordsFromAbstract(row):
    for word in row['abstract']:
        if word not in search_index:
            search_index[word] = dict()
        if row['name'] not in search_index:
            search_index[word][row['name']] = 1
        else:
            search_index[word][row['name']] += 1

In [90]:
no_documents = len(book_names)

In [91]:
documents = books_tokenized_stemmed_df['abstract']
documents[:5]

0    [1066, memor, histori, england, compris, part,...
1    [1066, memor, histori, england, compris, part,...
2    [2010, odyssey, two, 1982, scienc, fiction, no...
3    [crown, sword, abbrevi, aco, fan, seventh, boo...
4    [publish, 15, april, 1755, written, samuel, jo...
Name: abstract, dtype: object

In [None]:
number_of_doc = 0
for document in documents:
    for word in document:
        if

In [89]:
nothing = books_tokenized_stemmed_df.apply(fillIndexWithWordsFromAbstract, axis=1)

In [84]:
for key in search_index:
    print(key)

1066
memor
histori
england
compris
part
rememb
includ
103
good
thing
5
bad
king
2
genuin
date
tongue-in-cheek
rework
written
w.
c.
sellar
r.
j.
yeatman
illustr
john
reynold
first
appear
serial
punch
magazin
publish
book
form
methuen
co.
ltd.
1930
2010
odyssey
two
1982
scienc
fiction
novel
arthur
clark
sequel
1968
2001
space
continu
stori
stanley
kubrick
's
film
adapt
titl
rather
origin
differ
respect
set
year
plot
center
joint
soviet-american
mission
aboard
soviet
spacecraft
leonov
sever
object
salvag
spaceship
discoveri
investig
mysteri
``
monolith
''
discov
dave
bowman
nomin
hugo
award
best
1983
screen
peter
hyam
releas
1984
crown
sword
abbrevi
aco
fan
seventh
wheel
time
fantasi
seri
american
author
robert
jordan
tor
may
15
1996
consist
prologu
41
chapter
april
1755
samuel
johnson
dictionari
english
languag
sometim
among
influenti
dissatisfact
period
june
1746
group
london
booksel
contract
write
sum
1,500
guinea
£1,575
equival
£220,000
2017
took
near
nine
complet
work
although
claim


need
expens
equip
compet
relay
team
trace
olymp
776
bce
format
defin
western
north
level
feder
backbon
summer
foremost
iaaf
championship
incorpor
marathon
disabl
paralymp
ipc
ἀθλητής
athlētē
ἆθλον
athlon
prize
ἆθλος
athlo
i.e
feat
narrow
definit
came
promin
furthermor
romanc
relat
similar
synonym
usag
rare
racewalk
starman
jone
want
son
abbi
hoffman
countercultur
sixti
quarter
youth
activist
activ
advic
histoir
d'o
ipa
istwaʁ
erot
desclo
paulin
réage
jean-jacqu
pauvert
forti
jean
paulhan
admir
marqui
sade
share
latter
submiss
stranger
valentin
michael
born
martian
interact
with—and
transform
of—terrestri
exodus
2:22
heret
got
virginia
brainstorm
jungl
1894
instead
wolv
putnam
220,000-word
160,067
uncut
manuscript
disagre
superior
telegraphes
2012
congress
88
stuart
1945
e.
award-win
garth
″look
mous
everi
way″
andromeda
strain
1969
crichton
techno-thril
effort
outbreak
extraterrestri
microorgan
arizona
taocp
comprehens
monograph
donald
knuth
algorithm
project
conceiv
seven-volum
paperb

elf
aidan
rhiyana
crusad
hattin
leper
jerusalem
baldwin
saladin
albertin
1886
krohg
norway
christiania
unmarri
seamstress
prostitut
confisc
1888
suprem
upheld
pay
nok
trett
1885
politilægen
venteværels
1887
marcel
disparu
à
recherch
temp
perdu
fugit
sweet
cheat
jubi
smither
columbia
macleod
amazon/book
near-mainstream
brandon
dewild
patrisha
mclean
six-issu
casey
reev
gian
fernando
brian
brinle
anvit
randeria
cirqu
studio
pericl
arcana
superhero
supervillain
irrevoc
lose
1960
leo
herlihi
frankenheim
wayson
choy
peoni
giller
vancouv
chen
kiam-kim
oldest
jook-liang
jung-sum
sek-lung
adolesc
care
chinatown
amaz
grace
random
seventy-third
ambassador
sr.
ottoman
resign
woodrow
ghostwrit
burton
hendrick
comparison
file
extens
armenian
genocid
onset
americana
dispatch
hampton
delillo
unkind
crime-writ
inspector
wexford
13th
mwa
tree
armstrong
joel
fink
angler
cheney
vice
barton
gellman
misl
republican
high-rank
antarctica
antarct
station
apollyon
destroy
unleash
tim
lahay
jerri
b.
jenkin
mond

In [88]:
search_index['death']

{'Anne of Green Gables': 1,
 'Commentarii de Bello Gallico': 1,
 '(Commentaries on the Gallic War)': 1,
 'How Green Was My Valley': 1,
 'Mort': 1,
 'Northanger Abbey': 1,
 'Stranger in a Strange Land': 1,
 'The Book of the Courtier': 1,
 'The Hound of the Baskervilles': 1,
 'The Salmon of Doubt': 1,
 'L’Étranger': 1,
 'The Stranger or The Outsider': 1,
 'The Transmigration of Timothy Archer': 1,
 'The Wasp Factory': 1,
 'Wuthering Heights': 1,
 'The Great Betrayal': 1,
 'The Sands of Time': 1,
 'All Fall Down,': 1,
 'The Brandon deWilde Story': 1,
 'All Fall Down': 1,
 'Between Heaven and Hell': 1,
 'Blue Murder': 1,
 'Closing Time': 1,
 'Come as You Are: The Story of Nirvana': 1,
 'Death Star': 1,
 'Death in Venice': 1,
 'Der Tod in Venedig': 1,
 'Gangster': 1,
 'Joe Cinque’s Consolation: A True Story of Death, Grief and the Law': 1,
 'Man Overboard': 1}