In [6]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv('master_books.csv')
print(df.shape)
df.head()

(19707, 6)


Unnamed: 0,title,author,rating,description,language,isbn
0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,"4.33 avg rating — 6,191,931 ratings",WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,0439023483 (ISBN13: 9780439023481)
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,"4.50 avg rating — 2,414,060 ratings",There is a door at the end of a silent corrido...,English,0439358078 (ISBN13: 9780439358071)
2,To Kill a Mockingbird,Harper Lee,"4.28 avg rating — 4,348,656 ratings",The unforgettable novel of a childhood in a sl...,English,
3,Pride and Prejudice,Jane Austen,"4.26 avg rating — 2,886,703 ratings",Alternate cover edition of ISBN 9780679783268S...,English,
4,"Twilight (Twilight, #1)",Stephenie Meyer,"3.59 avg rating — 4,815,701 ratings",About three things I was absolutely positive.\...,English,0316015849 (ISBN13: 9780316015844)


In [8]:
df['language'].value_counts()

English                                15960
Arabic                                   441
Spanish                                  278
French                                   220
German                                   160
Indonesian                               159
Portuguese                               114
Italian                                   90
Dutch                                     86
Turkish                                   77
Persian                                   54
Polish                                    41
Romanian                                  37
Malay                                     33
Czech                                     33
Bulgarian                                 31
Swedish                                   25
Japanese                                  24
Greek, Modern (1453-)                     21
Urdu                                      21
Russian                                   18
Bengali                                   17
Serbian   

In [9]:
df = df[df['language'] == 'English']
df.shape

(15960, 6)

In [10]:
df.isnull().sum()

title             0
author            0
rating            0
description     148
language          0
isbn           2754
dtype: int64

In [21]:
df['title'].value_counts()

The Gift                                                                                                                                                     6
The Stand                                                                                                                                                    5
Life in Outer Space                                                                                                                                          4
Whose God? Whose Land?: The Great Empires and The Making of the Modern Middle East (JUDAISM, CHRISTIANITY, ISLAM: THE THREE PILLARS OF MONOTHEISM Book 1)    4
Floors                                                                                                                                                       4
                                                                                                                                                            ..
Dragonfly                                     

In [24]:
seen = set()
def find_doubles(title, author, seen=seen):
    title_author = (title, author)
    if title_author not in seen:
        seen.add(title_author)
        return False
    else:
        return True

In [27]:
df['double'] = df.apply(lambda x: find_doubles(x.title, x.author), axis=1)

In [28]:
df['double'].value_counts()

False    12415
True      3545
Name: double, dtype: int64

In [32]:
mask = ~df['double']
df = df[mask]

In [40]:
df.isnull().sum()

title             0
author            0
rating            0
description     104
language          0
isbn           2025
double            0
dtype: int64

In [41]:
df = df.dropna(subset=['description'])
df.shape

(12311, 7)

In [42]:
import re
text = df.loc[0, 'description'].lower()
re.sub('\W+', ' ', text)

'winning means fame and fortune losing means certain death the hunger games have begun in the ruins of a place once known as north america lies the nation of panem a shining capitol surrounded by twelve outlying districts the capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual hunger games a fight to the death on live tv sixteen year old katniss everdeen regards it as a death sentence when she steps forward to take her sister s place in the games but katniss has been close to dead before and survival for her is second nature without really meaning to she becomes a contender but if she is to win she will have to start making choices that weight survival against humanity and life against love '

In [63]:
df['text'] = [re.sub('\W+', ' ', text.lower()) for text in df['description']]

In [11]:
import spacy

In [64]:
nlp = spacy.load('en_core_web_sm')

In [65]:
text = df.loc[:5, 'description']
text = ''.join(text).replace('\r', '').replace('\n', '')
text

'WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen regards it as a death sentence when she steps forward to take her sister\'s place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.There is a door at the end of a silent corridor. And it’s haunting Harry Pottter’s dreams. Why else would he be waking in the middle of the night, screaming 

In [66]:
from spacy.tokenizer import Tokenizer

tokenizer = Tokenizer(nlp.vocab)

In [67]:
def make_tokens(text, tokenizer=tokenizer):
    return [token.text for token in tokenizer(text) if token.is_stop == False] 

In [68]:
df['tokens'] = df['text'].apply(make_tokens)
df['tokens']

0        [winning, means, fame, fortune, losing, means,...
1        [door, end, silent, corridor, s, haunting, har...
2        [unforgettable, novel, childhood, sleepy, sout...
3        [alternate, cover, edition, isbn, 978067978326...
4        [things, absolutely, positive, edward, vampire...
                               ...                        
15589    [tradition, bury, heart, wounded, knee, stunni...
15590    [ , woman, year, history, karyukai, come, forw...
15591    [true, story, individual, s, struggle, self, i...
15592    [jack, reacher, hero, loner, soldier, soldier,...
15593    [won, fame, freedom, gory, pits, rome, s, colo...
Name: tokens, Length: 12311, dtype: object

In [58]:
%%timeit

tokens = []

for doc in tokenizer.pipe(df['text'], batch_size=100):
    
    doc_tokens = [token.text for token in doc if token.is_stop == False]
    
    tokens.append(doc_tokens)

6.32 s ± 176 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [54]:
%%timeit
df['tokens'] = df['text'].apply(make_tokens)

6.71 s ± 249 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [62]:
df['tokens']

0        [WINNING, MEANS, FAME, FORTUNE, LOSING, MEANS,...
1        [door, end, silent, corridor, s, haunting, Har...
2        [unforgettable, novel, childhood, sleepy, Sout...
3        [Alternate, cover, edition, ISBN, 978067978326...
4        [things, absolutely, positive, Edward, vampire...
                               ...                        
15589    [tradition, Bury, Heart, Wounded, Knee, stunni...
15590    [ , woman, year, history, karyukai, come, forw...
15591    [true, story, individual, s, struggle, self, i...
15592    [Jack, Reacher, Hero, Loner, Soldier, Soldier,...
15593    [won, fame, freedom, gory, pits, Rome, s, Colo...
Name: tokens, Length: 12311, dtype: object

In [None]:
df['lemmas'] = [token.lemma_ for token in ]

In [19]:
[token.lemma_ for token in doc if token.pos_ == 'VERB']

['mean',
 'mean',
 'have',
 'know',
 'lie',
 'shine',
 'surround',
 'keep',
 'force',
 'send',
 'participate',
 'regard',
 'step',
 'take',
 'mean',
 'become',
 'win',
 'will',
 'start',
 'make',
 'weight',
 '’',
 'would',
 'wake',
 'scream',
 'poison',
 'loom',
 'pale',
 'grow',
 'Must',
 'name',
 'can',
 'stop',
 'must',
 'discover',
 'depend',
 'rock',
 'kill',
 'become',
 'publish',
 'go',
 'win',
 'make',
 'win',
 'move',
 'kill',
 'take',
 'translate',
 'claim',
 'consider',
 'regard',
 'remain',
 'call',
 'appear',
 'sparkle',
 'dance',
 'make',
 'know',
 'may',
 'thirst',
 'hold',
 'will',
 'change',
 'pick',
 'hide',
 'leave',
 'begin',
 'play',
 'learn',
 'read',
 'steal',
 'find',
 'hide',
 'open',
 'close',
 'craft',
 'write',
 'burn',
 'win',
 'give',
 'publish']

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
vect = TfidfVectorizer()

data = [' '.join(x) for x in df['tokens']]

vect.fit(data)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [74]:
sparse = vect.transform(data)

dtm = pd.DataFrame(sparse.todense(), columns=vect.get_feature_names())

dtm.head()

Unnamed: 0,00,000,001,007,01,0140422536,0141183829,02,020,03,...,धर,मपद,美しさと哀しみと,ﬁerce,ﬁnd,ﬁred,ﬁrst,ﬂame,ﬂavorful,ﬂavors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
from sklearn.neighbors import NearestNeighbors

In [81]:
nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')

In [82]:
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [85]:
new = 'A magical boy fights evil with his friends at school'
new_doc = make_tokens(new)
new_vected = vect.transform(new_doc)
new_df = pd.DataFrame(new_vected.todense(), vect.get_feature_names())
nn.kneighbors(new_df)

ValueError: Shape of passed values is (6, 59664), indices imply (59664, 59664)