In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('all')

from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.width', 200)

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[

In [2]:
books = pd.read_csv('Data/Books_rating.csv')

In [3]:
books.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [4]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB


In [5]:
books.shape

(3000000, 10)

In [6]:
books.isnull().sum()

Id                          0
Title                     208
Price                 2518829
User_id                561787
profileName            561905
review/helpfulness          0
review/score                0
review/time                 0
review/summary            407
review/text                 8
dtype: int64

In [52]:
books.dropna(subset=['review/text'], inplace=True)


In [53]:
books = books.drop('Price', axis=1)

KeyError: "['Price'] not found in axis"

In [9]:
books['Title'].nunique()

212403

In [65]:
random_books = pd.Series(books['review/text'].sample(4, random_state=7).values.tolist())
random_books_df = books[books['review/text'].isin(random_books)].reset_index(drop=True)

In [66]:
random_books

0    Curt Aubley has written the essential resource...
1    Being an avid reader of books and essays deali...
2    This is one in a set. The Hidden Years is very...
3    I got a kick out of reading this book. Bonner ...
dtype: object

In [67]:
random_books_df['review/text'].isnull().any()

False

In [68]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

In [69]:
def preprocess_sentences(text):
    text = text.lower()
    temp_sent =[]
    words = nltk.word_tokenize(text)
    tags = nltk.pos_tag(words)
    for i, word in enumerate(words):
        if tags[i][1] in VERB_CODES: 
            lemmatized = lemmatizer.lemmatize(word, 'v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        if lemmatized not in stop_words and lemmatized.isalpha():
            temp_sent.append(lemmatized)
          
    finalsent = ' '.join(temp_sent)
    return finalsent

In [70]:
random_books_df['review/text'] = random_books_df['review/text'].apply(preprocess_sentences)

In [78]:
random_books_df

Unnamed: 0,Id,Title,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1596057459,BLACK REBELLION: Five Slave Revolts (Cosimo Cl...,AH0PCGDRMC55O,Boris Yasdnilkov,0/0,3.0,1357862400,New Light On Dark Times,avid reader book essay deal slavery find publi...
1,130953881,Tuning & Sizing NT Server,,,1/1,5.0,904780800,The most authoritative resource yet available ...,curt aubley write essential resource system ad...
2,471739022,Empire of Debt: The Rise of an Epic Financial ...,ABN5K7K1TM1QA,Dennis Littrell,13/17,4.0,1155600000,"Entertaining and fun, especially for cynical c...",get kick read book bonner wiggin lampoon audac...
3,373226365,The Hidden Years (Hide and Seek #1) (Harlequin...,A30H2335OM7RD6,"apoem ""apoem""",4/6,5.0,1002844800,A winner. Waiting for more,one set hidden year start lovely lawyer cassid...


In [71]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(random_books_df['review/text'])

In [72]:
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0808915 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.03282287, 0.06564574, ..., 0.03282287, 0.02587793,
        0.03282287],
       [0.        , 0.        , 0.        , ..., 0.        , 0.1716883 ,
        0.        ]])

In [73]:
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [74]:
indices = pd.Series(random_books_df.index, index=random_books_df['review/text'])

In [75]:
indices = indices[~indices.index.duplicated(keep='last')]

In [76]:
def check_book(keyword):
    return [review for review in random_books_df['review/text'] if keyword in review]

In [79]:
check_book('avid')[:20]


['avid reader book essay deal slavery find publication particularly certainly add knowledge darkness slavery']