In [43]:
import warnings
warnings.filterwarnings("ignore")

from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

import os
#To mute cuda errors
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  

#https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e
import bz2
import pickle
import _pickle as cPickle

model_folder = "models"
base_folder = os.getcwd()
print(base_folder)

model_path = os.path.join(base_folder, model_folder)
data_path = os.path.join(base_folder, 'data')

if not os.path.exists(model_path):
    os.makedirs(model_path)

D:\z_ML\ML_project\NLP\Recommendation_system_books_recommeder


In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', -1)

df = pd.read_csv('data/books.csv')

print(df.shape)
df.columns

(11127, 13)


Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code', '  num_pages', 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher', 'Unnamed: 12'], dtype='object')

In [3]:
df.head(4)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
0,1,Harry Potter and the Half-Blood Prince (Harry Potter #6),J.K. Rowling/Mary GrandPré,4.57,439785960,9780440000000.0,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,
1,2,Harry Potter and the Order of the Phoenix (Harry Potter #5),J.K. Rowling/Mary GrandPré,4.49,439358078,9780440000000.0,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,
2,4,Harry Potter and the Chamber of Secrets (Harry Potter #2),J.K. Rowling,4.42,439554896,9780440000000.0,eng,352,6333,244,11/1/2003,Scholastic,
3,5,Harry Potter and the Prisoner of Azkaban (Harry Potter #3),J.K. Rowling/Mary GrandPré,4.56,043965548X,9780440000000.0,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,


## Using only selected columns

In [4]:
df = df[['bookID','title','authors','language_code','publisher', 'isbn', 'isbn13']]
df.head(2)

Unnamed: 0,bookID,title,authors,language_code,publisher,isbn,isbn13
0,1,Harry Potter and the Half-Blood Prince (Harry Potter #6),J.K. Rowling/Mary GrandPré,eng,Scholastic Inc.,439785960,9780440000000.0
1,2,Harry Potter and the Order of the Phoenix (Harry Potter #5),J.K. Rowling/Mary GrandPré,eng,Scholastic Inc.,439358078,9780440000000.0


In [5]:
df.isnull().values.any()

False

## Data cleaning

### Author

In [6]:
df['authors'] = df["authors"].apply(lambda x: ','.join([str(elem) for elem in x.split("/")]))
df.head(2)

Unnamed: 0,bookID,title,authors,language_code,publisher,isbn,isbn13
0,1,Harry Potter and the Half-Blood Prince (Harry Potter #6),"J.K. Rowling,Mary GrandPré",eng,Scholastic Inc.,439785960,9780440000000.0
1,2,Harry Potter and the Order of the Phoenix (Harry Potter #5),"J.K. Rowling,Mary GrandPré",eng,Scholastic Inc.,439358078,9780440000000.0


In [7]:
df['publisher'].value_counts()

Vintage                        318
Penguin Books                  261
Penguin Classics               184
Mariner Books                  150
Ballantine Books               144
                              ... 
Westholme Publishing           1  
University of Calgary Press    1  
Marlowe & Company              1  
University Press of America    1  
Sounds True                    1  
Name: publisher, Length: 2293, dtype: int64

### Language

In [8]:
df['language_code'].value_counts()

eng            8908
en-US          1409
spa            218 
en-GB          214 
fre            144 
ger            99  
jpn            46  
mul            19  
zho            14  
grc            11  
por            10  
en-CA          7   
ita            5   
lat            3   
enm            3   
rus            2   
swe            2   
ara            1   
wel            1   
gla            1   
tur            1   
9.78085E+12    1   
nor            1   
9.78156E+12    1   
glg            1   
msa            1   
nl             1   
9.78159E+12    1   
srp            1   
ale            1   
Name: language_code, dtype: int64

In [9]:
lang_list = ['en-US', 'en-GB', 'en-CA']
new_value = 'eng'

df['language_code'] = df['language_code'].replace(lang_list, new_value)
df['language_code'].value_counts()

eng            10538
spa            218  
fre            144  
ger            99   
jpn            46   
mul            19   
zho            14   
grc            11   
por            10   
ita            5    
enm            3    
lat            3    
rus            2    
swe            2    
wel            1    
9.78085E+12    1    
gla            1    
nor            1    
tur            1    
srp            1    
glg            1    
msa            1    
9.78156E+12    1    
9.78159E+12    1    
nl             1    
ara            1    
ale            1    
Name: language_code, dtype: int64

#### Droping all non english books

In [10]:
df[df['language_code'] != 'eng'].index

Int64Index([   50,   109,   146,   150,   201,   223,   255,   256,   257,   259,
            ...
            11112, 11113, 11114, 11115, 11116, 11117, 11118, 11119, 11120, 11126], dtype='int64', length=589)

In [11]:
df = df.drop(index=df[df['language_code'] != 'eng'].index)
df[df['language_code'] != 'eng'].index

Int64Index([], dtype='int64')

### Title

In [12]:
df['title'][0:20].values

array(['Harry Potter and the Half-Blood Prince (Harry Potter  #6)',
       'Harry Potter and the Order of the Phoenix (Harry Potter  #5)',
       'Harry Potter and the Chamber of Secrets (Harry Potter  #2)',
       'Harry Potter and the Prisoner of Azkaban (Harry Potter  #3)',
       'Harry Potter Boxed Set  Books 1-5 (Harry Potter  #1-5)',
       'Unauthorized Harry Potter Book Seven News: "Half-Blood Prince" Analysis and Speculation',
       'Harry Potter Collection (Harry Potter  #1-6)',
       "The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy  #1-5)",
       "The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)",
       "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)",
       "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)",
       "The Ultimate Hitchhiker's Guide (Hitchhiker's Guide to the Galaxy  #1-5)",
       'A Short History of Nearly

In [13]:
import re
def clean_data(sentence):
    
    #Remove @ sign and the characters followed by @sign
    sentence = re.sub('http://\S+|https://\S+', '', sentence)
    
    sentence = re.sub("\n|\r|'","",sentence)
    
    #Keep only numbers, text and %
    sentence  = re.sub('[^A-Za-z0-9-]+', ' ', sentence)  
    
    #Remove exta space between words
    sentence = re.sub(' +', ' ', sentence)
    
    #fix wrong spellings and return
    #sentence = TextBlob(sentence).correct() 
    
    sentence = sentence.strip()
    return str(sentence).lower()


df['title_clean'] = df['title'].apply(clean_data)
df['title_clean'][0:15].values

array(['harry potter and the half-blood prince harry potter 6',
       'harry potter and the order of the phoenix harry potter 5',
       'harry potter and the chamber of secrets harry potter 2',
       'harry potter and the prisoner of azkaban harry potter 3',
       'harry potter boxed set books 1-5 harry potter 1-5',
       'unauthorized harry potter book seven news half-blood prince analysis and speculation',
       'harry potter collection harry potter 1-6',
       'the ultimate hitchhikers guide five complete novels and one story hitchhikers guide to the galaxy 1-5',
       'the ultimate hitchhikers guide to the galaxy hitchhikers guide to the galaxy 1-5',
       'the hitchhikers guide to the galaxy hitchhikers guide to the galaxy 1',
       'the hitchhikers guide to the galaxy hitchhikers guide to the galaxy 1',
       'the ultimate hitchhikers guide hitchhikers guide to the galaxy 1-5',
       'a short history of nearly everything',
       'bill brysons african diary',
       '

#### Checking for null in title

In [14]:
df[df['title_clean'] == ''].index

Int64Index([], dtype='int64')

#### Checking for duplicates

In [15]:
print(df.shape)
print(df[df[['title_clean']].duplicated() == True].shape)
df[df[['title_clean','authors']].duplicated() == True].shape

(10538, 8)
(801, 8)


(327, 8)

In [16]:
df[df[['title_clean']].duplicated() == True].sort_values('title_clean')

Unnamed: 0,bookID,title,authors,language_code,publisher,isbn,isbn13,title_clean
327,1068,1776,"Peter Stone,Sherman Edwards",eng,Penguin Books,140481397,9780140000000.0,1776
8551,32830,20 000 Leagues Under The Sea,"Pauline Francis,Jules Verne",eng,Evans Brothers,237526883,9780240000000.0,20 000 leagues under the sea
5736,21426,2nd Chance (Women's Murder Club #2),"James Patterson,Melissa Leo,Jeremy Piven,Andrew Gross",eng,Little Brown & Company,1594831165,9781590000000.0,2nd chance womens murder club 2
1538,5328,A Christmas Carol,Charles Dickens,eng,Penguin Books,1580495796,9781580000000.0,a christmas carol
3731,13504,A Clash of Kings (A Song of Ice and Fire #2),"George R.R. Martin,Roy Dotrice",eng,Random House Audio,073930870X,9780740000000.0,a clash of kings a song of ice and fire 2
2417,8811,A Clockwork Orange,Anthony Burgess,eng,Penguin Books,014027409X,9780140000000.0,a clockwork orange
6631,24923,A Confederacy of Dunces,"John Kennedy Toole,Walker Percy",eng,Penguin Classics,141182865,9780140000000.0,a confederacy of dunces
2914,10797,A Farewell to Arms,Ernest Hemingway,eng,MacMillan Publishing Company,684174693,9780680000000.0,a farewell to arms
9146,35666,A Hard Day's Write: The Stories Behind Every Beatles Song,Steve Turner,eng,HarperResource,62736981,9780060000000.0,a hard days write the stories behind every beatles song
7122,27303,A History of God: The 4 000-Year Quest of Judaism Christianity and Islam,Karen Armstrong,eng,Ballantine Books,345384563,9780350000000.0,a history of god the 4 000-year quest of judaism christianity and islam


In [18]:
df[df[['title_clean','authors']].duplicated() == True].sort_values('title_clean')

Unnamed: 0,bookID,title,authors,language_code,publisher,isbn,isbn13,title_clean
9146,35666,A Hard Day's Write: The Stories Behind Every Beatles Song,Steve Turner,eng,HarperResource,62736981,9780060000000.0,a hard days write the stories behind every beatles song
7122,27303,A History of God: The 4 000-Year Quest of Judaism Christianity and Islam,Karen Armstrong,eng,Ballantine Books,345384563,9780350000000.0,a history of god the 4 000-year quest of judaism christianity and islam
11006,45149,A Painted House,John Grisham,eng,Arrow Books,99416158,9780100000000.0,a painted house
5124,18521,A Room of One's Own,Virginia Woolf,eng,Penguin Books,141183535,9780140000000.0,a room of ones own
7941,30410,A Secret Splendor,"Erin St. Claire,Sandra Brown",eng,Mira (Mills and Boon),1551660954,9781550000000.0,a secret splendor
1161,3870,A Short History of Nearly Everything,Bill Bryson,eng,Broadway Books,767923227,9780770000000.0,a short history of nearly everything
1893,6748,A Supposedly Fun Thing I'll Never Do Again: Essays and Arguments,David Foster Wallace,eng,Back Bay Books,316925284,9780320000000.0,a supposedly fun thing ill never do again essays and arguments
4773,17170,A Tale of Two Cities,"Charles Dickens,Gillen D'Arcy Wood",eng,Barnes Noble Classics,1593083327,9781590000000.0,a tale of two cities
600,1959,A Tale of Two Cities,"Charles Dickens,Gillen D'Arcy Wood",eng,Barnes Noble Classics,1593080557,9781590000000.0,a tale of two cities
10523,42878,A Walk in the Woods: Rediscovering America on the Appalachian Trail,Bill Bryson,eng,Bantam Doubleday Dell Publishing Group,767902513,9780770000000.0,a walk in the woods rediscovering america on the appalachian trail


#### Dropping all the tiles that has just number(non english)

In [19]:
df[df['title_clean'].str.isdigit()].index

Int64Index([326, 327, 1586, 4719], dtype='int64')

In [20]:
#https://stackoverflow.com/questions/62906472/remove-rows-from-pandas-dataframe-if-string-has-only-numbers
df = df[~df['title_clean'].str.isdigit()]

In [21]:
df[df['title_clean'].str.isdigit()].index

Int64Index([], dtype='int64')

#### Dropping all duplicated titles

In [22]:
df = df.drop_duplicates('title_clean', keep='last')
df.shape

(9734, 8)

In [23]:
print(df[df[['title_clean']].duplicated() == True].shape)

(0, 8)


### combining Authors and title columns

In [24]:
df.reset_index(inplace=True)

In [25]:
df['title_authors'] = + df['title_clean'] + df['authors'] 
df['title_authors'].head(2)

0    harry potter and the order of the phoenix harry potter 5J.K. Rowling,Mary GrandPré
1    harry potter boxed set books 1-5 harry potter 1-5J.K. Rowling,Mary GrandPré       
Name: title_authors, dtype: object

In [26]:
#new = df.drop(columns=['title','authors'])
books_df = df.copy()
books_df['title_authors'].head(2)

0    harry potter and the order of the phoenix harry potter 5J.K. Rowling,Mary GrandPré
1    harry potter boxed set books 1-5 harry potter 1-5J.K. Rowling,Mary GrandPré       
Name: title_authors, dtype: object

## Using the sentence transformer to create embedding 

In [27]:
from sentence_transformers import SentenceTransformer,util
sent_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')

sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.']

embeddings = sent_transformer_model.encode(sentences)

for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
    
emb1 = sent_transformer_model.encode("I am eating Apple")
emb2 = sent_transformer_model.encode("I like fruits")
print(emb1)

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173422e-02 -4.28515524e-02 -1.56286303e-02  1.40537536e-02
  3.95538025e-02  1.21796302e-01  2.94333193e-02 -3.17524150e-02
  3.54959927e-02 -7.93139860e-02  1.75878443e-02 -4.04369161e-02
  4.97259870e-02  2.54912600e-02 -7.18700439e-02  8.14968869e-02
  1.47070654e-03  4.79627885e-02 -4.50336076e-02 -9.92174894e-02
 -2.81769559e-02  6.45045489e-02  4.44670320e-02 -4.76217307e-02
 -3.52952480e-02  4.38671373e-02 -5.28565906e-02  4.33030975e-04
  1.01921506e-01  1.64072253e-02  3.26996259e-02 -3.45986933e-02
  1.21339215e-02  7.94870853e-02  4.58342116e-03  1.57778915e-02
 -9.68206488e-03  2.87626106e-02 -5.05806170e-02 -1.55793978e-02
 -2.87906677e-02 -9.62283742e-03  3.15556675e-02  2.27349307e-02
  8.71449634e-02 -3.85027267e-02 -8.84718373e-02 -8.75497889e-03
 -2.12343764e-02  2.08923724e-02 -9.02077556e-02 -5.25732711e-02
 -1.05638495e-02  2.88311169e-02 -1.61455218e-02  6.17837859e-03
 -1.23234

In [35]:
sent_vector = books_df['title_authors'].apply(sent_transformer_model.encode)

### Applying Cosine similarity to get similarity between the text embeddings

In [36]:
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)

Cosine-Similarity: tensor([[0.5398]])


In [37]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = util.cos_sim(sent_vector,sent_vector)
similarity

tensor([[ 1.0000,  0.7769,  0.4740,  ...,  0.1004,  0.2130,  0.0647],
        [ 0.7769,  1.0000,  0.4630,  ..., -0.0045,  0.1702,  0.0040],
        [ 0.4740,  0.4630,  1.0000,  ...,  0.0537,  0.1725,  0.0874],
        ...,
        [ 0.1004, -0.0045,  0.0537,  ...,  1.0000,  0.3921,  0.5149],
        [ 0.2130,  0.1702,  0.1725,  ...,  0.3921,  1.0000,  0.3424],
        [ 0.0647,  0.0040,  0.0874,  ...,  0.5149,  0.3424,  1.0000]])

## Testing the results

In [38]:
books_df[books_df['title'] == 'A Short History of Nearly Everything'].index[0]

913

In [39]:
def recommend(name):
    index = books_df[books_df['title'] == name].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(books_df.iloc[i[0]].title)
        
recommend('Harry Potter and the Chamber of Secrets (Harry Potter  #2)')

Harry Potter and the Sorcerer's Stone (Harry Potter  #1)
Harry Potter and the Order of the Phoenix (Harry Potter  #5)
Harry Potter and the Goblet of Fire (Harry Potter  #4)
Harry Potter and the Prisoner of Azkaban (Harry Potter  #3)
Harry Potter and the Half-Blood Prince (Harry Potter  #6)


## Write our model and data to a pickle file

In [44]:
def write_pickle(obj, fileName):
    pickle_file_path = os.path.join(model_path, fileName)
    pickle.dump(obj, open(pickle_file_path, "wb"))
    
#write_pickle(books_df,'book_list.pkl')
#write_pickle(similarity,'similarity.pkl')

## https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e
def compressed_pickle( data, title):
    with bz2.BZ2File(title + '.pbz2', 'w') as f: 
        cPickle.dump(data, f)
        
compressed_pickle(books_df,'book_list')
compressed_pickle(similarity,'similarity')

In [46]:
def decompress_pickle(file):
    data = bz2.BZ2File(file, 'rb')
    data = cPickle.load(data)
    return data

# Read our model file from compressed pickle
books_df = decompress_pickle('book_list.pbz2')
similarity = decompress_pickle('similarity.pbz2')