<b><h1>Books Recommendation System

### importing Required Libraries

In [4]:
import re
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<b><h3>Dataset Loading

In [6]:
# Define the file paths
books_path = "/content/drive/MyDrive/Dataset/Books.csv"
users_path = "/content/drive/MyDrive/Dataset/Users.csv"
ratings_path = "/content/drive/MyDrive/Dataset/Book-Ratings.csv"

# Read the CSV files
books = pd.read_csv(books_path, delimiter=';', on_bad_lines='skip', encoding='ISO-8859-1')
users = pd.read_csv(users_path, delimiter=';', on_bad_lines='skip', encoding='ISO-8859-1')
ratings = pd.read_csv(ratings_path, delimiter=';', on_bad_lines='skip', encoding='ISO-8859-1')

print("Books Data:    ", books.shape)
print("Users Data:    ", users.shape)
print("Books-ratings: ", ratings.shape)


Books Data:     (271360, 8)
Users Data:     (278858, 3)
Books-ratings:  (1149780, 3)


<b><h3>Pre-processing

<b>Books Dataset Pre-processing

In [7]:
print("Columns: ", list(books.columns))
books.head()

Columns:  ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [8]:
## Drop URL columns
books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [9]:
## Checking for null values
books.isnull().sum()

Unnamed: 0,0
ISBN,0
Book-Title,0
Book-Author,2
Year-Of-Publication,0
Publisher,2


In [12]:
books.loc[books['Book-Author'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
118033,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing


In [13]:
books.loc[books['Publisher'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,


In [14]:
#Replacing NUll value with "other"

books.at[118033 ,'Book-Author'] = 'Other'
books.at[187689 ,'Book-Author'] = 'other'
books.at[128890 ,'Publisher'] = 'Other'
books.at[129037 ,'Publisher'] = 'Other'

In [15]:
books.loc[books['Year-Of-Publication'] == 'DK Publishing Inc',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...


In [16]:
books.loc[books['Year-Of-Publication'] == 'Gallimard',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...


In [17]:
books.at[209538 ,'Publisher'] = 'DK Publishing Inc'
books.at[209538 ,'Year-Of-Publication'] = 2000
books.at[209538 ,'Book-Title'] = 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)'
books.at[209538 ,'Book-Author'] = 'Michael Teitelbaum'

books.at[221678 ,'Publisher'] = 'DK Publishing Inc'
books.at[221678 ,'Year-Of-Publication'] = 2000
books.at[209538 ,'Book-Title'] = 'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)'
books.at[209538 ,'Book-Author'] = 'James Buckley'

books.at[220731 ,'Publisher'] = 'Gallimard'
books.at[220731 ,'Year-Of-Publication'] = '2003'
books.at[209538 ,'Book-Title'] = 'Peuple du ciel - Suivi de Les bergers '
books.at[209538 ,'Book-Author'] = 'Jean-Marie Gustave Le ClÃ?Â©zio'

In [18]:
## Converting year of publication in Numbers
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(int)

In [19]:
print(sorted(list(books['Year-Of-Publication'].unique())))

[0, 1376, 1378, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908, 1909, 1910, 1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012, 2020, 2021, 2024, 2026, 2030, 2037, 2038, 2050]


In [20]:
## Replacing Invalid years with max year
count = Counter(books['Year-Of-Publication'])
[k for k, v in count.items() if v == max(count.values())]

[2002]

In [21]:
books.loc[books['Year-Of-Publication'] > 2021, 'Year-Of-Publication'] = 2002
books.loc[books['Year-Of-Publication'] == 0, 'Year-Of-Publication'] = 2002

In [22]:
# ## Uppercasing all alphabets in ISBN
books['ISBN'] = books['ISBN'].str.upper()

In [23]:
## Drop duplicate rows
print(books.drop_duplicates(keep='last', inplace=True))
books.reset_index(drop = True, inplace = True)

None


In [25]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


<b>Users Dataset Pre-processing

In [26]:
print("Columns: ", list(users.columns))
users.head()

Columns:  ['User-ID', 'Location', 'Age']


Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [27]:
## Checking null values
print(users.isna().sum())

User-ID          0
Location         0
Age         110762
dtype: int64


In [28]:
## Check for all values present in Age column
print(sorted(list(users['Age'].unique())))

[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0, 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0, 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0, 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0, 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 223.0, 226.0

In [29]:
required = users[users['Age'] <= 80]
required = required[required['Age'] >= 10]

In [30]:
mean = round(required['Age'].mean())
mean

35

In [31]:
users.loc[users['Age'] > 80, 'Age'] = mean    #outliers with age grater than 80 are substituted with mean
users.loc[users['Age'] < 10, 'Age'] = mean    #outliers with age less than 10 years are substitued with mean
users['Age'] = users['Age'].fillna(mean)      # Replacing  null values with mean
users['Age'] = users['Age'].astype(int)       #changing Datatype to int

### Seprating city, state, and country from location column


In [32]:
list_ = users.Location.str.split(', ')

city = []
state = []
country = []
count_no_state = 0
count_no_country = 0

for i in range(0,len(list_)):
    if list_[i][0] == ' ' or list_[i][0] == '' or list_[i][0]=='n/a' or list_[i][0] == ',':  #removing invalid entries too
        city.append('other')
    else:
        city.append(list_[i][0].lower())

    if(len(list_[i])<2):
        state.append('other')
        country.append('other')
        count_no_state += 1
        count_no_country += 1
    else:
        if list_[i][1] == ' ' or list_[i][1] == '' or list_[i][1]=='n/a' or list_[i][1] == ',':   #removing invalid entries
            state.append('other')
            count_no_state += 1
        else:
            state.append(list_[i][1].lower())

        if(len(list_[i])<3):
            country.append('other')
            count_no_country += 1
        else:
            if list_[i][2] == ''or list_[i][1] == ',' or list_[i][2] == ' ' or list_[i][2] == 'n/a':
                country.append('other')
                count_no_country += 1
            else:
                country.append(list_[i][2].lower())

users = users.drop('Location',axis=1)

temp = []
for ent in city:
    c = ent.split('/')
    temp.append(c[0])

df_city = pd.DataFrame(temp,columns=['City'])
df_state = pd.DataFrame(state,columns=['State'])
df_country = pd.DataFrame(country,columns=['Country'])

users = pd.concat([users, df_city], axis=1)
users = pd.concat([users, df_state], axis=1)
users = pd.concat([users, df_country], axis=1)

print(count_no_country)   #printing the number of countries didnt have any values
print(count_no_state)     #printing the states which didnt have any values

4659
16044


In [33]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User-ID  278858 non-null  int64 
 1   Age      278858 non-null  int64 
 2   City     278858 non-null  object
 3   State    278858 non-null  object
 4   Country  278858 non-null  object
dtypes: int64(2), object(3)
memory usage: 10.6+ MB


In [34]:
users.head()

Unnamed: 0,User-ID,Age,City,State,Country
0,1,35,nyc,new york,usa
1,2,18,stockton,california,usa
2,3,35,moscow,yukon territory,russia
3,4,17,porto,v.n.gaia,portugal
4,5,35,farnborough,hants,united kingdom


<b>Books-Ratings Dataset Pre-processing

In [35]:
print("Columns: ", list(ratings.columns))
ratings.head()

Columns:  ['User-ID', 'ISBN', 'Book-Rating']


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [36]:
## Checking for null values
ratings.isnull().sum()

Unnamed: 0,0
User-ID,0
ISBN,0
Book-Rating,0


In [37]:
## checking all ratings number or not
print(is_numeric_dtype(ratings['Book-Rating']))

True


In [38]:
## checking User-ID contains only number or not
print(is_numeric_dtype(ratings['User-ID']))

True


In [39]:
## checking ISBN
flag = 0
k =[]
reg = "[^A-Za-z0-9]"

for x in ratings['ISBN']:
    z = re.search(reg,x)
    if z:
        flag = 1

if flag == 1:
    print("False")
else:
    print("True")

False


In [40]:
## removing extra characters from ISBN (from ratings dataset) existing in books dataset
bookISBN = books['ISBN'].tolist()
reg = "[^A-Za-z0-9]"
for index, row_Value in ratings.iterrows():
    z = re.search(reg, row_Value['ISBN'])
    if z:
        f = re.sub(reg,"",row_Value['ISBN'])
        if f in bookISBN:
            ratings.at[index , 'ISBN'] = f

In [41]:
## Uppercasing all alphabets in ISBN
ratings['ISBN'] = ratings['ISBN'].str.upper()

In [42]:
## Drop duplicate rows
ratings.drop_duplicates(keep='last', inplace=True)
ratings.reset_index(drop=True, inplace=True)

In [43]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149776 entries, 0 to 1149775
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149776 non-null  int64 
 1   ISBN         1149776 non-null  object
 2   Book-Rating  1149776 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [44]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


<h3><b>Merging of all three Tables

<b>Merging Books, Users and Rating Tables in One

In [45]:
dataset = pd.merge(books, ratings, on='ISBN', how='inner')
dataset = pd.merge(dataset, users, on='User-ID', how='inner')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031609 entries, 0 to 1031608
Data columns (total 11 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   ISBN                 1031609 non-null  object
 1   Book-Title           1031609 non-null  object
 2   Book-Author          1031609 non-null  object
 3   Year-Of-Publication  1031609 non-null  int64 
 4   Publisher            1031609 non-null  object
 5   User-ID              1031609 non-null  int64 
 6   Book-Rating          1031609 non-null  int64 
 7   Age                  1031609 non-null  int64 
 8   City                 1031609 non-null  object
 9   State                1031609 non-null  object
 10  Country              1031609 non-null  object
dtypes: int64(4), object(7)
memory usage: 86.6+ MB


<b>Divide complete data on the basis of Implicit and Explicit ratings datasets

In [46]:
## Explicit Ratings Dataset
dataset1 = dataset[dataset['Book-Rating'] != 0]
dataset1 = dataset1.reset_index(drop = True)
dataset1.shape

(384074, 11)

In [47]:
## Implicit Ratings Dataset
dataset2 = dataset[dataset['Book-Rating'] == 0]
dataset2 = dataset2.reset_index(drop = True)
dataset2.shape

(647535, 11)

In [48]:
dataset1.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Age,City,State,Country
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5,35,timmins,ontario,canada
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8,35,other,other,other
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544,8,30,toronto,ontario,canada
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866,9,35,ottawa,other,other
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629,9,35,kingston,ontario,canada


<h2><b>Recommendation Systems

<h5><b> 1. Popularity Based (Top In whole collection)

In [49]:
# Function to get the top 5 popular books
def popularity_based_top_5(dataframe, books_df):
    # Group by ISBN and count the number of ratings, sort in descending order, and get the top 5
    data = pd.DataFrame(dataframe.groupby('ISBN')['Book-Rating'].count()).sort_values('Book-Rating', ascending=False).head(5)


    data.reset_index(inplace=True)
    # Merge on 'ISBN'
    result = pd.merge(data, books_df, on='ISBN')
    return result

print("Top 5 Popular Books in the Collection:")
top_books = popularity_based_top_5(dataset1, books)
top_books


Top 5 Popular Books in the Collection:


Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,316666343,707,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
1,971880107,581,Wild Animus,Rich Shapero,2004,Too Far
2,385504209,488,The Da Vinci Code,Dan Brown,2003,Doubleday
3,312195516,383,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA
4,60928336,320,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial


<h5><b>2. Popularity Based (Top In a given place)

In [50]:
def search_unique_places(dataframe, place):
    place = place.lower()

    if place in list(dataframe['City'].unique()):
        return dataframe[dataframe['City'] == place]
    elif place in list(dataframe['State'].unique()):
        return dataframe[dataframe['State'] == place]
    elif place in list(dataframe['Country'].unique()):
        return dataframe[dataframe['Country'] == place]
    else:
        return "Invalid Entry"

In [51]:
place = input("Enter the name of place: ")
data = search_unique_places(dataset1, place)

if isinstance(data, pd.DataFrame):
    data = popularity_based_top_5(dataset1, books)

data

Enter the name of place: Pakistan


Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,316666343,707,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
1,971880107,581,Wild Animus,Rich Shapero,2004,Too Far
2,385504209,488,The Da Vinci Code,Dan Brown,2003,Doubleday
3,312195516,383,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA
4,60928336,320,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial


<b><h5>3. Books by same author, publisher of given book name

In [52]:
def printBook(k, n):
    z = k['Book-Title'].unique()
    for x in range(len(z)):
        print(z[x])
        if x >= n-1:
            break

In [53]:
def get_books(dataframe, name, n):
    print("\nBooks by same Author:\n")
    au = dataframe['Book-Author'].unique()

    data = dataset1[dataset1['Book-Title'] != name]

    if au[0] in list(data['Book-Author'].unique()):
        k2 = data[data['Book-Author'] == au[0]]
    k2 = k2.sort_values(by=['Book-Rating'])
    printBook(k2, n)

    print("\n\nBooks by same Publisher:\n")
    au = dataframe['Publisher'].unique()

    if au[0] in list(data['Publisher'].unique()):
        k2 = pd.DataFrame(data[data['Publisher'] == au[0]])
    k2=k2.sort_values(by=['Book-Rating'])
    printBook(k2, n)

In [54]:
bookName = input("Enter the name of the book: ")
number = int(input("Enter the number of recommendations to display:"))


Enter the name of the book: Harry Potter and the Chamber of Secrets (Book 2)
Enter the number of recommendations to display:5


In [55]:
if bookName in list(dataset1['Book-Title'].unique()):
    d = dataset1[dataset1['Book-Title'] == bookName]
    get_books(d, bookName, number)  # Corrected parameter for 'get_books'
else:
    print("Invalid Book Name!!")



Books by same Author:

Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter y el cÃ¡liz de fuego
Harry Potter and the Order of the Phoenix (Book 5)
Harry Potter E la Camera Dei Segreti


Books by same Publisher:

Taking Chances (Heartland (Scholastic Paperback))
The Case of Hermie the Missing Hamster (A Jigsaw Jones Mystery, No 1)
The Sleeping Giant of Goll (Secrets of Droon, 6)
What Should I Do? (All About You)
Through the Hidden Door


# Algorithm Implementation

#### loading the downloaded clean dataset

In [59]:
import pandas as pd

# Loading the clean Dataset
dataset1 = pd.read_csv('/content/cleaned_dataset1.csv')
print(dataset1.shape)


(384074, 11)


In [61]:
# Remove duplicate books based on Book-Title and Book-Author columns
dataset1 = dataset1.drop_duplicates(subset=['Book-Title', 'Book-Author'])

In [62]:
dataset1.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Age,City,State,Country
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5,35,timmins,ontario,canada
9,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,2954,8,71,wichita,kansas,usa
11,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,35704,6,53,kansas city,missouri,usa
17,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,11676,9,35,other,other,other
34,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,52614,8,33,toccoa,ga.,usa


In [63]:
dataset1.shape

(139667, 11)

### Reducing data(due to colab crashing )

In [64]:
# Randomly sample 10,000 rows
dataset = dataset1.sample(n=80000, random_state=42)

# 1.Content based Filtering

In [65]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def content_based_recommender(book_name, n, dataframe):
    # Combine the book title, author, and publisher
    dataframe['combined'] = dataframe['Book-Title'] + ' ' + dataframe['Book-Author'] + ' ' + dataframe['Publisher']


    # Initialize the TF-IDF
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(dataframe['combined'])

    if book_name not in dataframe['Book-Title'].values:
        return f"Book '{book_name}' not found in the dataset."

    idx = dataframe[dataframe['Book-Title'] == book_name].index[0]

    # Compute cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

    similar_books_idx = cosine_sim.argsort()[-n-1:-1][::-1]

    # Return the titles, authors, and publishers of the most similar books
    return dataframe.iloc[similar_books_idx][['Book-Title', 'Book-Author', 'Publisher']]

In [67]:
book_name = input("Enter a book name: ")
number = 5
Content_based_recommendations = content_based_recommender(book_name, number, dataset1)

# book name (Postmarked Yesteryear: 30 Rare Holiday Postcards)

print("Content-Based Recommendations:")
Content_based_recommendations

Enter a book name: Postmarked Yesteryear: 30 Rare Holiday Postcards
Content-Based Recommendations:


Unnamed: 0,Book-Title,Book-Author,Publisher
47679,Viking's Woman,Heather Graham,Dell Publishing Company
295538,Love Not a Rebel,Heather Graham,Dell Publishing Company
227334,Golden Surrender,Heather Graham,Dell Publishing Company
230578,One Wore Blue,Heather Graham,Dell Publishing Company
212902,And One Wore Gray,Heather Graham,Dell Publishing Company


In [70]:
# Randucing data for collabrative Algorithm(due crasing)
dataset2 = dataset1.sample(n=20000, random_state=42)

In [71]:
dataset2.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Age,City,State,Country,combined
213632,816733872,"Can I Have a Stegosaurus, Mom? Can I? Please!?",Lois G. Grambling,1998,Troll Communications,26593,10,35,peoria,arizona,usa,"Can I Have a Stegosaurus, Mom? Can I? Please!?..."
262134,425156249,"The Doctor, the Murder, the Mystery: The True ...",Barbara D'Amato,1997,Berkley Publishing Group,83287,10,45,tulsa,oklahoma,usa,"The Doctor, the Murder, the Mystery: The True ..."
368978,671691929,INTOXICATION: LIFE IN PURSUIT OF ARTIFICIAL PA...,Ronald Ph.D. Siegel,1990,Pocket,161222,7,33,rochester,new york,usa,INTOXICATION: LIFE IN PURSUIT OF ARTIFICIAL PA...
202305,805041451,Bone Black: Memories of Girlhood,bell hooks,1996,Henry Holt &amp; Company,202963,10,32,carrboro,north carolina,usa,Bone Black: Memories of Girlhood bell hooks He...
337980,760742332,The Supple Workout Hips and Thighs,Lorna Lee Malcom,2003,Duncan Baird Publishers,92804,5,45,brooklyn park,minnesota,usa,The Supple Workout Hips and Thighs Lorna Lee M...


# 2. Collabraive Filtering

In [76]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def collaborative_item_based_recommender(book_name, n, dataframe):
    # Check if the book_name exists in the dataset
    if book_name not in dataframe['Book-Title'].values:
        return f"Book '{book_name}' not found in the dataset."

    # Create a user-item matrix with User-ID
    user_book_matrix = dataframe.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating', fill_value=0)


    # Check if the book exists in the user-item matrix
    if book_name not in user_book_matrix.columns:
        return f"Book '{book_name}' not found in the collaborative matrix."

    # Compute the cosine similarity between items (books)
    item_similarity = cosine_similarity(user_book_matrix.T)

    # Convert the similarity matrix into a DataFrame for easier manipulation
    similarity_df = pd.DataFrame(item_similarity, index=user_book_matrix.columns, columns=user_book_matrix.columns)

    # Find the most similar books to the input book
    similar_books = similarity_df[book_name].sort_values(ascending=False)[1:n+1]

    return similar_books
    print(user_book_matrix)

In [77]:
book_name = input("Enter a book name: ")
number = 5  # Number of recommendations
Collaborative_recommendations = collaborative_item_based_recommender(book_name, number, dataset2)

print("Collaborative Filtering Recommendations:")
print(Collaborative_recommendations)
####Dino Crisis 2: Prima's Official Strategy Guide

Enter a book name: Dino Crisis 2: Prima's Official Strategy Guide
Collaborative Filtering Recommendations:
Book-Title
The Little Dinosaurs of Ghost Ranch                                                                             1.0
The Nemesis Affair: A Story of the Death of Dinosaurs and the Ways of Science                                   1.0
The Descent of Man, and Selection in Relation to Sex : (With a new introduction by J.T. Bonner and R.M. May)    1.0
Discovering Country Walks in North London (Discovering)                                                         1.0
How to Draw Walt Disney Pictures Presents Dinosaurs                                                             1.0
Name: Dino Crisis 2: Prima's Official Strategy Guide, dtype: float64


In [78]:
def hybrid_recommender(book_name, n, content_dataframe, collaborative_dataframe):
    # Get content-based recommendations
    try:
        content_recommendations = content_based_recommender(book_name, n, content_dataframe)
        if isinstance(content_recommendations, str):
            content_titles = []
        else:
            content_titles = content_recommendations['Book-Title'].tolist()
    except Exception as e:
        print(f"Error in content-based recommender: {e}")
        content_titles = []

    # Get collaborative recommendations
    try:
        collaborative_recommendations = collaborative_item_based_recommender(book_name, n, collaborative_dataframe)
        if isinstance(collaborative_recommendations, str):
            collaborative_titles = []
        else:
            collaborative_titles = collaborative_recommendations.index.tolist()
    except Exception as e:
        print(f"Error in collaborative recommender: {e}")
        collaborative_titles = []


    recommendations = list(set(content_titles).union(set(collaborative_titles)))

    # Limit the recommendations to the top `n`
    recommendations = recommendations[:n]

    # Format the output
    formatted_output = "\nHybrid Recommendations:\n"
    for i, book in enumerate(recommendations, start=1):
        formatted_output += f"{i}. {book}\n"

    return formatted_output


book_name = "Postmarked Yesteryear: 30 Rare Holiday Postcards"  # Replace with a book title in your dataset
number = 5
recommendations = hybrid_recommender(book_name, number, dataset1, dataset2)

# Print the hybrid recommendations
print(recommendations)



Hybrid Recommendations:
1. One Wore Blue
2. And One Wore Gray
3. Viking's Woman
4. Love Not a Rebel
5. Golden Surrender



# 4.Bloom Filter

In [81]:
import hashlib
import numpy as np

class BloomFilter:
    def __init__(self, size, hash_count):
        """
        Initialize the Bloom Filter.
        :param size: Size of the Bloom filter bit array.
        :param hash_count: Number of hash functions to use.
        """
        self.size = size
        self.hash_count = hash_count
        self.bit_array = np.zeros(size, dtype=bool)

    def _hashes(self, item):
        """
        Generate multiple hash values for the given item.
        :param item: The item to hash.
        :return: A list of hash indices.
        """
        hashes = []
        for i in range(self.hash_count):
            hash_result = hashlib.md5((str(item) + str(i)).encode()).hexdigest()
            hash_index = int(hash_result, 16) % self.size
            hashes.append(hash_index)
        return hashes

    def add(self, item):
        """
        Add an item to the Bloom Filter.
        :param item: The item to add.
        """
        for hash_index in self._hashes(item):
            self.bit_array[hash_index] = True

    def check(self, item):
        """
        Check if an item might be in the set.
        :param item: The item to check.
        :return: True if the item might be in the set, False otherwise.
        """
        return all(self.bit_array[hash_index] for hash_index in self._hashes(item))



def apply_bloom_filter(dataset, column_name):
    """
    Apply Bloom Filter on a dataset column.
    :param dataset: The dataset (DataFrame).
    :param column_name: The column to apply the Bloom Filter on.
    """

    bloom_filter = BloomFilter(size=1000, hash_count=5)

    # Add all items in the dataset to the Bloom Filter
    for item in dataset[column_name]:
        bloom_filter.add(item)


    book_1 = input("Enter the first book title to check: ")
    book_2 = input("Enter the second book title to check: ")

    # Check and print results for each book
    for book in [book_1, book_2]:
        is_present_in_bloom_filter = bloom_filter.check(book)

        # Check if the item is really in the dataset (for accuracy)
        is_present_in_dataset = book in dataset[column_name].values

        # Print results
        if is_present_in_bloom_filter:
            if is_present_in_dataset:
                print(f"Item '{book}' is in the set and exists in the dataset.")
            else:
                print(f"Item '{book}' is not in the set")
        else:
            print(f"Item '{book}' is not in the set.")

    return bloom_filter

apply_bloom_filter(dataset1, "Book-Title")



Enter the first book title to check: The Lovely Bones
Enter the second book title to check: thats great
Item 'The Lovely Bones' is in the set and exists in the dataset.
Item 'thats great' is not in the set


<__main__.BloomFilter at 0x7a5ce8d5e770>

#  5. LSH (Locality Sensitive Hashing)

In [85]:
#!pip install datasketch

In [86]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasketch import MinHash, MinHashLSH

# Reduce dataset to 10000 rows randomly
df = dataset1.sample(n=10000, random_state=42).reset_index(drop=True)
#print(df.head(12))

# Combine relevant fields into a single string for LSH (e.g., Title and Author)
df["Text"] = df["Book-Title"] + " " + df["Book-Author"]+" " + df["Publisher"]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["Text"])

# Create MinHash objects for each book
minhashes = []
for i in range(tfidf_matrix.shape[0]):
    minhash = MinHash()
    for feature in tfidf_matrix[i].nonzero()[1]:
        minhash.update(str(feature).encode('utf8'))
    minhashes.append(minhash)

# Initialize MinHash LSH
lsh = MinHashLSH(threshold=0.1, num_perm=128)

# Add books to LSH index
for i, minhash in enumerate(minhashes):
    lsh.insert(f"Book-{i}", minhash)

# Query similar books for a given book (e.g., the 5th book)
query_index = 7
query_minhash = minhashes[query_index]
similar_books = lsh.query(query_minhash)

print(f"Top 5 books similar to '{df.iloc[query_index]['Book-Title']}' are:")
recommended_books = 0
for book_id in similar_books:
    if recommended_books >= 5:
        break
    book_index = int(book_id.split('-')[1])
    if book_index != query_index:  # Exclude the queried book itself
        print(f" - {df.iloc[book_index]['Book-Title']} by {df.iloc[book_index]['Book-Author']} and publisher is {df.iloc[book_index]['Publisher']}")
        recommended_books += 1


Top 5 books similar to 'School: The Story of American Public Education' are:
 - The Poetry of Cats by Samuel Carr and publisher is Borders Press
 - Cry of the Cat (Goosebumps Series 2000, No 1) by R. L. Stine and publisher is Apple (Scholastic)
 - French Leave by Sheri Cobb South and publisher is Prinny World Press
 - Poems Between Women by Emma Donoghue and publisher is Columbia University Press
 - The Matchmaker's Sister (Harlequin American Romance, 1010) by Karen Toller Whittenburg and publisher is Harlequin
