# Recommendation System

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns


In [27]:

content_ratings = pd.read_csv('Ratings.csv')
content_users = pd.read_csv('Users.csv')
content_books = pd.read_csv('Books.csv')

  content_books = pd.read_csv('Books.csv')


In [28]:
content_books.head()


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [29]:
print("Null values in books data: \n", content_books.isnull().sum())


Null values in books data: 
 ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64


In [30]:
content_books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'Year-Of-Publication'], axis=1, inplace=True)

content_users.drop(['Location', 'Age'], axis=1, inplace=True)


In [31]:
content1 = pd.merge(content_ratings, content_books, on='ISBN')
result = pd.merge(content1, content_users, on='User-ID')

In [32]:
result.shape

(1031136, 6)

In [33]:
result['Features'] = result['Book-Title'] + ', ' + result['Book-Author'] + ', ' + result['Publisher']

In [34]:
result.head()


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Publisher,Features
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,Ballantine Books,"Flesh Tones: A Novel, M. J. Rose, Ballantine B..."
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,Ballantine Books,"Flesh Tones: A Novel, M. J. Rose, Ballantine B..."
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,Tor Books,"Ender's Game (Ender Wiggins Saga (Paperback)),..."
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,Vintage,"In Cold Blood (Vintage International), TRUMAN ..."
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,HarperCollins,Divine Secrets of the Ya-Ya Sisterhood : A Nov...


In [35]:
result['Publisher'].nunique()

16729

In [36]:
publisher_ratings = result.groupby('Publisher')['Book-Rating'].count().reset_index()

publisher_ratings = publisher_ratings.rename(columns={'Book-Rating': 'Total Ratings Count'})

top_publishers = publisher_ratings.nlargest(10, 'Total Ratings Count')

print(top_publishers)


                      Publisher  Total Ratings Count
1388           Ballantine Books                34724
11710                    Pocket                31989
1714   Berkley Publishing Group                28614
15861              Warner Books                25506
6557                  Harlequin                25027
1418               Bantam Books                23598
1414                     Bantam                20007
13594               Signet Book                19155
1246                       Avon                17352
11339             Penguin Books                17033


In [37]:
result.drop(['Book-Author', 'Publisher', 'User-ID', 'Book-Rating'], axis=1, inplace=True)


In [38]:
result.isnull().sum()


ISBN          0
Book-Title    0
Features      3
dtype: int64

In [39]:
result.dropna(subset=['Features'], inplace=True)

In [40]:
result['Features'] = result['Features'].apply(lambda x : str(x).lower())
result

Unnamed: 0,ISBN,Book-Title,Features
0,034545104X,Flesh Tones: A Novel,"flesh tones: a novel, m. j. rose, ballantine b..."
1,034545104X,Flesh Tones: A Novel,"flesh tones: a novel, m. j. rose, ballantine b..."
2,0812533550,Ender's Game (Ender Wiggins Saga (Paperback)),"ender's game (ender wiggins saga (paperback)),..."
3,0679745580,In Cold Blood (Vintage International),"in cold blood (vintage international), truman ..."
4,0060173289,Divine Secrets of the Ya-Ya Sisterhood : A Novel,divine secrets of the ya-ya sisterhood : a nov...
...,...,...,...
1031131,2862749796,Le Huit,"le huit, katherine neville, le cherche midi"
1031132,3788097000,Ludwig Marum: Briefe aus dem Konzentrationslag...,ludwig marum: briefe aus dem konzentrationslag...
1031133,0553571001,Christmas With Anne and Other Holiday Stories:...,christmas with anne and other holiday stories:...
1031134,0689822294,Heaven (Coretta Scott King Author Award Winner),heaven (coretta scott king author award winner...


In [41]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [42]:
def stem(text):
    arr = []
    for i in text.split():
        arr.append(ps.stem(i))
    return ' '.join(arr)

In [43]:
result['Features'].apply(stem)


0           flesh tones: a novel, m. j. rose, ballantin book
1           flesh tones: a novel, m. j. rose, ballantin book
2          ender' game (ender wiggin saga (paperback)), o...
3          in cold blood (vintag international), truman c...
4          divin secret of the ya-ya sisterhood : a novel...
                                 ...                        
1031131            le huit, katherin neville, le cherch midi
1031132    ludwig marum: brief au dem konzentrationslag k...
1031133    christma with ann and other holiday stories: a...
1031134    heaven (coretta scott king author award winner...
1031135    robot race (micro adv 6), david antoni kroft, ...
Name: Features, Length: 1031133, dtype: object

In [44]:
result.head()

Unnamed: 0,ISBN,Book-Title,Features
0,034545104X,Flesh Tones: A Novel,"flesh tones: a novel, m. j. rose, ballantine b..."
1,034545104X,Flesh Tones: A Novel,"flesh tones: a novel, m. j. rose, ballantine b..."
2,0812533550,Ender's Game (Ender Wiggins Saga (Paperback)),"ender's game (ender wiggins saga (paperback)),..."
3,0679745580,In Cold Blood (Vintage International),"in cold blood (vintage international), truman ..."
4,0060173289,Divine Secrets of the Ya-Ya Sisterhood : A Novel,divine secrets of the ya-ya sisterhood : a nov...


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

content_filtered = result[result['Book-Title'] == 'To Share a Sunset']
sample_size = 1000

additional_sample_size = sample_size - len(content_filtered)
content_additional_sampled = result[~result.index.isin(content_filtered.index)].sample(n=additional_sample_size, random_state=42)

content_sampled = pd.concat([content_filtered, content_additional_sampled])


tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_sampled = tfidf_vectorizer.fit_transform(content_sampled['Features'])

from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  
import numpy as np

def calculate_cosine_similarity_with_progress(tfidf_matrix):
    cosine_sim = np.zeros((tfidf_matrix.shape[0], tfidf_matrix.shape[0]))

    for i in tqdm(range(tfidf_matrix.shape[0]), desc="Calculating Cosine Similarity"):
        for j in range(tfidf_matrix.shape[0]):
            cosine_sim[i][j] = cosine_similarity(tfidf_matrix[i], tfidf_matrix[j])[0, 0]
    return cosine_sim

cosine_sim_sampled = calculate_cosine_similarity_with_progress(tfidf_matrix_sampled)




Calculating Cosine Similarity: 100%|██████████| 1000/1000 [05:14<00:00,  3.18it/s]


In [46]:
cosine_sim_df = pd.DataFrame(cosine_sim_sampled, index=content_sampled.index, columns=content_sampled.index)


In [48]:
from tabulate import tabulate
def recommend_book(book_title):
    book_indices = content_sampled[content_sampled['Book-Title'] == book_title].index
    print(f"Book indices: {book_indices}")
    
    if len(book_indices) == 0:
        print(f"Book with title '{book_title}' not found in sampled data.")
        return
    
    index_in_sampled = book_indices[0]
    
    print(f"Length of content_sampled: {len(content_sampled)}")
    print(f"Length of cosine_sim_sampled: {len(cosine_sim_sampled)}")
    print(f"Index in sampled data: {index_in_sampled}")
    print(f"Type of index in sampled data: {type(index_in_sampled)}")
    
    if index_in_sampled not in content_sampled.index:
        print("Error: Invalid index in sampled data.")
        return
    
    print("Unique indices in content_sampled:")
    print(content_sampled.index.unique())
    
    index_in_cosine_sim = content_sampled.index.get_loc(index_in_sampled)
    
    if index_in_cosine_sim >= len(cosine_sim_sampled) or index_in_cosine_sim < 0:
        print("Error: Invalid index for cosine similarity matrix.")
        return
    
    cosine_row = cosine_sim_df.loc[index_in_sampled]
    similar_books = sorted(enumerate(cosine_row), key=lambda x: x[1], reverse=True)[1:6]
    
    table_data = []
    for i in similar_books:
        recommended_index = i[0]
        recommended_title = content_sampled['Book-Title'].iloc[recommended_index]
        similarity_score = i[1]
        table_data.append([recommended_index, recommended_title, similarity_score])
    
    print(tabulate(table_data, headers=['Index', 'Recommended Book', 'Similarity Score'], tablefmt='grid'))

recommend_book('To Share a Sunset')


Book indices: Int64Index([186745], dtype='int64')
Length of content_sampled: 1000
Length of cosine_sim_sampled: 1000
Index in sampled data: 186745
Type of index in sampled data: <class 'numpy.int64'>
Unique indices in content_sampled:
Int64Index([186745, 320093, 237247, 504863,  82685,  46503, 245208, 235909,
            990022, 516874,
            ...
            593632, 341517, 917082, 758697, 319516, 919056, 237896, 474686,
            138351, 121752],
           dtype='int64', length=1000)
+---------+--------------------------------------------+--------------------+
|   Index | Recommended Book                           |   Similarity Score |
|     945 | Night Games (Love Spell Timeswept Romance) |           0.316604 |
+---------+--------------------------------------------+--------------------+
|     882 | The Love Potion (Time of Your Life)        |           0.276641 |
+---------+--------------------------------------------+--------------------+
|     589 | Snoops in the City   

In [49]:
content_sampled

Unnamed: 0,ISBN,Book-Title,Features
186745,0505520036,To Share a Sunset,"to share a sunset, sharice kendyl, love spell"
320093,0380564998,Jacob Have I Loved,"jacob have i loved, katherine paterson, avon b..."
237247,0684177730,The SKULL BENEATH THE SKIN,"the skull beneath the skin, p. d. james, scribner"
504863,2020580314,Panne de sens,"panne de sens, mouss benia, seuil"
82685,0515130044,The Attorney,"the attorney, steve martini, jove books"
...,...,...,...
919056,0671695304,"FOREVER : A Novel of Good and Evil, Love and Hope","forever : a novel of good and evil, love and h..."
237896,0440209986,Not Exactly a Brahmin,"not exactly a brahmin, susan dunlap, dell"
474686,0684857820,Lucky,"lucky, alice sebold, scribner"
138351,0533141435,The Other End of the Log: Memoirs of an Educat...,the other end of the log: memoirs of an educat...


In [50]:
recommend_book('The Testament')


Book indices: Int64Index([473686, 121752], dtype='int64')
Length of content_sampled: 1000
Length of cosine_sim_sampled: 1000
Index in sampled data: 473686
Type of index in sampled data: <class 'numpy.int64'>
Unique indices in content_sampled:
Int64Index([186745, 320093, 237247, 504863,  82685,  46503, 245208, 235909,
            990022, 516874,
            ...
            593632, 341517, 917082, 758697, 319516, 919056, 237896, 474686,
            138351, 121752],
           dtype='int64', length=1000)
+---------+--------------------+--------------------+
|   Index | Recommended Book   |   Similarity Score |
|     999 | The Testament      |           1        |
+---------+--------------------+--------------------+
|     717 | The King of Torts  |           0.519166 |
+---------+--------------------+--------------------+
|     180 | The Summons        |           0.500976 |
+---------+--------------------+--------------------+
|     578 | The Chamber        |           0.500976 |
+------

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Initialize a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

tfidf_matrix = tfidf_vectorizer.fit_transform(content_sampled['Features'])

num_topics = 5 
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

lda.fit(tfidf_matrix)

# Get the top words for each topic
feature_names = tfidf_vectorizer.get_feature_names_out()
num_top_words = 10 
topic_words = {}
for topic, comp in enumerate(lda.components_):
    word_idx = np.argsort(comp)[::-1][:num_top_words]
    topic_words[topic] = [feature_names[i] for i in word_idx]

# Print the top words for each topic
for topic, words in topic_words.items():
    print(f"Topic {topic + 1}: {' '.join(words)}")


Topic 1: books john grisham dell publishing bantam group harlequin book paperback
Topic 2: books amp penguin avon little usa paperback cat novel press
Topic 3: books st book publishing dean star martin group press novel
Topic 4: books book mary bantam ballantine life warner publishing le love
Topic 5: books warner mira press avon night steel danielle dell little


# Using K-Means

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Feature extraction (TF-IDF on combined features)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(content_sampled['Features'])

# Clustering
num_clusters = 10  
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

content_sampled['Cluster'] = kmeans.labels_

# Example: Recommend items from the same cluster as a given item
def recommend_from_cluster(book_title):
    cluster_label = content_sampled.loc[content_sampled['Book-Title'] == book_title, 'Cluster'].iloc[0]
    recommended_books = content_sampled.loc[content_sampled['Cluster'] == cluster_label, 'Book-Title'].tolist()
    return recommended_books




In [56]:
recommend_from_cluster('The Testament')


['Fine Things',
 'Time Flies',
 'Dreaming the Eagle (Scott, Manda. Boudica Trilogy.)',
 "Perfect Evil (Maggie O'Dell Novels (Paperback))",
 'Vanished',
 'The Gift',
 'Outlander',
 'Circle of Friends',
 'Mean High Tide',
 'Ramona the Brave (Ramona Quimby (Paperback))',
 'The Summons',
 'A Painted House',
 'The Street Lawyer',
 'The Firm',
 'Invasion of Privacy',
 'Roots',
 'The Horse Whisperer',
 'The Beautiful Stranger (The Rogues of Regent Street)',
 'Space Search',
 'Salad Herbs (Library of Culinary Arts)',
 'Sunset in St. Tropez',
 'Joshua and the City',
 'The Chamber',
 'The Story of Sacajawea, Guide to Lewis and Clark (Dell Yearling Biography)',
 'Accident',
 'One Summer',
 'Summer Sisters',
 "Sleeping Murder: Miss Marple's Last Case",
 'The Whisper of the Axe: A Novel',
 'CRY IN THE NIGHT, A',
 'Firefox Down',
 'The King of Torts',
 'Once in a Lifetime',
 'Deep Cover: The Inside Story of How Dea Infighting, Incompetence, and Subterfuge Lost Us the Biggest Battle of the Drug War',