## Q. Build a recommender system by using cosine similarities score.

In [1]:
#Import required libraries
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [2]:
#read the file and data
data = pd.read_csv('book (1).csv',encoding= 'latin1', index_col=[0], names= ['User_ID','Book_Title','Book_Rating'],skiprows=1)
data.head()

Unnamed: 0,User_ID,Book_Title,Book_Rating
1,276726,Classical Mythology,5
2,276729,Clara Callan,3
3,276729,Decision in Normandy,6
4,276736,Flu: The Story of the Great Influenza Pandemic...,8
5,276737,The Mummies of Urumchi,6


In [3]:
#No of unique users
len(data.User_ID.unique())

2182

In [4]:
#No of unique book titles
len(data.Book_Title.unique())

9659

In [5]:
# Checking of any duplicate rows
data[data.duplicated()]

Unnamed: 0,User_ID,Book_Title,Book_Rating
5052,2152,Le nouveau soleil de Teur,7
7440,3757,The Magician's Tale,7


In [6]:
#Removing the duplicate rows
data = data.drop_duplicates().reset_index(drop= True)

In [7]:
#Creating a pivot table considering User_ID as indexx and Book_Titles as columns
user_books_data = data.pivot_table(index= 'User_ID',
                            columns= 'Book_Title',
                            values= 'Book_Rating').reset_index(drop=True)
user_books_data.head()

Book_Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [8]:
#updating the index using User_ID
user_books_data.index = data['User_ID'].unique()
user_books_data.head()

Book_Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
276726,,,,,,,,,,,...,,,,,,,,,,
276729,,,,,,,,,,,...,,,,,,,,,,
276736,,,,,,,,,,,...,,,,,,,,,,
276737,,,,,,,,,,,...,,,,,,,,,,
276744,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Imputing the NaN values with 0 
user_books_data.fillna(0,inplace=True)
user_books_data.head()

Book_Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
276726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### User based collaborative filtering

In [10]:
#Creating a similarity matrix based on the users
user_sim = 1-pairwise_distances(user_books_data.values,metric= 'cosine')

In [11]:
#User similarity matrix
user_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [12]:
#Creating a data frame out of the similarity matrix
user_sim_df = pd.DataFrame(user_sim)
user_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2172,2173,2174,2175,2176,2177,2178,2179,2180,2181
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#Updating the index and column names using the user_IDs
user_sim_df.index = data.User_ID.unique()
user_sim_df.columns = data.User_ID.unique()

In [14]:
user_sim_df.head()

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276729,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Updating the diagonals with 0, as similarity with itself is maximum and we dont want it to be considered in our validations
np.fill_diagonal(user_sim,0)
user_sim_df.head()

NameError: name 'np' is not defined

In [None]:
#checking user who rated maximum in each column
user_sim_df.idxmax(axis=1).head()

In [None]:
# checking user_ID 276736 and 276726 similarity
data[(data.User_ID == 276736) |(data.User_ID == 276726)] 

In [None]:
# checking user_ID 276736 and 276726 similarity
data[(data.User_ID == 276729) |(data.User_ID == 276726)] 

In [None]:
#Checking user 276729 and rated books
user1=data[data.User_ID == 276729]
user1.Book_Title

In [None]:
#Checking user 276726 and rated books
user2 = data[data.User_ID == 276726]
user2.Book_Title

In [None]:
#Merging the both users into one table to compare better
pd.merge(user1,user2, on = 'Book_Title', how = 'outer')

## Conclusion:  (Based on user based collaborative filtring)

##### Based on the above table, if we need to suggest a book title to user 276729, We will suggest "Classical Mythology"
##### Similarly we will suggest "Decision in Normandy" book to user 276726 as it was rated high out of all the books by the user 276729.

## Item based collaboration filtering

In [16]:
#Taking a book ratings
Clara_Callan_rating = user_books_data['Clara Callan']
Clara_Callan_rating.head()

276726    0.0
276729    0.0
276736    0.0
276737    0.0
276744    0.0
Name: Clara Callan, dtype: float64

In [17]:
#Checking the correlation of the book ratings with the all other books
similar_to_Clara_Callan = user_books_data.corrwith(Clara_Callan_rating)
similar_to_Clara_Callan.head()

Book_Title
 Jason, Madison &amp                                                    -0.000459
 Other Stories;Merril;1985;McClelland &amp                              -0.000459
 Repairing PC Drives &amp                                               -0.000459
'48                                                                     -0.000459
'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities   -0.000459
dtype: float64

In [18]:
#Creating a correlation dataframe
Corr_Clara_Callan = pd.DataFrame(similar_to_Clara_Callan, columns= ['Correlation'])
Corr_Clara_Callan.dropna(inplace= True)
Corr_Clara_Callan.head()                                                                    

Unnamed: 0_level_0,Correlation
Book_Title,Unnamed: 1_level_1
"Jason, Madison &amp",-0.000459
Other Stories;Merril;1985;McClelland &amp,-0.000459
Repairing PC Drives &amp,-0.000459
'48,-0.000459
'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,-0.000459


In [19]:
#Sort the books based on the high correlation with the selected book
Corr_Clara_Callan[Corr_Clara_Callan['Correlation'] > 0].sort_values(by= 'Correlation',ascending = False)

Unnamed: 0_level_0,Correlation
Book_Title,Unnamed: 1_level_1
Clara Callan,1.0
Decision in Normandy,1.0


## Conclusion: (Based on Item based collaborative filtring)

##### If any user read and rated the book "Clara Callan", we can suggest "Decision in Normandy" book to that user.