In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import statistics as st
import warnings
warnings.filterwarnings("ignore")
sns.set(rc={"figure.figsize":(15,6)})
pd.pandas.set_option("display.max_columns",None)

In [2]:
data = pd.read_csv("https://gist.githubusercontent.com/jaidevd/23aef12e9bf56c618c41/raw/c05e98672b8d52fa0cb94aad80f75eb78342e5d4/books.csv")

In [3]:
data.head()

Unnamed: 0,Title,Author,Genre,Height,Publisher
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,228,Wiley
1,Data Smart,"Foreman, John",data_science,235,Wiley
2,God Created the Integers,"Hawking, Stephen",mathematics,197,Penguin
3,Superfreakonomics,"Dubner, Stephen",economics,179,HarperCollins
4,Orientalism,"Said, Edward",history,197,Penguin


In [4]:
data.tail()

Unnamed: 0,Title,Author,Genre,Height,Publisher
206,Structure and Randomness,"Tao, Terence",mathematics,252,
207,Image Processing with MATLAB,"Eddins, Steve",signal_processing,241,
208,Animal Farm,"Orwell, George",fiction,180,
209,"Idiot, The","Dostoevsky, Fyodor",fiction,197,
210,"Christmas Carol, A","Dickens, Charles",fiction,196,


In [5]:
data.shape

(211, 5)

In [6]:
data.duplicated().sum()

0

In [7]:
data.drop("Publisher",axis=1,inplace=True)

In [13]:
data["Author"] = data["Author"].fillna("Steinbeck, John ")

In [14]:
data["tags"] = data["Author"] + data["Genre"]

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   211 non-null    object
 1   Author  211 non-null    object
 2   Genre   211 non-null    object
 3   Height  211 non-null    int64 
 4   tags    211 non-null    object
dtypes: int64(1), object(4)
memory usage: 8.4+ KB


In [16]:
data.isnull().sum()

Title     0
Author    0
Genre     0
Height    0
tags      0
dtype: int64

In [19]:
data["tags"] = data["tags"].apply(lambda x:x.lower())

In [20]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [21]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [24]:
data["tags"]= data["tags"].apply(stem)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english")

In [28]:
vactors = cv.fit_transform(data["tags"]).toarray()

In [31]:
vactors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
cv.get_feature_names()

['abrahamhistori',
 'ackroyd',
 'aczel',
 'adameconom',
 'adolfnonfict',
 'albertfict',
 'aldousfict',
 'alexhistori',
 'alfredeconom',
 'allendata_sci',
 'allenfict',
 'amartyaeconom',
 'amartyanonfict',
 'amartyaphilosophi',
 'amirsci',
 'amitavfict',
 'andrewcomputer_sci',
 'andycomputer_sci',
 'archer',
 'arthur',
 'aynfict',
 'aynphilosophi',
 'bach',
 'baz',
 'bbcnonfict',
 'bell',
 'bertrandphilosophi',
 'bobhistori',
 'bobnonfict',
 'bodanis',
 'bradsky',
 'braithwaitefict',
 'brown',
 'camus',
 'capra',
 'carlscienc',
 'cedricmathemat',
 'charlesfict',
 'comic',
 'conanfict',
 'conway',
 'corbett',
 'cormen',
 'crichton',
 'dalrymple',
 'danfict',
 'daviddata_sci',
 'davidsci',
 'dawkins',
 'deb',
 'deshpand',
 'deshpande',
 'devlin',
 'dickens',
 'dickinson',
 'dominiquefict',
 'dominiquehistori',
 'dostoevsky',
 'downey',
 'doyle',
 'drewdata_sci',
 'drucker',
 'dubner',
 'duda',
 'durant',
 'durrell',
 'dylan',
 'earl',
 'econom',
 'eddins',
 'edgar',
 'edwardhistori',
 'er

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(vactors)

In [54]:
similarities.shape

(211, 211)

In [59]:
similarities[1]

array([0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.

In [None]:
def recommend(book):
    book

In [52]:
data

Unnamed: 0,Title,Author,Genre,Height,tags
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,228,"goswami, jaidevasignal_process"
1,Data Smart,"Foreman, John",data_science,235,"foreman, johndata_sci"
2,God Created the Integers,"Hawking, Stephen",mathematics,197,"hawking, stephenmathemat"
3,Superfreakonomics,"Dubner, Stephen",economics,179,"dubner, stepheneconom"
4,Orientalism,"Said, Edward",history,197,"said, edwardhistori"
...,...,...,...,...,...
206,Structure and Randomness,"Tao, Terence",mathematics,252,"tao, terencemathemat"
207,Image Processing with MATLAB,"Eddins, Steve",signal_processing,241,"eddins, stevesignal_process"
208,Animal Farm,"Orwell, George",fiction,180,"orwell, georgefict"
209,"Idiot, The","Dostoevsky, Fyodor",fiction,197,"dostoevsky, fyodorfict"
