In [2]:
import pandas as pd

# Read the books_cleaned.csv file (path is relative to the notebook's working
# directory) into the DataFrame used throughout the notebook.
df = pd.read_csv(
    "books_cleaned.csv",
    encoding="utf-8",
)

In [3]:
# Print a per-column summary (non-null counts and dtypes) to see which
# columns still contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6397 entries, 0 to 6396
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   isbn13              6397 non-null   int64  
 1   authors             6397 non-null   object 
 2   categories          6364 non-null   object 
 3   thumbnail           6190 non-null   object 
 4   description         6397 non-null   object 
 5   published_year      6397 non-null   float64
 6   average_rating      6397 non-null   float64
 7   num_pages           6397 non-null   float64
 8   ratings_count       6397 non-null   float64
 9   title_and_subtitle  6397 non-null   object 
 10  tagged_description  6397 non-null   object 
dtypes: float64(4), int64(1), object(6)
memory usage: 549.9+ KB


In [4]:
# Count how many books carry each raw category label. The long tail of labels
# that occur only once shows there are too many categories to use as-is.
category_counts = df["categories"].value_counts()
category_counts.reset_index()

Unnamed: 0,categories,count
0,Fiction,2491
1,Juvenile Fiction,519
2,Biography & Autobiography,388
3,History,255
4,Literary Criticism,163
...,...,...
520,Humorous stories,1
521,Ballets,1
522,Aged women,1
523,Imperialism,1


In [5]:
# Sanity check: show any rows whose description is shorter than 25 characters.
has_short_description = df["description"].str.len() < 25
df[has_short_description]

Unnamed: 0,isbn13,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description


In [6]:
# The raw "categories" column has too many distinct values, so build a
# zero-shot classifier to map each book onto a smaller, more manageable set of
# labels (e.g. fiction vs. non-fiction).
from transformers import pipeline

classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [7]:
# Candidate labels offered to the zero-shot classifier.
fiction_categories = [
    "fiction",
    "mystery",
    "romance",
    "scifi",
    "fantasy",
    "biography",
    "history",
]


def _classify_row(row):
    """Score one book's description against every candidate label independently."""
    return classifier(
        row["description"],
        candidate_labels=fiction_categories,
        multi_label=True,
    )


# Try the classifier on the first 10 books only; each result dict is expanded
# into its own columns so labels and scores can be inspected side by side.
sample_predictions = df.head(10).apply(_classify_row, axis=1, result_type="expand")
sample_predictions.rename(columns={"labels": "categories", "scores": "scores"})

Unnamed: 0,sequence,categories,scores
0,A NOVEL THAT READERS and critics have been eag...,"[fiction, history, biography, fantasy, mystery...","[0.8558421730995178, 0.6128803491592407, 0.296..."
1,A new 'Christie for Christmas' -- a full-lengt...,"[mystery, fiction, fantasy, scifi, biography, ...","[0.9339157342910767, 0.5139176249504089, 0.155..."
2,Volume Two of Stephen Donaldson's acclaimed se...,"[fiction, fantasy, history, scifi, biography, ...","[0.5638813972473145, 0.2660749554634094, 0.249..."
3,"A memorable, mesmerizing heroine Jennifer -- b...","[scifi, biography, fiction, history, romance, ...","[0.19755955040454865, 0.09938773512840271, 0.0..."
4,Lewis' work on the nature of love divides love...,"[mystery, romance, history, biography, scifi, ...","[0.16078977286815643, 0.06188512220978737, 0.0..."
5,"""In The Problem of Pain, C.S. Lewis, one of th...","[mystery, history, biography, scifi, romance, ...","[0.6848734021186829, 0.11091233044862747, 0.08..."
6,Until Vasco da Gama discovered the sea-route t...,"[history, mystery, biography, scifi, fiction, ...","[0.9738430976867676, 0.19697055220603943, 0.18..."
7,A new-cover reissue of the fourth book in the ...,"[scifi, fantasy, fiction, mystery, history, ro...","[0.9945376515388489, 0.9806752800941467, 0.934..."
8,Kate Blackwell is an enigma and one of the mos...,"[mystery, biography, fiction, scifi, history, ...","[0.9990025162696838, 0.43301281332969666, 0.04..."
9,One of Sidney Sheldon's most popular and bests...,"[romance, mystery, biography, fantasy, scifi, ...","[0.6518456935882568, 0.4315004348754883, 0.367..."


In [8]:
# Preview the first five rows to recall which columns are available.
df.head(n=5)

Unnamed: 0,isbn13,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006163831,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0,The One Tree,9780006163831 Volume Two of Stephen Donaldson'...
3,9780006178736,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
4,9780006280897,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...


In [None]:
from googlesearch import search

def fetch_first_google_link(query):
    """Run a Google search for ``query`` and return the top hit.

    Args:
        query: Search string passed unchanged to ``search``.

    Returns:
        A list with the URL(s) yielded by ``search`` when called with
        ``num_results=1`` and ``lang="en"``, or ``None`` when the search
        produced no results.
    """
    # Materialize the results before checking for emptiness: a lazy
    # iterator/generator object is always truthy, so testing it directly
    # would never reach the ``None`` branch and an empty search would leak
    # out as ``[]``.
    links = list(search(query, num_results=1, lang="en") or ())
    return links or None




In [25]:
# Smoke-test the lookup with one title/author pair from the dataset; the
# "-google" and "books" terms are passed through as part of the query string.
sample_query = "The One Tree by Stephen R. Donaldson -google books"
print(fetch_first_google_link(sample_query))

['https://books.google.com/books/about/The_One_Tree.html?id=dXzwAAAAQBAJ&source=kp_cover']
