## Data Preprocessing and Cleaning 

In [41]:
import pandas as pd 
# Load the raw book table; the path is relative to the current working directory.
originalData = pd.read_csv(filepath_or_buffer='bookData.csv')

In [42]:
# 'language' is kept only long enough to filter on it.
keepColumns = ["title", "description", "genres", "language"]
# Select English-language rows and the wanted columns in one step, then drop
# the language column, which is constant after the filter.
cleanedData = (
    originalData
    .loc[originalData['language'] == 'English', keepColumns]
    .drop(columns='language')
)
cleanedData


Unnamed: 0,title,description,genres
0,The Hunger Games,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas..."
1,Harry Potter and the Order of the Phoenix,There is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',..."
2,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ..."
3,Pride and Prejudice,Alternate cover edition of ISBN 9780679783268S...,"['Classics', 'Fiction', 'Romance', 'Historical..."
4,Twilight,About three things I was absolutely positive.\...,"['Young Adult', 'Fantasy', 'Romance', 'Vampire..."
...,...,...,...
52473,Fractured,The Fateful Trilogy continues with Fractured. ...,"['Vampires', 'Paranormal', 'Young Adult', 'Rom..."
52474,Anasazi,"'Anasazi', sequel to 'The Thirteenth Chime' by...","['Mystery', 'Young Adult']"
52475,Marked,--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange..."
52476,Wayward Son,A POWERFUL TREMOR UNEARTHS AN ANCIENT SECRETBu...,"['Fiction', 'Mystery', 'Historical Fiction', '..."


### Note to self: it would be good to make a dataset at the end with only each book's top three genres

In [43]:
# removing librarian's notes from the description
import re 

# Ensure every description is a string before the regex cleaners run.
# Missing values are replaced with '' first: astype(str) on its own would turn
# NaN into the literal text 'nan', which would then look like a real description.
cleanedData['description'] = cleanedData['description'].fillna('').astype(str)

def removeLibNote(description):
    """Remove a "Librarian's note: ..." sentence from a description.

    The match starts at the label and ends at the first period that follows it.
    The label is accepted with upper- or lower-case "L" and "N" and with either
    a straight or a curly apostrophe.

    Parameters:
        description: the description text (a str).

    Returns:
        The text with every matched note removed. Notes that contain no period,
        or whose first period is on a later line, are left untouched because
        ``.`` does not match a newline.
    """
    # .+? is lazy so only the note's own first sentence is consumed.
    pattern = r"[Ll]ibrarian['’]s [Nn]ote\s*:.+?\."
    return re.sub(pattern, '', description)

# run the librarian-note cleaner over every description, element by element
cleanedData['description'] = cleanedData['description'].map(removeLibNote)

In [44]:
#removing ISBNs in the description 

def removeISBN(description):
    """Remove "ISBN <digits>" where the digits run straight into a letter.

    Only ISBNs immediately followed by an ASCII letter are removed (the
    look-ahead requires it); the letter itself is kept. An ISBN followed by
    whitespace, punctuation or the end of the text is left in place.

    Parameters:
        description: the description text (a str).

    Returns:
        The text with each matching "ISBN <digits>" run deleted.
    """
    isbnBeforeText = re.compile(r"ISBN\s*\d+(?=[a-zA-Z])")
    return isbnBeforeText.sub('', description)

# strip fused ISBN references from every description
cleanedData['description'] = cleanedData['description'].map(removeISBN)

In [45]:
# removing phrases like "New York Times bestselling author"
def removeNYBest(description):
    """Remove "New York Times bestselling" marketing phrases from a description.

    Handles the optional lead-in "From the", the optional rank "#1" and the
    optional trailing word "author", e.g.
    "From the #1 New York Times bestselling author ". Whitespace after the
    phrase is removed with it so the surrounding words stay single-spaced.

    The previous pattern had literal spaces between its optional groups, so it
    only matched when the text contained doubled spaces and missed the normal
    single-spaced phrase.

    Parameters:
        description: the description text (a str).

    Returns:
        The text with every matching phrase removed.
    """
    # Each optional part carries its own separator whitespace, so skipping the
    # part does not leave a mandatory space behind. \b stops "author" from
    # eating the start of "authors".
    pattern = (
        r"(?:From the\s+)?(?:#1\s+)?"
        r"New York Times bestselling"
        r"(?:\s+author\b)?\s*"
    )
    return re.sub(pattern, '', description)

# strip the bestseller marketing phrase from every description
cleanedData['description'] = cleanedData['description'].map(removeNYBest)


In [46]:
# removing 15 occurrences of "Also see: Alternate Cover Editions for this ISBN [ACE]"
#pattern = r"((Also see:)|([Tt]his book has) Alternate Cover Editions for this ISBN [ACE])|(Alternative Cover Edition)"
#have to split into multiple cases lmao i got too confused with the long regexes 

def removeAlternate1(description):
    """Remove "Also see: Alternate Cover Editions for this ISBN [ACE]" notices.

    Also accepts the variant "Also see: This book has alternate cover editions
    for this ISBN". A trailing "[ACE]" tag, when present, is removed together
    with the notice.

    The previous pattern required a space on both sides of the optional
    "This book has" group, so the plain single-spaced notice never matched.

    Parameters:
        description: the description text (a str).

    Returns:
        The text with every matching notice removed.
    """
    pattern = (
        r"Also see:\s*(?:[Tt]his book has\s+)?"
        r"[Aa]lternate [Cc]over [Ee]ditions for this ISBN"
        r"(?:\s*\[ACE\])?"
    )
    return re.sub(pattern, '', description)

# strip the "Also see: Alternate Cover Editions ..." notice from every description
cleanedData['description'] = cleanedData['description'].map(removeAlternate1)


In [47]:
#part 2 
def removeAlternate2(description):
    """Remove "this book has alternate cover editions for this ISBN" notices.

    The first letter of "this", "alternate", "cover" and "editions" may be
    upper or lower case; everything else must match exactly. Other wordings of
    the notice are not covered by this function.

    Parameters:
        description: the description text (a str).

    Returns:
        The text with every matching notice removed.
    """
    return re.sub(
        pattern=r"[Tt]his book has [Aa]lternate [Cc]over [Ee]ditions for this ISBN",
        repl='',
        string=description,
    )

# strip the "this book has alternate cover editions ..." notice from every description
cleanedData['description'] = cleanedData['description'].map(removeAlternate2)

In [48]:
def removeAlternate3(description):
    """Remove "see an alternate cover edition [here]" notices.

    The phrase is removed whether it is followed by " here", by a single
    space, by punctuation, or by the end of the text. The previous pattern
    required a space after "edition", so "See an alternate cover edition."
    and a notice at the very end of the text were missed.

    Parameters:
        description: the description text (a str).

    Returns:
        The text with every matching notice removed.
    """
    # The trailing space and "here" are both optional; whenever the old pattern
    # matched, this one removes exactly the same span.
    pattern = r"[Ss]ee an alternate cover edition(?: (?:here)?)?"
    return re.sub(pattern, '', description)

# strip the "see an alternate cover edition" notice from every description
cleanedData['description'] = cleanedData['description'].map(removeAlternate3)

In [51]:
def removeAlternate4(description):
    """Remove "Alternate cover edition[:] [ISBN][:] <digits>" prefixes.

    Optional whitespace is accepted around the optional ":" and "ISBN" tokens,
    so "Alternate Cover Edition ISBN: 0679783261The ..." is matched as well as
    the fused form "Alternate cover edition:ISBN:123The ...". The previous
    pattern allowed no whitespace between "edition" and "ISBN".

    As before, the digits must be followed immediately by an ASCII letter (the
    start of the fused description); that letter is kept.

    Parameters:
        description: the description text (a str).

    Returns:
        The text with every matching prefix removed.
    """
    pattern = (
        r"[Aa]lternate [Cc]over [Ee]dition"
        r"(?:\s*:)?(?:\s*ISBN)?(?:\s*:)?"
        r"\s*\d+(?=[a-zA-Z])"
    )
    return re.sub(pattern, '', description)

# strip "Alternate cover edition ... <digits>" prefixes from every description
cleanedData['description'] = cleanedData['description'].map(removeAlternate4)


In [None]:
def removeAlternate5(description):
    """Remove the "ACE" (alternate cover edition) tag from a description.

    Removes "[ACE]" and "(ACE)" including their brackets, and a bare "ACE"
    only when it stands alone as a token. The previous pattern deleted the
    letters "ACE" anywhere, which corrupted upper-case words such as "PEACE",
    "SPACE" and "PLACE" and left empty brackets behind.

    Parameters:
        description: the description text (a str).

    Returns:
        The text with every matching tag removed.
    """
    # The bracketed forms come first so the brackets are consumed with the tag.
    pattern = r"\[ACE\]|\(ACE\)|\bACE\b"
    return re.sub(pattern, '', description)

# strip leftover "ACE" tags from every description
cleanedData['description'] = cleanedData['description'].map(removeAlternate5)

In [52]:
# Preliminary save for manual inspection; the index is written as the first
# column of the CSV file.
cleanedData.to_csv(path_or_buf='cleanedData.csv', index=True)