## Data Preprocessing and Cleaning 

In [16]:
import pandas as pd 
originalData = pd.read_csv('bookData.csv', converters={'genres': lambda x: x[1:-1].split(',')})

In [17]:
keepColumns = [ "description", "genres", "language"]
cleanedData = originalData[keepColumns].copy()
# obtain only books written in english 
cleanedData = cleanedData[cleanedData['language'] == 'English']
cleanedData = cleanedData.drop(columns='language')
cleanedData


Unnamed: 0,description,genres
0,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['Young Adult', 'Fiction', 'Dystopia', 'Fan..."
1,There is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magi..."
2,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction'..."
3,Alternate cover edition of ISBN 9780679783268S...,"['Classics', 'Fiction', 'Romance', 'Histori..."
4,About three things I was absolutely positive.\...,"['Young Adult', 'Fantasy', 'Romance', 'Vamp..."
...,...,...
52473,The Fateful Trilogy continues with Fractured. ...,"['Vampires', 'Paranormal', 'Young Adult', '..."
52474,"'Anasazi', sequel to 'The Thirteenth Chime' by...","['Mystery', 'Young Adult']"
52475,--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,"['Fantasy', 'Young Adult', 'Paranormal', 'A..."
52476,A POWERFUL TREMOR UNEARTHS AN ANCIENT SECRETBu...,"['Fiction', 'Mystery', 'Historical Fiction',..."


In [18]:



for i in range(len(cleanedData['genres'])):
    cleanedData.iloc[i,1] = cleanedData.iloc[i,1][0:3]
    
    '''
    for label in cleanedData.iloc[i, 1]:
         cleanedData['genre1'] = cleanedData.iloc[i,1][0]
         cleanedData['genre2'] = cleanedData.iloc[i,1][1]
         cleanedData['genre3'] = cleanedData.iloc[i,1][2]
    '''

cleanedData.head()

Unnamed: 0,description,genres
0,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['Young Adult', 'Fiction', 'Dystopia']"
1,There is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction']"
2,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction']"
3,Alternate cover edition of ISBN 9780679783268S...,"['Classics', 'Fiction', 'Romance']"
4,About three things I was absolutely positive.\...,"['Young Adult', 'Fantasy', 'Romance']"


In [19]:
cleanedData[['genre1','genre2', 'genre3']] = pd.DataFrame(cleanedData.genres.tolist(), index= cleanedData.index)
cleanedData = cleanedData.drop('genres', axis = 1)
cleanedData

Unnamed: 0,description,genre1,genre2,genre3
0,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,'Young Adult','Fiction','Dystopia'
1,There is a door at the end of a silent corrido...,'Fantasy','Young Adult','Fiction'
2,The unforgettable novel of a childhood in a sl...,'Classics','Fiction','Historical Fiction'
3,Alternate cover edition of ISBN 9780679783268S...,'Classics','Fiction','Romance'
4,About three things I was absolutely positive.\...,'Young Adult','Fantasy','Romance'
...,...,...,...,...
52473,The Fateful Trilogy continues with Fractured. ...,'Vampires','Paranormal','Young Adult'
52474,"'Anasazi', sequel to 'The Thirteenth Chime' by...",'Mystery','Young Adult',
52475,--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,'Fantasy','Young Adult','Paranormal'
52476,A POWERFUL TREMOR UNEARTHS AN ANCIENT SECRETBu...,'Fiction','Mystery','Historical Fiction'


In [20]:
# removing librarians notes in the description 
import re 

#ensuring first that the description is a string 
cleanedData['description'] = cleanedData['description'].astype(str)

def removeLibNote(description):
    pattern = r"[Ll]ibrarian's note\s*:.+?\."
    return re.sub(pattern, '', description)

#apply to dataset
cleanedData['description'] = cleanedData['description'].apply(removeLibNote)

In [21]:
#removing ISBNs in the description 

def removeISBN(description):
    pattern = r"ISBN\s*\d+(?=[a-zA-Z])"
    return re.sub(pattern, '', description)

cleanedData['description'] = cleanedData['description'].apply(removeISBN)

In [22]:
# removing things that say like "new york times bestseller"
def removeNYBest(description):
    pattern = r"(From the)? (#1\s)? New York Times bestselling (author)?"
    return re.sub(pattern, '', description)

cleanedData['description'] = cleanedData['description'].apply(removeNYBest)


In [23]:
# removing 15 occurences of "Also see: Alternate Cover Editions for this ISBN [ACE]"
#pattern = r"((Also see:)|([Tt]his book has) Alternate Cover Editions for this ISBN [ACE])|(Alternative Cover Edition)"
#have to split into multiple cases lmao i got too confused with the long regexes 

def removeAlternate1(description):
    pattern = r"Also see: ([Tt]his book has)? [Aa]lternate [Cc]over [Ee]ditions for this ISBN"
    return re.sub(pattern, '', description)

cleanedData['description'] = cleanedData['description'].apply(removeAlternate1)


In [24]:
#part 2 
def removeAlternate2(description):
    #this pattern is not perfect by any means lmao, there are so many forms of it in the data 
    pattern = r"[Tt]his book has [Aa]lternate [Cc]over [Ee]ditions for this ISBN"
    return re.sub(pattern, '', description)

cleanedData['description'] = cleanedData['description'].apply(removeAlternate2)

In [25]:
def removeAlternate3(description):
    #this pattern is not perfect by any means lmao, there are so many forms of it in the data 
    pattern = r"[Ss]ee an alternate cover edition (here)?"
    return re.sub(pattern, '', description)

cleanedData['description'] = cleanedData['description'].apply(removeAlternate3)

In [26]:
def removeAlternate4(description):
    #this pattern is not perfect by any means lmao, there are so many forms of it in the data 
    pattern = r"[Aa]lternate [Cc]over [Ee]dition(:)?(ISBN)?(:)?\s*\d+(?=[a-zA-Z])"
    return re.sub(pattern, '', description)

cleanedData['description'] = cleanedData['description'].apply(removeAlternate4)


In [27]:
def removeAlternate5(description):
    #this pattern is not perfect by any means lmao, there are so many forms of it in the data 
    pattern = r"ACE"
    return re.sub(pattern, '', description)

cleanedData['description'] = cleanedData['description'].apply(removeAlternate5)

In [30]:
# dropping empty/missing values
print("Shape before dropping NaN values:", cleanedData.shape)
cleanedData.dropna()
print("Shape after dropping NaN values:", cleanedData.shape)

Shape before dropping NaN values: (42661, 4)
Shape after dropping NaN values: (42661, 4)


In [29]:
#preliminary save to view 
cleanedData.to_csv('cleanedData.csv', index=True)