In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
from html import unescape

In [2]:
df = pd.read_csv("./data/book.txt", 
                   sep='\t', 
                   lineterminator='\n', 
                   header=None, 
                   names=["source", "isbn", "title", "authors"])

In [3]:
df.head()

Unnamed: 0,source,isbn,title,authors
0,eCampus.com,201853949,"The art Of Computer Programming, Fascicle 3: G...",Not Available\r
1,Indoo.com,201853949,"Art of Computer Programming, Volume 4, Fascicl...","Knuth, Donald E.\r"
2,textbookxdotcom,201853949,"The 'art Of Computer Programming, Fascicle 3 G...",\r
3,A1Books,201853949,"The Art of Computer Programming, Volume 4, Fas...","Knuth, Donald E.\r"
4,textbooksNow,201853949,Art of Computer Programming,Knuth\r


In [4]:
df.describe()

Unnamed: 0,source,isbn,title,authors
count,33971,33971,33968,33971
unique,895,1265,11095,9627
top,A1Books,321263588,Modern Database Management,\r
freq,2403,159,90,713


# Preprocessing

In [5]:
def clean(df):
    # Unescape HTML left over from scraping
    df.authors = df.authors.apply(unescape)
    df.title = df.title.apply(lambda x: unescape(str(x)))
    # Lowercase 
    df["authors"] = df["authors"].str.lower()
    df["title"] = df["title"].str.lower()
    df["source"] = df["source"].str.lower()
    # Removes parenthesis from author list
    df["authors"] = df["authors"].str.replace('\((.*?)\)','')
    # Replaces | in author list with spaces
    df["authors"] = df["authors"].str.replace('\|',' ')
    # Special characters removal (removes all characters except )
    #df["title"] = df["title"].str.replace('[^\w\s]|\\r','')
    #df["authors"] = df["authors"].str.replace('[^\w\s]|\\r','')
    # Uniformize missing values
    df["authors"] = df["authors"].str.replace('not available','')
    df["authors"] = df["authors"].replace(r'^\s*$', np.nan, regex=True)
    # Remove doubled whitespaces
    df.authors = df.authors.replace('\s+', ' ', regex=True)
    df.title = df.title.replace('\s+', ' ', regex=True)
    

In [6]:
clean(df)

In [7]:
df.isnull().sum()

source       0
isbn         0
title        0
authors    743
dtype: int64

In [8]:
df.describe()

Unnamed: 0,source,isbn,title,authors
count,33971,33971,33971,33228
unique,894,1265,9255,8603
top,a1books,321263588,computer networking and the internet,"meyers, scott"
freq,2403,159,108,128


In [9]:
df.loc[(df.title.str.len() < 2)]

Unnamed: 0,source,isbn,title,authors
