In [1]:
#Importing necessary libraries
import pandas as pd


# Read the raw book listing into the working DataFrame used by every later cell.
# NOTE(review): prices in the preview start with 'Â£', which looks like a
# mis-decoded '£' — confirm the encoding books.csv was written with.
df = pd.read_csv("books.csv")
df.head()  # preview the first rows (head() defaults to 5)

Unnamed: 0,title,price,stock,rating,link,category
0,A Light in the Attic,Â£51.77,22,Three,http://books.toscrape.com/catalogue/a-light-in...,Poetry
1,Tipping the Velvet,Â£53.74,20,One,http://books.toscrape.com/catalogue/tipping-th...,Historical Fiction
2,Soumission,Â£50.10,20,One,http://books.toscrape.com/catalogue/soumission...,Fiction
3,Sharp Objects,Â£47.82,20,Four,http://books.toscrape.com/catalogue/sharp-obje...,Mystery
4,Sapiens: A Brief History of Humankind,Â£54.23,20,Five,http://books.toscrape.com/catalogue/sapiens-a-...,History


In [2]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info() #obtaining summary of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     1000 non-null   object
 1   price     1000 non-null   object
 2   stock     1000 non-null   int64 
 3   rating    1000 non-null   object
 4   link      1000 non-null   object
 5   category  1000 non-null   object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB


In [3]:
# Count distinct values per column; a count below the number of rows means
# that column contains repeated values.
df.nunique() # understand unique and duplicated values across the columns

title        999
price        903
stock         21
rating         5
link        1000
category      50
dtype: int64

In [4]:
# Inspect the rows that share a title before deciding whether to deduplicate.
# keep=False flags every occurrence of a repeated title, not only the later ones,
# so the copies can be compared side by side.
is_repeated_title = df["title"].duplicated(keep=False)
duplicate_titles = df.loc[is_repeated_title]
print(duplicate_titles)

                      title    price  stock rating  \
236  The Star-Touched Queen  Â£46.02     14   Five   
358  The Star-Touched Queen  Â£32.30     12   Five   

                                                  link category  
236  http://books.toscrape.com/catalogue/the-star-t...  Fantasy  
358  http://books.toscrape.com/catalogue/the-star-t...  Fantasy  


In [7]:
# Remove fields that are irrelevant for the analysis.
# errors="ignore" makes this cell safe to re-run: once 'link' has been dropped,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=["link"], errors="ignore")
df.head(5)

Unnamed: 0,title,price,stock,rating,category
0,A Light in the Attic,Â£51.77,22,Three,Poetry
1,Tipping the Velvet,Â£53.74,20,One,Historical Fiction
2,Soumission,Â£50.10,20,One,Fiction
3,Sharp Objects,Â£47.82,20,Four,Mystery
4,Sapiens: A Brief History of Humankind,Â£54.23,20,Five,History


In [9]:
# Extract the numeric part of each price (dropping the currency prefix) and
# convert it to float.
# - expand=False makes str.extract return a Series, so a Series (not a
#   one-column DataFrame) is assigned back to the column.
# - The pattern requires at least one digit, so text with no number in it
#   (e.g. a lone '.') becomes NaN instead of crashing the float conversion.
df["price"] = (
    df["price"]
    .astype(str)
    .str.extract(r"(\d*\.?\d+)", expand=False)
    .astype(float)
)
df.head(5)

Unnamed: 0,title,price,stock,rating,category
0,A Light in the Attic,51.77,22,Three,Poetry
1,Tipping the Velvet,53.74,20,One,Historical Fiction
2,Soumission,50.1,20,One,Fiction
3,Sharp Objects,47.82,20,Four,Mystery
4,Sapiens: A Brief History of Humankind,54.23,20,Five,History


In [10]:
# List the distinct rating labels so the text-to-number conversion can cover all of them.
rating_labels = df["rating"].unique()
print(rating_labels)

['Three' 'One' 'Four' 'Five' 'Two']


In [11]:
# Rating conversion: turn the spelled-out star ratings into integers.
# Series.replace with a dict swaps whole cell values only, so a label is never
# rewritten through a partial (substring) match, and the result does not depend
# on the regex default of .str.replace, which differs between pandas versions.
# Values that are already digits pass through unchanged (the cell can be
# re-run); any other unexpected label makes astype(int) raise instead of being
# converted silently.
RATING_WORD_TO_DIGIT = {
    "One": "1",
    "Two": "2",
    "Three": "3",
    "Four": "4",
    "Five": "5",
}
df["rating"] = (
    df["rating"]
    .astype(str)
    .str.strip()
    .replace(RATING_WORD_TO_DIGIT)
    .astype(int)
)
df.head(5)

Unnamed: 0,title,price,stock,rating,category
0,A Light in the Attic,51.77,22,3,Poetry
1,Tipping the Velvet,53.74,20,1,Historical Fiction
2,Soumission,50.1,20,1,Fiction
3,Sharp Objects,47.82,20,4,Mystery
4,Sapiens: A Brief History of Humankind,54.23,20,5,History


In [12]:
 # Title cleaning: collapse runs of whitespace into a single space and trim both ends.
# Special characters are left untouched; only whitespace is normalised here.
df["title"] = df["title"].str.replace(r"\s+", " ", regex=True).str.strip()

In [13]:
# Force stock to a numeric column: entries that cannot be parsed become missing
# (errors="coerce"), and the nullable "Int64" dtype keeps the rest as integers.
stock_numeric = pd.to_numeric(df["stock"], errors="coerce")
df["stock"] = stock_numeric.astype("Int64")

In [14]:
# Data validation: stop the notebook if any essential column has a missing value.
essential_columns = ["title", "price", "rating"]
has_missing_essentials = df[essential_columns].isna().values.any()
if has_missing_essentials:
    raise ValueError("Missing essential values in dataset.")

In [15]:
# Display the cleaned frame as a final visual check before saving
# (pandas abbreviates long frames, showing the first and last rows).
df

Unnamed: 0,title,price,stock,rating,category
0,A Light in the Attic,51.77,22,3,Poetry
1,Tipping the Velvet,53.74,20,1,Historical Fiction
2,Soumission,50.10,20,1,Fiction
3,Sharp Objects,47.82,20,4,Mystery
4,Sapiens: A Brief History of Humankind,54.23,20,5,History
...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,55.53,1,1,Classics
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,1,4,Sequential Art
997,A Spy's Devotion (The Regency Spies of London #1),16.97,1,5,Historical Fiction
998,1st to Die (Women's Murder Club #1),53.98,1,1,Mystery


In [16]:
# Persist the cleaned dataset; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="books_clean.csv", index=False)