### Importing libraries

In [182]:
import pandas as pd
import numpy as np

### Loading dataset into Pandas dataframe

In [183]:
# Lift the row cap on displayed frames (None = no limit).
# The raw file has ~87k rows, so always slice (.head(), .sample()) before displaying.
pd.options.display.max_rows = None

In [184]:
# Location of the raw, uncleaned Audible export.
# NOTE(review): this is a machine-specific absolute path; point it at your own copy
# of the file before running the notebook elsewhere.
RAW_CSV_PATH = r"C:\Users\cppra\Downloads\Projects\Data Cleaning\audible_data_uncleaned.csv"

# Read the raw file into the frame that the rest of the notebook works from.
data = pd.read_csv(RAW_CSV_PATH)

In [185]:
# Preview the first five raw rows to see how each column is formatted.
data.head(5)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,Writtenby:JeffKinney,Narratedby:DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,Writtenby:RickRiordan,Narratedby:SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",Writtenby:RickRiordan,Narratedby:JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0


In [186]:
# Column names, non-null counts and dtypes of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         87489 non-null  object
 1   author       87489 non-null  object
 2   narrator     87489 non-null  object
 3   time         87489 non-null  object
 4   releasedate  87489 non-null  object
 5   language     87489 non-null  object
 6   stars        87489 non-null  object
 7   price        87489 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


In [187]:
# Summary statistics (count / unique / top / freq) for the raw columns.
data.describe()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
count,87489,87489,87489,87489,87489,87489,87489,87489.0
unique,82767,48374,29717,2284,5058,36,665,1011.0
top,The Art of War,"Writtenby:矢島雅弘,石橋遊",Narratedby:anonymous,2 mins,16-05-18,English,Not rated yet,586.0
freq,20,874,1034,372,773,61884,72417,5533.0


### Checking missing/null values

In [188]:
# Count the missing entries in each column.
data.isnull().sum()

name           0
author         0
narrator       0
time           0
releasedate    0
language       0
stars          0
price          0
dtype: int64

### Data Cleaning

In [189]:
# Work on an independent deep copy so the raw frame stays untouched.
data_copy = data.copy(deep=True)

#### 1. Capitalizing column headers

In [190]:
# Upper-case the first letter of every column label (e.g. 'name' -> 'Name').
data_copy = data_copy.rename(columns=str.capitalize)

#### 2. Removing unwanted content from 'Author' and 'Narrator' columns

In [191]:
# Each of these columns carries a fixed label in front of the actual names;
# strip that label (matched case-insensitively) from every value.
for column, prefix in (('Author', 'Writtenby:'), ('Narrator', 'Narratedby:')):
    data_copy[column] = data_copy[column].str.replace(prefix, '', case=False)

#### 3. Separating first name and last name in 'Author' and 'Narrator' columns

In [192]:
# Insert a space wherever a lowercase letter is directly followed by an uppercase
# one, so run-together names such as 'RickRiordan' become 'Rick Riordan'.
# regex=True must be passed explicitly: the pattern is a zero-width regex, and
# pandas >= 2.0 treats str.replace patterns as literal text by default, which
# would leave the column unchanged (older versions emit a FutureWarning instead).
data_copy['Author'] = data_copy['Author'].str.replace(
    r'(?<=[a-z])(?=[A-Z])', ' ', regex=True
)

  data_copy['Author'] = data_copy['Author'].str.replace(r'(?<=[a-z])(?=[A-Z])', ' ')


In [193]:
# Insert a space wherever a lowercase letter is directly followed by an uppercase
# one, so run-together names such as 'BillLobely' become 'Bill Lobely'.
# regex=True must be passed explicitly: the pattern is a zero-width regex, and
# pandas >= 2.0 treats str.replace patterns as literal text by default, which
# would leave the column unchanged (older versions emit a FutureWarning instead).
data_copy['Narrator'] = data_copy['Narrator'].str.replace(
    r'(?<=[a-z])(?=[A-Z])', ' ', regex=True
)

  data_copy['Narrator'] = data_copy['Narrator'].str.replace(r'(?<=[a-z])(?=[A-Z])', ' ')


In [194]:
# Check the first five rows after cleaning the 'Author' and 'Narrator' columns.
data_copy.head(5)

Unnamed: 0,Name,Author,Narrator,Time,Releasedate,Language,Stars,Price
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Rick Riordan,Robbie Daymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,Jeff Kinney,Dan Russell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,Rick Riordan,Soneela Nankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",Rick Riordan,Jesse Bernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0


#### 4. Renaming the 'Releasedate' column for better readability and converting the 'Release Date' and 'Language' columns to more suitable datatypes


In [195]:
# Give the release-date column a more readable, two-word label.
data_copy = data_copy.rename(columns={'Releasedate': 'Release Date'})

In [196]:
# The raw dates are day-first with a two-digit year (e.g. '13-01-10' is 13 Jan 2010).
# Parse them with an explicit format: letting pandas guess reads ambiguous values
# such as '04-08-08' month-first (8 April instead of 4 August).
# Any value that does not match the format raises instead of being silently misparsed.
data_copy['Release Date'] = pd.to_datetime(data_copy['Release Date'], format='%d-%m-%y')

# 'Language' has only a few dozen distinct values, so store it as a categorical.
data_copy['Language'] = data_copy['Language'].astype('category')

#### 5. Capitalizing all the languages


In [197]:
# Normalise the spelling of every language name (first letter upper-case, rest lower).
# .str.capitalize() returns a plain object Series even for categorical input, so
# cast back to 'category' afterwards; this also merges categories that differed
# only by letter case.
data_copy['Language'] = data_copy['Language'].str.capitalize().astype('category')

#### 6. Separating the star rating and the number of ratings from the 'Stars' column, forming 2 new columns 'Rating' and 'Number of ratings'

In [198]:
# Raw 'Stars' values look like '4.5 out of 5 stars41 ratings'; unrated titles hold
# 'Not rated yet', which matches neither pattern and therefore becomes NaN.
stars = data_copy['Stars']

# Capture the whole score including an optional decimal part. A bare r'(\d+)'
# would match only the digits after the decimal point and turn '4.5' into 5.
rating_text = stars.str.extract(r'(\d+(?:\.\d+)?) out of 5 stars', expand=False)

# Capture the count that directly follows 'stars'. Allow thousands separators and
# the singular 'rating' so such rows are not truncated or dropped.
count_text = stars.str.extract(r'stars\s*([\d,]+) ratings?', expand=False)

data_copy['Rating'] = pd.to_numeric(rating_text, errors='coerce')
data_copy['Number of ratings'] = pd.to_numeric(
    count_text.str.replace(',', '', regex=False), errors='coerce'
)

# The combined text column is no longer needed once both parts are extracted.
data_copy = data_copy.drop(columns=['Stars'])

#### 7. Removing commas from the 'Price' column and setting the price of all 'Free' books to 0

In [199]:
# Books listed as 'Free' get the text '0' so the column can be parsed as a number later.
is_free = data_copy['Price'].eq('Free')
data_copy['Price'] = data_copy['Price'].mask(is_free, '0')

In [200]:
# Strip the thousands separators first, then parse the remaining text as numbers;
# anything that still fails to parse becomes NaN.
price_without_commas = data_copy['Price'].str.replace(',', '')
data_copy['Price'] = pd.to_numeric(price_without_commas, errors='coerce')

#### 8. Filling all null values in the 'Rating' and 'Number of ratings' columns with 0

In [201]:
# Titles without any ratings have NaN in both columns; treat them as 0.
# Fill with the number 0, not the string '0': a string would turn these numeric
# columns into mixed-type object columns.
columns_to_fill = ['Rating', 'Number of ratings']
data_copy[columns_to_fill] = data_copy[columns_to_fill].fillna(0)

#### 9. Saving the dataframe into a new CSV file

In [202]:
# Write the cleaned frame next to the notebook; the row index is not saved.
CLEANED_CSV_PATH = 'audible_data_cleaned.csv'
data_copy.to_csv(path_or_buf=CLEANED_CSV_PATH, index=False)