# Data Cleaning and Preprocessing #

## Loading the Dataset

In [1]:
# pandas supplies the DataFrame type used for every step of this notebook
import pandas as pd

# Read the raw records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Car.csv')

# Preview the first five rows as a sanity check that the file loaded correctly
df.head(n=5)

Unnamed: 0,OBJECTID,Bay_ID,Street,Suburb,Operator,Status,Date,Year,Month,Hours_Available,Hours_Booked,No_Bookings,Trip_Distance
0,1,17.0,Manning Street,Potts Point,GoGet,Installed,2019-10,2019,10.0,724.0,91.0,29.0,747.0
1,2,17.0,Manning Street,Potts Point,GoGet,Installed,2019-11,2019,11.0,719.0,148.0,36.0,948.0
2,3,17.0,Manning Street,Potts Point,GoGet,Installed,2019-12,2019,12.0,743.0,205.0,53.0,1167.0
3,4,18.0,Liverpool Street,Darlinghurst,GoGet,Installed,2019-01,2019,1.0,743.0,71.0,22.0,409.0
4,5,18.0,Liverpool Street,Darlinghurst,GoGet,Installed,2019-02,2019,2.0,668.0,68.0,22.0,445.0


In [3]:
# Preview the last five rows to confirm the file was read through to its end
df.tail(n=5)

Unnamed: 0,OBJECTID,Bay_ID,Street,Suburb,Operator,Status,Date,Year,Month,Hours_Available,Hours_Booked,No_Bookings,Trip_Distance
39394,39395,17.0,Manning Street,Potts Point,GoGet,Installed,2019-05,2019,5.0,726.0,102.0,31.0,818.0
39395,39396,17.0,Manning Street,Potts Point,GoGet,Installed,2019-06,2019,6.0,720.0,165.0,48.0,1602.0
39396,39397,17.0,Manning Street,Potts Point,GoGet,Installed,2019-07,2019,7.0,744.0,118.0,37.0,1479.0
39397,39398,17.0,Manning Street,Potts Point,GoGet,Installed,2019-08,2019,8.0,742.0,160.0,38.0,1192.0
39398,39399,17.0,Manning Street,Potts Point,GoGet,Installed,2019-09,2019,9.0,684.0,116.0,21.0,842.0


## Identifying missing values

In [5]:
# Count the NaN entries column by column
missing_values = df.isna().sum()

# Show only the columns that contain at least one NaN
print(missing_values.loc[missing_values.gt(0)])

Bay_ID               2
Status               1
Date                 2
Month                2
Hours_Available    644
Hours_Booked       615
No_Bookings        471
Trip_Distance      595
dtype: int64


## Handling missing values

In [9]:
# Three alternative treatments of the NaN cells are prepared in this cell.
# The two whole-frame alternatives (zero-fill and row-drop) are taken from `df`
# BEFORE it is imputed below: calling dropna() after the fills would find
# nothing left to drop and would simply return a copy of the imputed frame.

# Alternative 1: every NaN replaced with 0, regardless of column type
data_filled = df.fillna(0)

# Alternative 2: rows containing at least one NaN removed entirely
data_cleaned = df.dropna()

# Main path: impute `df` itself, because the duplicate check further down
# continues to operate on `df`.
# Numeric columns receive their own column mean (mean() skips NaN by default).
# 'number' selects every numeric dtype rather than only float64/int64.
# NOTE(review): this also mean-fills identifier-like numeric columns such as
# Bay_ID and Month, which can yield fractional values — confirm this is intended.
numeric_columns = df.select_dtypes(include='number').columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Every remaining (non-numeric) column receives the placeholder "Unknown"
non_numeric_columns = df.select_dtypes(exclude='number').columns
df[non_numeric_columns] = df[non_numeric_columns].fillna('Unknown')

## Identifying Redundancy

In [12]:
# Flag every row that exactly repeats an earlier row of the dataset
duplicates = df.duplicated(keep='first')

# Build the de-duplicated frame, retaining the first occurrence of each row
data_cleaned = df.drop_duplicates(keep='first')

# Report how many repeated rows were flagged
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 0


In [None]:
## Identifica