# CleanOps Data Cleaning Demo

A demonstration of data cleaning with the CleanOps toolkit: detecting duplicates, missing values, and outliers in a dataset, fixing duplicate values, and exporting the cleaned result.

# Import Modules

Import pandas and the CleanOps classes. This demo uses DataGetter, DataInspector, DataCleaner, and DataExporter; the remaining classes are imported but not used below.

In [1]:
import pandas as pd
from cleanops import DataGetter, DataInspector, DataCleaner, DataOrganizer
from cleanops import DataExporter, ReportGenerator, DataOutput, DataPipeline

# Load Dataset
Load the CSV file using DataGetter

In [3]:
# Point DataGetter at the datasets directory, relative to the working directory.
# A forward-slash path is used so the notebook runs on Windows, macOS and Linux;
# the previous r"..\datasets" literal only resolves as a directory on Windows.
getter = DataGetter("../datasets")
# Read the hotel reservations CSV into `df`, the frame used by every later cell.
df = getter.read_csv("hotel_reservations_codeonly.csv")

In [5]:
# Display the loaded frame; the rendered output is a truncated preview
# (first and last rows separated by an ellipsis row).
df

Unnamed: 0,Reservation_ID,Guest_Name,Check_In_Date,Check_Out_Date,Room_Code,Number_of_Guests,Total_Amount,Payment_Method,Contact_Number,Booking_Status,Country
0,R0001,Guest_WJATQ,2025-09-11,2025-09-18,es401,15,4180.78,Debit Card,3544987648,No-Show,Canada
1,R0002,Guest_VHQJA,2025-01-17,2025-01-23,es302,2,4465.52,Bank Transfer,4484592881,Pending,UK
2,R0003,Guest_LUBCJ,2025-10-24,2025-10-29,es202,5,3522.26,Bank Transfer,4157989595,Pending,
3,R0004,Guest_YRIFB,2025-05-08,2025-05-13,st102,15,1410.00,Credit Card,7163149041,Confirmed,Italy
4,R0005,Guest_ASBHC,2025-09-23,2025-09-30,dl101,10,2594.27,Debit Card,7924531237,Confirmed,Spain
...,...,...,...,...,...,...,...,...,...,...,...
215,R0216,Guest_FTGKY,2025-02-19,2025-02-21,dl202,4,388.71,Debit Card,2191247059,Cancelled,Spain
216,R0217,Guest_MTVNH,2025-04-24,2025-04-30,es201,1,498.03,Bank Transfer,9797286494,Confirmed,UK
217,R0218,Guest_XHXND,2025-06-01,2025-06-09,db301,3,245.79,Credit Card,5271760054,Cancelled,USA
218,R0219,Guest_KGXPV,2025-04-22,2025-04-28,st301,4,560.44,Bank Transfer,7315241958,Cancelled,Brazil


# Detect Duplicate Records
Run DataInspector to count the duplicate values in each column.

In [7]:
# Wrap the loaded frame in an inspector and collect its duplicate report.
ins = DataInspector(df)
# `ind` holds the result of detect_duplicates(); it is displayed in the next cell.
ind = ins.detect_duplicates()

In [9]:
# Show the duplicate report: a dict mapping column name to a
# "<n> duplicate values" string.
ind

{'Guest_Name': '16 duplicate values',
 'Check_In_Date': '60 duplicate values',
 'Check_Out_Date': '62 duplicate values',
 'Room_Code': '170 duplicate values',
 'Number_of_Guests': '213 duplicate values',
 'Total_Amount': '9 duplicate values',
 'Payment_Method': '215 duplicate values',
 'Booking_Status': '216 duplicate values',
 'Country': '209 duplicate values'}

# Clean Duplicates
Fix duplicate values in the Room_Code column using DataCleaner.

In [11]:
# Build a cleaner around the loaded frame and fix duplicates in Room_Code.
# fix_duplicates() returns None, so wrapping it in print() only emitted "None";
# call it for its effect and inspect the result through the cleaner afterwards.
# NOTE(review): repeated Room_Code values may be legitimate (several
# reservations for the same room) — confirm de-duplicating this column is intended.
c = DataCleaner(df)
c.fix_duplicates("Room_Code")


In [13]:
# Preview the first rows of the frame held by the cleaner.
# NOTE(review): `_data` is a private attribute of DataCleaner; prefer a public
# accessor if the class exposes one — confirm.
c._data.head()

Unnamed: 0,Reservation_ID,Guest_Name,Check_In_Date,Check_Out_Date,Room_Code,Number_of_Guests,Total_Amount,Payment_Method,Contact_Number,Booking_Status,Country
0,R0001,Guest_WJATQ,2025-09-11,2025-09-18,es401,15,4180.78,Debit Card,3544987648,No-Show,Canada
1,R0002,Guest_VHQJA,2025-01-17,2025-01-23,es302,2,4465.52,Bank Transfer,4484592881,Pending,UK
2,R0003,Guest_LUBCJ,2025-10-24,2025-10-29,es202,5,3522.26,Bank Transfer,4157989595,Pending,
3,R0004,Guest_YRIFB,2025-05-08,2025-05-13,st102,15,1410.0,Credit Card,7163149041,Confirmed,Italy
4,R0005,Guest_ASBHC,2025-09-23,2025-09-30,dl101,10,2594.27,Debit Card,7924531237,Confirmed,Spain


# Detect Missing Values
Run DataInspector to find missing values

In [15]:
# Create a fresh inspector and collect the missing-value report.
# NOTE(review): this inspects `df`, not `c._data`; whether `df` reflects the
# duplicate fix depends on whether DataCleaner works on a copy — confirm.
ins = DataInspector(df)
# `inm` holds the result of detect_missing(); it is displayed in the next cell.
inm = ins.detect_missing()

In [17]:
# Show the missing-value report: one integer count per column.
inm

Reservation_ID       0
Guest_Name          17
Check_In_Date        0
Check_Out_Date       0
Room_Code            0
Number_of_Guests     0
Total_Amount        10
Payment_Method       0
Contact_Number       0
Booking_Status       0
Country             23
dtype: int64

# Detect Outliers
Run DataInspector to detect outliers

In [19]:
# Create a fresh inspector and collect the outlier report.
# NOTE(review): this inspects `df`, not `c._data`; whether `df` reflects the
# duplicate fix depends on whether DataCleaner works on a copy — confirm.
ins = DataInspector(df)
# `ino` holds the result of detect_outliers(); it is displayed in the next cell.
ino = ins.detect_outliers()

In [21]:
# Show the outlier report: a dict mapping column name to an outlier count.
ino

{'Number_of_Guests': 54, 'Total_Amount': 0, 'Contact_Number': 0}

# Export Cleaned Data
Export cleaned dataset to CSV using DataExporter

In [23]:
# Hand the cleaner's frame to the exporter.
# NOTE(review): `_data` is a private attribute of DataCleaner; prefer a public
# accessor if the class exposes one — confirm.
expo = DataExporter(c._data)
# to_csv() is called without arguments, so the output location is whatever
# DataExporter defaults to — TODO confirm the destination path.
expo.to_csv()