## Importing libraries and loading the data

In [3]:
import pandas as pd
import numpy as np

# Read the raw hotel bookings table from disk, then print a summary of its
# columns, dtypes and non-null counts.
bookings_csv = 'hotel_bookings.csv'
df = pd.read_csv(bookings_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

## Finding missing data and duplicates

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [7]:
# True for every column that contains at least one missing value.
columns_with_missing = df.isna().any()
print(columns_with_missing)

hotel                             False
is_canceled                       False
lead_time                         False
arrival_date_year                 False
arrival_date_month                False
arrival_date_week_number          False
arrival_date_day_of_month         False
stays_in_weekend_nights           False
stays_in_week_nights              False
adults                            False
children                           True
babies                            False
meal                              False
country                            True
market_segment                    False
distribution_channel              False
is_repeated_guest                 False
previous_cancellations            False
previous_bookings_not_canceled    False
reserved_room_type                False
assigned_room_type                False
booking_changes                   False
deposit_type                      False
agent                              True
company                            True


In [9]:
# Count rows that are exact repeats of an earlier row.
duplicate_rows = df.duplicated()
print(duplicate_rows.sum())

31994


## Cleaning up

In [18]:
# Fill only `children` with 0 at this point. A blanket df.fillna(0) here would
# also replace the missing `country` and `agent` values with 0 before the
# cells below handle them: the mode-based `country` fill would then find
# nothing to fill, and df['agent'].notna() would be True for every row.
df = df.fillna({'children': 0})

### Filling missing `country` values with the mode

In [13]:
# Replace missing countries with the most frequent country in the column.
country_col = df['country']
mode_value = country_col.mode()[0]  # first entry of the mode Series
df['country'] = country_col.fillna(mode_value)

### For `agent`, we create a binary flag marking whether the booking has an agent

In [14]:
# Flag bookings that have an agent, then normalise the `agent` column.
# Missing agents may already have been replaced by 0 by an earlier fill, in
# which case a plain .notna() is True for every row. Treating both NaN and 0
# as "no agent" gives the same flag either way and makes this cell safe to
# re-run.
# NOTE(review): assumes 0 is never a genuine agent id — confirm against the data.
agent_ids = df['agent'].fillna(0)
df['has_agent'] = agent_ids.ne(0)
df['agent'] = agent_ids

### Since `company` is missing in the vast majority of rows (around 94.6%), we drop the column entirely

In [15]:
# Remove the `company` column from the frame.
df = df.drop(columns=['company'])

### Dropping duplicate entries

In [16]:
# Remove rows that are exact repeats, keeping the first occurrence of each.
# The result is reassigned instead of using inplace=True: in-place mutation
# has no performance benefit, and reassignment keeps the data lineage explicit
# when cells are re-run. The original index labels of surviving rows are kept.
df = df.drop_duplicates(keep='first')

### Verification

In [19]:
# Confirm the cleaning steps: total missing cells and total duplicated rows.
remaining_nulls = df.isna().sum().sum()
remaining_dupes = df.duplicated().sum()

print("Missing values after cleaning:")
print(remaining_nulls)
print(f"Duplicates after cleaning: {remaining_dupes}")

Missing values after cleaning:
0
Duplicates after cleaning: 0
