In [2]:
import pandas as pd
import os
 
# Read the raw utilization export into a DataFrame.
df = pd.read_csv('utilization.csv')

# Preview the first rows under a heading so the column layout is visible
# before any cleaning happens.
print("Original Data:", df.head(), sep="\n")
 
# Clean the data
# Impute missing values column by column: numeric columns receive their mean,
# every other column (text, categorical, ...) receives its most frequent value.
# NOTE(review): this also imputes identifier-like and date-like text columns
# (e.g. `id`, `score_date`) with their mode, and the statistics are computed
# before duplicate rows are removed -- confirm both are intended.
for column in df.columns:
    # Columns without gaps need no work; skipping them also avoids computing
    # a mean/mode that would never be used.
    if not df[column].isna().any():
        continue

    if pd.api.types.is_numeric_dtype(df[column]):
        # Testing for "numeric" rather than "not object" keeps string-backed
        # dtypes (`string`, `category`) away from .mean(), which would raise.
        df[column] = df[column].fillna(df[column].mean())
    else:
        modes = df[column].mode()
        # mode() returns an empty Series when every value is missing; leave
        # such a column untouched instead of failing on modes[0].
        if not modes.empty:
            df[column] = df[column].fillna(modes.iloc[0])
 
# Remove duplicate rows and parse `score_date` (day-month-year text) into
# datetime64 in a single chain. .assign() builds a new frame, so the column is
# never written into the object returned by drop_duplicates(); assigning into
# that object directly can emit SettingWithCopyWarning.
df_cleaned = df.drop_duplicates().assign(
    score_date=lambda frame: pd.to_datetime(frame['score_date'], format='%d-%m-%Y')
)
 
# Preview the cleaned frame under its own heading; the leading newline in the
# heading separates it from the earlier output.
print("\nCleaned Data:", df_cleaned.head(), sep="\n")
 
# Ensure the directory exists
# exist_ok=True makes the call a no-op when the directory is already present,
# replacing the separate exists() check and the race window between the check
# and the creation.
output_dir = 'files'
os.makedirs(output_dir, exist_ok=True)

# Save the cleaned data to a new CSV file (index=False leaves the row index
# out of the file).
output_path = os.path.join(output_dir, 'cleaned_utilization_eda.csv')
df_cleaned.to_csv(output_path, index=False)

print("Data cleaning completed successfully!")
 

Original Data:
              id  score_date  max_seats  agent_utilization  \
0   JMAN_2429072  10-02-2023        2.0           0.500000   
1   JMAN_9043466  13-01-2023        9.0           0.111111   
2   JMAN_2452556  24-02-2023        1.0           0.000000   
3   JMAN_2455113  03-03-2023        1.0           0.000000   
4  JMAN_18127937  10-02-2023        5.0           0.000000   

   is_provisioned_any_channel_M_before  max_seats_M_before  \
0                                  2.0                 2.0   
1                                  9.0                 9.0   
2                                  1.0                 1.0   
3                                  1.0                 1.0   
4                                  NaN                 5.0   

   agent_utilization_increase  agent_utilization_decrease  seat_utilization  \
0                           0                           0               1.0   
1                           0                           1               1.0   
2 