# customer segmentation Data 

Prepare a customer dataset for clustering by handling missing values
and one-hot encoding categorical features.

Import library

In [1]:
import pandas as pd
import numpy as np

# Step 1: Load Data

In [2]:
# Create a DataFrame with some missing values
data = {
    'Customer_ID': [1, 2, 3, 4, 5],
    'Age': [25, 30, np.nan, 45, 50],
    'Customer_Type': ['New', 'Regular', 'New', np.nan, 'Regular']
}
df = pd.DataFrame(data)

print("Initial DataFrame:")
print(df)

Initial DataFrame:
   Customer_ID   Age Customer_Type
0            1  25.0           New
1            2  30.0       Regular
2            3   NaN           New
3            4  45.0           NaN
4            5  50.0       Regular


# Step 2: Impute Numerical Data

In [3]:
# Calculate the median of the 'Age' column
age_median = df['Age'].median()

# Fill missing 'Age' values with the median
df['Age'].fillna(age_median, inplace=True)

print("\nDataFrame after imputing 'Age' with median:")
print(df)



DataFrame after imputing 'Age' with median:
   Customer_ID   Age Customer_Type
0            1  25.0           New
1            2  30.0       Regular
2            3  37.5           New
3            4  45.0           NaN
4            5  50.0       Regular


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(age_median, inplace=True)


# Step 3: Handle Categorical Data

In [4]:
# Fill missing 'Customer_Type' values with 'Unknown'
df['Customer_Type'].fillna('Unknown', inplace=True)

# Perform one-hot encoding on 'Customer_Type'
df_encoded = pd.get_dummies(df, columns=['Customer_Type'], dtype=int)

print("\nDataFrame after handling 'Customer_Type' and one-hot encoding:")
print(df_encoded)



DataFrame after handling 'Customer_Type' and one-hot encoding:
   Customer_ID   Age  Customer_Type_New  Customer_Type_Regular  \
0            1  25.0                  1                      0   
1            2  30.0                  0                      1   
2            3  37.5                  1                      0   
3            4  45.0                  0                      0   
4            5  50.0                  0                      1   

   Customer_Type_Unknown  
0                      0  
1                      0  
2                      0  
3                      1  
4                      0  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Customer_Type'].fillna('Unknown', inplace=True)
