In [3]:
import pandas as pd
import matplotlib.pyplot as plt

### Dataset Description

In [4]:

# Load the customer records from the training CSV into a DataFrame.
customer_data = pd.read_csv(filepath_or_buffer="data/Train.csv")

In [13]:
# Preview the first five rows (five is the default for head()).
customer_data.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [7]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null counts per column, dtypes and memory usage.
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [9]:
# Report the DataFrame dimensions: shape is (rows, columns).
print(
    f'Number of Rows: {customer_data.shape[0]}',
    f'Number of Columns: {customer_data.shape[1]}',
    sep='\n',
)

Number of Rows: 8068
Number of Columns: 11


In [10]:
# List the column labels of the DataFrame.
customer_data.columns

Index(['ID', 'Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1',
       'Segmentation'],
      dtype='object')

### Data Cleaning

In [59]:
# Count the missing values in each column.
# NOTE(review): the saved output of this cell reports 0 missing Ever_Married
# values, while info() above reports 7928 non-null out of 8068 rows. The
# execution counters suggest this cell was last run after the Ever_Married
# imputation; re-run the notebook top to bottom to refresh the output.
customer_data.isna().sum()

ID                   0
Gender               0
Ever_Married         0
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [55]:
# The dataset is quite small, so missing values are imputed rather than dropped.
# The age at which people get married is fairly similar in general, so a missing
# Ever_Married value is imputed with the most common value among customers of the same age.

# Fallback for age groups in which every Ever_Married value is missing.
overall_mode_of_ever_married = customer_data['Ever_Married'].mode().iloc[0]


def _mode_or_fallback(values, fallback=overall_mode_of_ever_married):
    """Return the most frequent non-null entry of ``values``.

    ``Series.mode()`` drops nulls, so it is empty for an all-null group and
    ``.iloc[0]`` would raise ``IndexError``; ``fallback`` is returned in that
    case. When several values tie, the first mode returned by pandas is used.
    """
    modes = values.mode()
    return fallback if modes.empty else modes.iloc[0]


# Series indexed by Age holding the most common Ever_Married value for that age.
mode_of_ever_married_by_age = customer_data.groupby("Age")['Ever_Married'].apply(_mode_or_fallback)

In [57]:
# Fill each missing Ever_Married entry with the per-age mode looked up via the row's Age.
customer_data['Ever_Married'] = customer_data['Ever_Married'].fillna(
    value=customer_data['Age'].map(mode_of_ever_married_by_age)
)

In [63]:
# Imputing Graduated.
# First check how strongly Age correlates with Graduated (encoded as Yes -> 1, No -> 0).
correlation_between_age_graduated = customer_data['Age'].corr(
    other=customer_data['Graduated'].map({'Yes': 1, 'No': 0})
)
print(f'{correlation_between_age_graduated}')

0.23730934258442968


In [64]:
# The correlation is small, i.e. there is only a weak relationship between age and graduation,
# so Graduated is simply imputed with its overall mode.
# Assign the result back instead of calling fillna(..., inplace=True) on the column selection:
# the chained in-place form is deprecated and does not modify customer_data under Copy-on-Write.
customer_data['Graduated'] = customer_data['Graduated'].fillna(customer_data['Graduated'].mode().iloc[0])

In [66]:
# Impute Profession with its most frequent value.
# Assign the result back instead of calling fillna(..., inplace=True) on the column selection:
# the chained in-place form is deprecated and does not modify customer_data under Copy-on-Write.
customer_data['Profession'] = customer_data['Profession'].fillna(customer_data['Profession'].mode().iloc[0])

In [67]:
# Impute Work_Experience with its median.
# Assign the result back instead of calling fillna(..., inplace=True) on the column selection:
# the chained in-place form is deprecated and does not modify customer_data under Copy-on-Write.
customer_data['Work_Experience'] = customer_data['Work_Experience'].fillna(customer_data['Work_Experience'].median())

In [70]:
# Impute Family_Size with its median.
# Assign the result back instead of calling fillna(..., inplace=True) on the column selection:
# the chained in-place form is deprecated and does not modify customer_data under Copy-on-Write.
customer_data['Family_Size'] = customer_data['Family_Size'].fillna(customer_data['Family_Size'].median())

In [71]:
# Impute Var_1 with its most frequent value.
# Assign the result back instead of calling fillna(..., inplace=True) on the column selection:
# the chained in-place form is deprecated and does not modify customer_data under Copy-on-Write.
customer_data['Var_1'] = customer_data['Var_1'].fillna(customer_data['Var_1'].mode().iloc[0])

In [72]:
# Re-count the missing values per column: every count should now be zero,
# confirming that all columns have been imputed.
customer_data.isna().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
Segmentation       0
dtype: int64