In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the 'customer_details' worksheet of the grocery workbook, then preview the first rows.
customer_details = pd.read_excel(
    'grocery_database.xlsx',
    sheet_name='customer_details',
)
customer_details.head(n=5)

Unnamed: 0,customer_id,distance_from_store,gender,credit_score
0,74,3.38,F,0.59
1,524,4.76,F,0.52
2,607,4.45,F,0.49
3,343,0.91,M,0.54
4,322,3.02,F,0.63


In [4]:
# Per-column tally of missing entries, followed by the per-column tally of
# present entries. The first print ends with three newlines so the two
# summaries are separated by the same blank lines as a standalone print('\n').
print(customer_details.isna().sum(), end='\n\n\n')
print(customer_details.notna().sum())

customer_id            0
distance_from_store    5
gender                 5
credit_score           8
dtype: int64


customer_id            870
distance_from_store    865
gender                 865
credit_score           862
dtype: int64


In [5]:
# Show only the rows whose distance_from_store entry is missing.
customer_details.loc[customer_details['distance_from_store'].isna()]

Unnamed: 0,customer_id,distance_from_store,gender,credit_score
169,517,,,
314,105,,F,0.71
466,218,,M,0.37
576,362,,,0.56
845,292,,,


In [6]:
# Show only the rows whose distance_from_store entry is present.
customer_details.loc[customer_details['distance_from_store'].notna()]

Unnamed: 0,customer_id,distance_from_store,gender,credit_score
0,74,3.38,F,0.59
1,524,4.76,F,0.52
2,607,4.45,F,0.49
3,343,0.91,M,0.54
4,322,3.02,F,0.63
...,...,...,...,...
865,372,4.38,F,0.50
866,104,2.36,F,0.63
867,393,1.87,M,0.59
868,373,0.21,M,0.47


##### Dropping Rows with NA Values (Case Dependent)

In [7]:
# Drop every row that has at least one missing value.
# The result is a new DataFrame; customer_details itself is not modified.
customer_details.dropna(axis='index', how='any')

Unnamed: 0,customer_id,distance_from_store,gender,credit_score
0,74,3.38,F,0.59
1,524,4.76,F,0.52
2,607,4.45,F,0.49
3,343,0.91,M,0.54
4,322,3.02,F,0.63
...,...,...,...,...
865,372,4.38,F,0.50
866,104,2.36,F,0.63
867,393,1.87,M,0.59
868,373,0.21,M,0.47


In [8]:
# Drop only the rows in which every column is missing.
# The result is a new DataFrame; customer_details itself is not modified.
customer_details.dropna(axis='index', how='all')

Unnamed: 0,customer_id,distance_from_store,gender,credit_score
0,74,3.38,F,0.59
1,524,4.76,F,0.52
2,607,4.45,F,0.49
3,343,0.91,M,0.54
4,322,3.02,F,0.63
...,...,...,...,...
865,372,4.38,F,0.50
866,104,2.36,F,0.63
867,393,1.87,M,0.59
868,373,0.21,M,0.47


In [9]:
# Drop rows that are missing a value in any of the listed columns;
# missing values in other columns do not cause a row to be dropped.
customer_details.dropna(
    subset=['distance_from_store', 'gender'],
    how='any',
)

Unnamed: 0,customer_id,distance_from_store,gender,credit_score
0,74,3.38,F,0.59
1,524,4.76,F,0.52
2,607,4.45,F,0.49
3,343,0.91,M,0.54
4,322,3.02,F,0.63
...,...,...,...,...
865,372,4.38,F,0.50
866,104,2.36,F,0.63
867,393,1.87,M,0.59
868,373,0.21,M,0.47


##### Filling Rows with NA Values (Case Dependent)

In [10]:
# Small two-column example frame with gaps (NaN) to demonstrate filling.
my_df = pd.DataFrame(
    data=dict(
        A=[1, 2, 4, np.nan, 5, np.nan, 7],
        B=[4, np.nan, 7, np.nan, 1, np.nan, 2],
    )
)
my_df

Unnamed: 0,A,B
0,1.0,4.0
1,2.0,
2,4.0,7.0
3,,
4,5.0,1.0
5,,
6,7.0,2.0


In [11]:
# Replace the gaps in column A with that column's mean.
# fillna returns a new Series here and my_df is left unchanged; assign the
# result back (my_df['A'] = ...) to keep the filled values in the frame.
mean_a = my_df['A'].mean()
my_df['A'].fillna(mean_a)

0    1.0
1    2.0
2    4.0
3    3.8
4    5.0
5    3.8
6    7.0
Name: A, dtype: float64

In [12]:
# Fill missing gender values in customer_details with 'U' (unknown).
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: that form is a chained
# assignment, which recent pandas versions warn about and which, with
# Copy-on-Write enabled, leaves customer_details unchanged.
customer_details['gender'] = customer_details['gender'].fillna('U')
customer_details['gender'].value_counts()  # confirm the 'U' bucket was created

F    485
M    380
U      5
Name: gender, dtype: int64

In [13]:
# Summary statistics for distance_from_store, used to choose a fill strategy.
# Outliers pull the mean upward, so the median is likely the safer
# replacement for the missing values in this column.
customer_details.loc[:, 'distance_from_store'].describe()

count    865.000000
mean       2.614855
std       14.248286
min        0.000000
25%        0.740000
50%        1.660000
75%        2.940000
max      400.970000
Name: distance_from_store, dtype: float64

In [14]:
# Replace missing distance_from_store values with the column median.
# The median is computed before filling, so it reflects only observed values.
median_distance = customer_details['distance_from_store'].median()
# Assign the filled column back instead of using fillna(inplace=True) on the
# column selection; the chained inplace form triggers warnings in recent
# pandas and does not update customer_details when Copy-on-Write is enabled.
customer_details['distance_from_store'] = (
    customer_details['distance_from_store'].fillna(median_distance)
)
customer_details.isna().sum()  # remaining missing values per column

customer_id            0
distance_from_store    0
gender                 0
credit_score           8
dtype: int64

In [15]:
# Summary statistics for credit_score, used to choose a fill strategy.
# Mean and median are nearly equal here, so either one is a reasonable
# replacement for the missing values.
customer_details.loc[:, 'credit_score'].describe()

count    862.000000
mean       0.597413
std        0.102716
min        0.260000
25%        0.530000
50%        0.590000
75%        0.670000
max        0.880000
Name: credit_score, dtype: float64

In [16]:
# Replace missing credit_score values with the column mean.
# The mean is computed before filling, so it reflects only observed values.
mean_credit_score = customer_details['credit_score'].mean()
# Assign the filled column back instead of using fillna(inplace=True) on the
# column selection; the chained inplace form triggers warnings in recent
# pandas and does not update customer_details when Copy-on-Write is enabled.
customer_details['credit_score'] = (
    customer_details['credit_score'].fillna(mean_credit_score)
)
customer_details.isna().sum()  # remaining missing values per column

customer_id            0
distance_from_store    0
gender                 0
credit_score           0
dtype: int64