In [1]:
import pandas as pd

In [3]:
data_url = "https://dr3vr6j2erh62.cloudfront.net/mediastore/dsm020/datasets/pima_indians-diabetes_data-raw.csv"
df = pd.read_csv(data_url)

In [5]:
df.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


We see there are 
- Numerical variables (counts, scale values, floating values)
- Categorical values

In [6]:
df.describe() # Quick summary description

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [32]:
# let's explore data types used to represent our data
df.dtypes

times_pregnant                    int64
plasma_glucose_concentration      int64
diastolic_blood_pressure          int64
triceps_thickness                 int64
2-hour_serum_insulin              int64
BMI                             float64
diabetes_pedigreen              float64
age                               int64
diabetes                          int64
dtype: object

In [9]:
# Some models downstream may prefer numercial values, lets chage diabetes bool to an integer
# reassign diabetes column to itself, casted as int
df["diabetes"] = df["diabetes"].astype(int)

In [10]:
df.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Handling Missing Values and Outliers

In [33]:
# make a deep copy
df2 = df.copy()

In [34]:
import numpy as np
#Replace 0 values in cols 1-7 with Nan
for col in df2.columns[1:7]:
    df2[col] = df2[col].replace(0, np.nan)
df2.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [35]:
df2.dtypes

times_pregnant                    int64
plasma_glucose_concentration    float64
diastolic_blood_pressure        float64
triceps_thickness               float64
2-hour_serum_insulin            float64
BMI                             float64
diabetes_pedigreen              float64
age                               int64
diabetes                          int64
dtype: object

Note that the dtypes for the columns are now represented using floating point object which increases the amount of memory consumption from our code

In [16]:
# Alternative: Use in-place replacement on entire dataframe

df3 = df.copy()

df3.replace(0, np.nan, inplace=True)

In [17]:
df3.dtypes

times_pregnant                  float64
plasma_glucose_concentration    float64
diastolic_blood_pressure        float64
triceps_thickness               float64
2-hour_serum_insulin            float64
BMI                             float64
diabetes_pedigreen              float64
age                               int64
diabetes                        float64
dtype: object

In [19]:
df3.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6.0,148.0,72.0,35.0,,33.6,0.627,50,1.0
1,1.0,85.0,66.0,29.0,,26.6,0.351,31,
2,8.0,183.0,64.0,,,23.3,0.672,32,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,
4,,137.0,40.0,35.0,168.0,43.1,2.288,33,1.0


In [36]:
df2.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [40]:
# Outlier due to non conformity hurt the generalization

#Outlier notion: Naive implementation using relative distance (absolute distance) from the mean. 
#We may decide that anything +/- 2 standard deviations away from the mean should be considered an outlier

m = np.mean(df2["plasma_glucose_concentration"])
m

121.6867627785059

In [41]:
sd = np.std(df2["plasma_glucose_concentration"])
sd

30.515624262345657

In [42]:
df3 = df2.copy()
# lets choose 2.5 standard deviations 

# What to do once we find it? Replace with mean, Nan, Imputation (robust, model learnt)
m = np.mean(df2["plasma_glucose_concentration"])
sd = np.std(df2["plasma_glucose_concentration"])

threshold = (sd*2.5)
#Replace with mean
df3.loc[df3["plasma_glucose_concentration"] > (m + threshold), 'plasma_glucose_concentration'] = m
df3.loc[df3["plasma_glucose_concentration"] < (m - threshold), 'plasma_glucose_concentration'] = m


In [43]:
df3.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.587235,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.149764,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,56.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.5,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,140.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,197.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


boolean collection that when overlaid on another (col) (or vectorf) object of same size, will only allow values with corresponding true in mask to be selected 

In [28]:
# More advanced but robust method for outlier handling is to use masks (boolean operators which operate on true) 

df4 = df2.copy()
m = np.mean(df2["plasma_glucose_concentration"])
sd = np.std(df2["plasma_glucose_concentration"])
threshold = (sd*2.5)

In [45]:
#construct mask 
mask = (df4["plasma_glucose_concentration"] > (m+threshold)) | \
(df4["plasma_glucose_concentration"] < (m-threshold))

In [46]:
df4.loc[mask, 'plasma_glucose_concentration' ] = m #replace with m if you find cells with true in mask

In [47]:
df4.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.587235,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.149764,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,56.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.5,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,140.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,197.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [49]:
# Working with missing data (NaN, Null)

In [85]:
df = pd.DataFrame(np.random.randn(5,5))
df

Unnamed: 0,0,1,2,3,4
0,0.91778,-0.105589,-0.766267,-0.067883,0.54516
1,0.099319,0.512009,0.54202,-0.255811,-0.215394
2,-1.905376,-0.990187,-0.466139,-2.102063,-1.034921
3,1.27991,-0.118049,-0.225738,-0.214176,-1.374727
4,-1.081416,-0.968084,-0.783406,0.177861,-0.07795


In [56]:
df.iloc[:2,2] # choose specific iloc locations

0    0.981099
1    0.813138
Name: 2, dtype: float64

In [87]:
df.iloc[:]

Unnamed: 0,0,1,2,3,4
0,,-0.105589,-0.766267,-0.067883,
1,0.099319,,,-0.255811,-0.215394
2,-1.905376,-0.990187,-0.466139,-2.102063,-1.034921
3,1.27991,-0.118049,-0.225738,-0.214176,-1.374727
4,-1.081416,-0.968084,-0.783406,0.177861,-0.07795


In [86]:
# use those locations to define masks, or replacement rules
df[df.iloc[:2]>=0.5] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,,-0.105589,-0.766267,-0.067883,
1,0.099319,,,-0.255811,-0.215394
2,-1.905376,-0.990187,-0.466139,-2.102063,-1.034921
3,1.27991,-0.118049,-0.225738,-0.214176,-1.374727
4,-1.081416,-0.968084,-0.783406,0.177861,-0.07795


In [88]:
df

Unnamed: 0,0,1,2,3,4
0,,-0.105589,-0.766267,-0.067883,
1,0.099319,,,-0.255811,-0.215394
2,-1.905376,-0.990187,-0.466139,-2.102063,-1.034921
3,1.27991,-0.118049,-0.225738,-0.214176,-1.374727
4,-1.081416,-0.968084,-0.783406,0.177861,-0.07795


In [95]:
# Work on whole dataset
df[df.iloc[:]>=-0.990187] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,,,,,
1,,,,,
2,-1.905376,-0.990187,,-2.102063,-1.034921
3,,,,,-1.374727
4,-1.081416,,,,


In [94]:
print(df[1].isnull().sum()) # how many null values in xth col

1


In [69]:
df

Unnamed: 0,0,1,2,3,4
0,0.930181,0.804702,-0.294919,-0.176581,-0.334856
1,-0.704964,,-1.193574,0.26526,-1.426323
2,0.636026,0.172577,,-0.047063,-0.456545
3,-0.800867,0.040146,0.425497,-2.370833,0.183121
4,0.717917,0.123275,0.694856,-1.922997,-0.92113


In [70]:
#replace all Nan with mean of column

df.fillna(df.mean(), inplace=True) 
df

Unnamed: 0,0,1,2,3,4
0,0.930181,0.804702,-0.294919,-0.176581,-0.334856
1,-0.704964,0.285175,-1.193574,0.26526,-1.426323
2,0.636026,0.172577,-0.092035,-0.047063,-0.456545
3,-0.800867,0.040146,0.425497,-2.370833,0.183121
4,0.717917,0.123275,0.694856,-1.922997,-0.92113


get df in np array representation for advanced techniques

In [96]:
values = df.values
values

array([[        nan,         nan,         nan,         nan,         nan],
       [        nan,         nan,         nan,         nan,         nan],
       [-1.90537616, -0.99018704,         nan, -2.10206339, -1.03492132],
       [        nan,         nan,         nan,         nan, -1.37472707],
       [-1.08141632,         nan,         nan,         nan,         nan]])

In [97]:
#advanced technique - tools (imputers, knn)
from numpy import isnan
from numpy import nan
from sklearn.impute import SimpleImputer # replaces values in place


imputer = SimpleImputer(missing_values=nan, strategy="most_frequent") # or mean, or advanced techniques in ML
transformed_values = imputer.fit_transform(values)
transformed_values

array([[-1.90537616, -0.99018704, -2.10206339, -1.37472707],
       [-1.90537616, -0.99018704, -2.10206339, -1.37472707],
       [-1.90537616, -0.99018704, -2.10206339, -1.03492132],
       [-1.90537616, -0.99018704, -2.10206339, -1.37472707],
       [-1.08141632, -0.99018704, -2.10206339, -1.37472707]])