In [1]:
# autosave notebook after x seconds
%autosave 5



Autosaving every 5 seconds


In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

First install sckit-learn, if you do not have it. For this: 

```python
>>>conda config --add channels conda-forge
>>>conda install scikit-learn
```
Then go back and run the first code cell.



In [3]:
# load the diabetes dataset into a Pandas DataFrame
df_original = pd.read_csv("diabetes_NAN.csv")
df_original.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148,72,35,0,33.6,0.627,50,1
1,,85,66,29,0,26.6,0.351,31,0
2,8.0,183,64,0,0,100.0,0.672,32,1
3,1.0,89,66,23,94,28.1,0.167,21,0
4,0.0,137,40,35,168,43.1,2.288,33,1


In [4]:
df_original.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,767.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.848761,120.894531,69.105469,20.536458,79.799479,32.092448,0.471876,33.240885,0.348958
std,3.370207,31.972618,19.355807,15.952218,115.244002,8.25115,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,100.0,2.42,81.0,1.0


In [5]:
df = df_original

# find rows with missing values
missing_rows = df[df.isnull().any(axis=1)]

# drop rows with missing values
df = df.dropna()

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148,72,35,0,33.6,0.627,50,1
2,8.0,183,64,0,0,100.0,0.672,32,1
3,1.0,89,66,23,94,28.1,0.167,21,0
4,0.0,137,40,35,168,43.1,2.288,33,1
5,5.0,116,74,0,0,25.6,0.201,30,0


In [6]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.848761,120.94133,69.109518,20.525424,79.90352,32.099609,0.472034,33.243807,0.349413
std,3.370207,31.967149,19.368112,15.959694,115.283105,8.254146,0.331516,11.767627,0.477096
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.35,0.2435,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,32.0,0.374,29.0,0.0
75%,6.0,140.5,80.0,32.0,127.5,36.6,0.6265,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,100.0,2.42,81.0,1.0


Alternatively, we can use the SimpleImputer class from scikit-learn to fill in the missing values with a specified value, such as the mean or median:

In [8]:
# create a SimpleImputer object
imputer = SimpleImputer(strategy="mean")

# fit the imputer to the data and transform the data to fill in missing values
df_imputed = imputer.fit_transform(df_original)

# convert the transformed data back into a Pandas DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df_original.columns)

In [9]:
# display the first five rows of the dataset (with removed NaNs via SimpleImputer)
df_imputed.head() 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,3.848761,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,0.0,0.0,100.0,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


Another important aspect of data cleaning is scaling the data. This is especially important for machine learning algorithms, as many of them are sensitive to the scale of the input data. We can use the StandardScaler class from scikit-learn to standardize the data:

In [10]:
# create a StandardScaler object
# check the docs: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
#note that we do not work with df_imputed , but with the dataframe df with removed NANS 

scaler = StandardScaler()

# fit the scaler to the data and transform the data to scale it
df_scaled = scaler.fit_transform(df)

# convert the transformed data back into a Pandas DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)

In [11]:
# display the first five rows of the original dataset 
df.head() 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148,72,35,0,33.6,0.627,50,1
2,8.0,183,64,0,0,100.0,0.672,32,1
3,1.0,89,66,23,94,28.1,0.167,21,0
4,0.0,137,40,35,168,43.1,2.288,33,1
5,5.0,116,74,0,0,25.6,0.201,30,0


In [12]:
# display the first five rows of the transformed dataset
df_scaled.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.638727,0.847005,0.149337,0.907538,-0.693559,0.181893,0.467752,1.424852,1.364529
1,1.232549,1.942593,-0.263983,-1.286918,-0.693559,8.231584,0.60358,-0.105766,1.364529
2,-0.845829,-0.999844,-0.160653,0.155153,0.122357,-0.484874,-0.920719,-1.041144,-0.732853
3,-1.142741,0.502677,-1.503942,0.907538,0.764674,1.333581,5.481338,-0.020732,1.364529
4,0.341816,-0.154676,0.252667,-1.286918,-0.693559,-0.787949,-0.818093,-0.275835,-0.732853
