In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the diabetes dataset from a CSV file; the path is relative to the working directory.
diabetes = pd.read_csv('data/diabetes_data.csv')
# Preview the first rows of the loaded frame.
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [3]:
# Preview the last rows of the loaded frame.
diabetes.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
773,6,103,72,32,190,37.7,0.324,55,0,Female
774,1,71,48,18,76,20.4,0.323,22,0,Female
775,0,117,0,0,0,33.8,0.932,44,0,Female
776,4,154,72,29,126,31.3,0.338,37,0,Female
777,5,147,78,0,0,33.7,0.218,65,0,Female


In [9]:
# Rows are compared on every column when looking for duplicates.
diabetes_columns = diabetes.columns.tolist()
# True for each row that repeats an earlier row.
mask = diabetes.duplicated(subset=diabetes_columns)
diabetes_duplicates = diabetes[mask]

# Keep the first occurrence of every row and discard the later repeats.
diabetes_dedupped = diabetes[~mask]

# Number of rows left after de-duplication.
len(diabetes_dedupped)

768

In [11]:
low_information_cols = []
# Both checks below use the same cut-off.
threshold = 0.95
for col in diabetes_dedupped:
    series = diabetes_dedupped[col]
    # Largest relative frequency of a single value in the column.
    top_freq = series.value_counts(normalize=True).max()
    # Number of distinct values relative to the number of non-null entries.
    nunique_ratio = series.nunique() / series.count()
    # Column dominated by one value.
    if top_freq > threshold:
        low_information_cols.append(col)
        print(f'{col}: {round(top_freq*100, 2)}% одинаковых значений')
    # Column in which almost every value is distinct.
    if nunique_ratio > threshold:
        low_information_cols.append(col)
        print(f'{col}: {round(nunique_ratio*100, 2)}% уникальных значений')

Gender: 100.0% одинаковых значений


In [13]:
# Remove the columns flagged as carrying little information.
diabetes_information = diabetes_dedupped.drop(columns=low_information_cols)
# Number of columns that remain.
len(diabetes_information.columns)

9

In [27]:
# Columns in which a stored 0 is treated as a missing value.
list_of_collumns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Vectorised replacement of zeros with NaN. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
diabetes_information[list_of_collumns] = (
    diabetes_information[list_of_collumns].replace(0, np.nan)
)

# Share of missing values per column; keep only columns that have any, largest first.
diabetes_isnul = diabetes_information.isnull().mean()
cols_with_null = diabetes_isnul[diabetes_isnul > 0].sort_values(ascending=False)
display(round(cols_with_null, 2))

Insulin          0.49
SkinThickness    0.30
BloodPressure    0.05
BMI              0.01
Glucose          0.01
dtype: float64

In [29]:


# Minimum number of non-null values a column needs in order to be kept: 70% of the rows.
# Rounding up gives an integer `thresh` that selects exactly the same columns as the
# fractional value would, because non-null counts are whole numbers.
thresh = int(np.ceil(diabetes_information.shape[0] * 0.7))
# Drop the columns with more than 30% (100 - 70) missing values.
# Only `thresh` is passed: pandas does not allow combining `how` with `thresh`.
diabetes_droped = diabetes_information.dropna(thresh=thresh, axis=1)

# Report the shape of the frame that was just produced.
print(diabetes_droped.shape)

(768, 8)


In [35]:
# Number of columns in the frame; a row is kept when at least m - 2 of its cells
# are non-null, i.e. when it has at most two missing values.
m = diabetes_droped.shape[1]
# Only `thresh` is passed: pandas does not allow combining `how` with `thresh`.
diabetes_droped = diabetes_droped.dropna(thresh=m - 2, axis=0)

# Report the shape after the row filter.
print(diabetes_droped.shape)

(761, 8)


In [37]:
# Show the first rows of the filtered frame; missing values are still present at this point.
display (diabetes_droped.head())



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,34.0,0.43,43,0
1,2,112.0,75.0,32.0,35.7,0.148,21,0
2,2,108.0,64.0,,30.8,0.158,21,0
3,8,107.0,80.0,,24.6,0.856,34,0
4,7,136.0,90.0,,29.9,0.21,50,0


In [43]:
# Fill value for each listed column: that column's median.
values = {
    name: diabetes_droped[name].median()
    for name in (
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
        'Outcome',
    )
}

# Replace the missing values column by column with the medians computed above.
diabetes_filled = diabetes_droped.fillna(value=values)
# Mean of SkinThickness after imputation, rounded to one decimal place.
round(diabetes_filled['SkinThickness'].mean(), 1)

29.1

In [50]:
# NOTE(review): bare expression that is not the last statement of the cell, so it renders nothing.
diabetes_filled

def outliers_iqr_mod(data, feature, left=1.5, right=1.5):
    """Split ``data`` into outliers and non-outliers of ``feature`` using the IQR rule.

    NOTE: a later cell defines ``outliers_iqr_mod`` again with an additional
    ``log_scale`` parameter; once that cell has run it replaces this definition.

    Args:
        data: DataFrame to split.
        feature: Name of the column whose values are tested.
        left: Number of IQRs below the first quartile at which the lower bound sits.
        right: Number of IQRs above the third quartile at which the upper bound sits.

    Returns:
        Tuple ``(outliers, cleaned)``. ``outliers`` holds the rows whose value lies
        strictly outside the bounds; ``cleaned`` holds the rows whose value lies
        within the bounds, the bounds themselves included. Rows in which
        ``feature`` is NaN appear in neither frame.
    """
    x = data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * left)
    upper_bound = quartile_3 + (iqr * right)
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive on both ends: a value exactly on a bound is not an outlier, so it
    # has to be kept here, otherwise the row would vanish from both results.
    cleaned = data[x.between(lower_bound, upper_bound)]
    return outliers, cleaned

# IQR-based outlier split on SkinThickness with the default 1.5 * IQR whiskers.
outliers, cleaned = outliers_iqr_mod(data=diabetes_filled, feature='SkinThickness')
# (rows, columns) of the outlier subset.
outliers.shape

(87, 8)

In [55]:
def outliers_z_score_mod(data, feature, log_scale=False, left=3, right=3):
    """Split ``data`` into outliers and non-outliers of ``feature`` using the sigma rule.

    Args:
        data: DataFrame to split.
        feature: Name of the column whose values are tested.
        log_scale: When True, the test is run on ``np.log`` of the column instead
            of the raw values. ``np.log`` is applied as is, so zero or negative
            values produce ``-inf``/NaN.
        left: Number of standard deviations below the mean at which the lower bound sits.
        right: Number of standard deviations above the mean at which the upper bound sits.

    Returns:
        Tuple ``(outliers, cleaned)``. ``outliers`` holds the rows whose (possibly
        log-transformed) value lies strictly outside the bounds; ``cleaned`` holds
        the rows whose value lies within the bounds, the bounds themselves included.
        Rows for which the tested value is NaN appear in neither frame.
    """
    x = np.log(data[feature]) if log_scale else data[feature]

    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - left * sigma
    upper_bound = mu + right * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive on both ends: a value exactly on a bound is not an outlier, so it
    # has to be kept here, otherwise the row would vanish from both results.
    cleaned = data[x.between(lower_bound, upper_bound)]
    return outliers, cleaned

# Sigma-rule outlier split on SkinThickness with the default 3-sigma bounds.
outliers, cleaned = outliers_z_score_mod(data=diabetes_filled, feature='SkinThickness')
# (rows, columns) of the outlier subset.
outliers.shape

(4, 8)

In [64]:
def outliers_iqr_mod(data, feature, log_scale=False, left=1.5, right=1.5):
    """Split ``data`` into outliers and non-outliers of ``feature`` using the IQR rule.

    NOTE: this re-defines ``outliers_iqr_mod`` from an earlier cell, adding the
    ``log_scale`` parameter in third position.

    Args:
        data: DataFrame to split.
        feature: Name of the column whose values are tested.
        log_scale: When True, the test is run on ``np.log`` of the column instead
            of the raw values. ``np.log`` is applied as is, so zero or negative
            values produce ``-inf``/NaN.
        left: Number of IQRs below the first quartile at which the lower bound sits.
        right: Number of IQRs above the third quartile at which the upper bound sits.

    Returns:
        Tuple ``(outliers, cleaned)``. ``outliers`` holds the rows whose (possibly
        log-transformed) value lies strictly outside the bounds; ``cleaned`` holds
        the rows whose value lies within the bounds, the bounds themselves included.
        Rows for which the tested value is NaN appear in neither frame.
    """
    x = np.log(data[feature]) if log_scale else data[feature]

    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * left)
    upper_bound = quartile_3 + (iqr * right)
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive on both ends: a value exactly on a bound is not an outlier, so it
    # has to be kept here, otherwise the row would vanish from both results.
    cleaned = data[x.between(lower_bound, upper_bound)]
    return outliers, cleaned




# IQR outliers of DiabetesPedigreeFunction on the raw scale.
outliers, cleaned = outliers_iqr_mod(
    data=diabetes_filled,
    feature='DiabetesPedigreeFunction',
)
display(outliers.shape)

# Same split with the feature log-transformed before the bounds are computed.
outliers, cleaned = outliers_iqr_mod(
    data=diabetes_filled,
    feature='DiabetesPedigreeFunction',
    log_scale=True,
)
outliers.shape


(29, 8)

(0, 8)