In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# Load the dataset from a CSV file (path is relative to the working directory)
# and preview its first rows.
diabetes = pd.read_csv('data/diabetes_data.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [23]:
# Build a copy of the frame without fully duplicated rows; `diabetes` itself is not rebound here.
diabetes_dedupped = diabetes.drop_duplicates()

In [24]:
# Report how many rows remain after deduplication.
print(f'Результирующее число записей: {diabetes_dedupped.shape[0]}')

Результирующее число записей: 768


In [25]:
# Rebind `diabetes` to its deduplicated version so that later cells work on unique rows.
# NOTE(review): this repeats the computation already stored in `diabetes_dedupped`.
diabetes = diabetes.drop_duplicates()

In [26]:
# Share above which a column is treated as carrying too little information
LOW_INFO_THRESHOLD = 0.95

low_information_cols = []

# Scan every column of the frame
for col in diabetes.columns:
    # Largest relative frequency of a single value in the column
    top_freq = diabetes[col].value_counts(normalize=True).max()
    # Number of distinct values relative to the number of non-null entries
    nunique_ratio = diabetes[col].nunique() / diabetes[col].count()

    is_low_information = False
    # Almost all entries share one value
    if top_freq > LOW_INFO_THRESHOLD:
        is_low_information = True
        print(f'{col}: {round(top_freq*100, 2)}% одинаковых значений')
    # Almost every entry is unique
    if nunique_ratio > LOW_INFO_THRESHOLD:
        is_low_information = True
        print(f'{col}: {round(nunique_ratio*100, 2)}% уникальных значений')

    # Register the column once, even when both criteria fire for it
    if is_low_information:
        low_information_cols.append(col)

Gender: 100.0% одинаковых значений


In [27]:
# Remove the columns collected in `low_information_cols`
diabetes = diabetes.drop(columns=low_information_cols)

In [28]:
# Preview the frame after the low-information columns were dropped.
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58,33,190,34.0,0.43,43,0
1,2,112,75,32,0,35.7,0.148,21,0
2,2,108,64,0,0,30.8,0.158,21,0
3,8,107,80,0,0,24.6,0.856,34,0
4,7,136,90,0,0,29.9,0.21,50,0


In [29]:
# In these columns a zero is treated as a missing measurement and is turned into NaN.
list_col = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Vectorised replacement over all listed columns at once (no per-element lambda)
diabetes[list_col] = diabetes[list_col].replace(0, np.nan)

In [30]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,190.0,34.0,0.43,43,0
1,2,112.0,75.0,32.0,,35.7,0.148,21,0
2,2,108.0,64.0,,,30.8,0.158,21,0
3,8,107.0,80.0,,,24.6,0.856,34,0
4,7,136.0,90.0,,,29.9,0.21,50,0


In [31]:
# Share of missing values in the Insulin column.
display(diabetes['Insulin'].isnull().mean())

0.4869791666666667

In [35]:
# Keep only the columns in which at least 70% of the rows hold a value.
# `thresh` is documented as an integer count of non-null values, so the share is
# rounded up; this selects exactly the same columns as the fractional bound.
thresh = int(np.ceil(diabetes.shape[0] * 0.7))
diabetes = diabetes.dropna(thresh=thresh, axis=1)
# Number of columns that survived the filter
display(diabetes.shape[1])

8

In [36]:
# A row is kept when it has at least (number of columns - 2) non-null values.
n_columns = len(diabetes.columns)
thresh2 = n_columns - 2
diabetes = diabetes.dropna(axis=0, thresh=thresh2)
# Number of rows that survived the filter
display(len(diabetes))

761

In [37]:
# Preview the frame after the column-wise and row-wise missing-value filters.
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,34.0,0.43,43,0
1,2,112.0,75.0,32.0,35.7,0.148,21,0
2,2,108.0,64.0,,30.8,0.158,21,0
3,8,107.0,80.0,,24.6,0.856,34,0
4,7,136.0,90.0,,29.9,0.21,50,0


In [38]:
# Columns whose remaining gaps are filled with the column mean
mean_fill_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
# Mapping "column name -> mean of that column", computed before any filling happens
values = {column: diabetes[column].mean() for column in mean_fill_columns}
diabetes = diabetes.fillna(values)

In [39]:
# Mean of the SkinThickness column after the gaps were filled.
display(diabetes['SkinThickness'].mean())

29.153419593345657

In [40]:
def outliers_iqr_mod(data, feature, left = 1.5, right = 1.5):
    """Split `data` into outliers and regular rows with the interquartile-range rule.

    Args:
        data: DataFrame to split.
        feature: name of the column the rule is applied to.
        left: number of IQRs subtracted from the first quartile for the lower bound.
        right: number of IQRs added to the third quartile for the upper bound.

    Returns:
        Tuple `(outliers, cleaned)`: rows strictly outside the bounds and rows
        inside the bounds (bounds included).
    """
    series = data[feature]
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)
    spread = q75 - q25
    low = q25 - (spread * left)
    high = q75 + (spread * right)
    # Both masks are built explicitly: a NaN entry satisfies neither of them
    is_outlier = series.lt(low) | series.gt(high)
    is_regular = series.ge(low) & series.le(high)
    return data[is_outlier], data[is_regular]

In [41]:
# Count the rows the IQR rule flags for SkinThickness
outliers, cleaned = outliers_iqr_mod(data=diabetes, feature='SkinThickness')
display(len(outliers))

87

In [42]:
def outliers_z_score(data, feature, log_scale=False):
    """Split `data` into outliers and regular rows with the three-sigma rule.

    Args:
        data: DataFrame to split.
        feature: name of the column the rule is applied to.
        log_scale: when True, the rule is applied to `log(value + 1)` instead of
            the raw values.

    Returns:
        Tuple `(outliers, cleaned)`: rows strictly outside `mean ± 3 * std` and
        rows inside that interval (bounds included).
    """
    values = np.log(data[feature]+1) if log_scale else data[feature]
    center, spread = values.mean(), values.std()
    low = center - 3 * spread
    high = center + 3 * spread
    # Both masks are built explicitly: a NaN entry satisfies neither of them
    is_outlier = values.lt(low) | values.gt(high)
    is_regular = values.ge(low) & values.le(high)
    return data[is_outlier], data[is_regular]

In [43]:
# Count the rows the three-sigma rule flags for SkinThickness
outliers, cleaned = outliers_z_score(data=diabetes, feature='SkinThickness')
display(len(outliers))

4

In [44]:
def outliers_iqr_mod(data, feature, log_scale = False, left = 1.5, right = 1.5):
    """Split `data` into outliers and regular rows with the interquartile-range rule.

    Args:
        data: DataFrame to split.
        feature: name of the column the rule is applied to.
        log_scale: when True, the rule is applied to the natural logarithm of the
            values (no offset is added, so every value must be strictly positive).
        left: number of IQRs subtracted from the first quartile for the lower bound.
        right: number of IQRs added to the third quartile for the upper bound.

    Returns:
        Tuple `(outliers, cleaned)`: rows strictly outside the bounds and rows
        inside the bounds (bounds included). Rows where `feature` is NaN satisfy
        neither condition and therefore appear in neither frame.

    Raises:
        ValueError: if `log_scale` is True and the column holds zero or negative values.
    """
    x = data[feature]
    if log_scale:
        # log(0) is -inf and log(negative) is NaN; either would silently distort
        # the quartiles and the bounds, so refuse such input explicitly.
        if (x <= 0).any():
            raise ValueError(
                f"log_scale=True requires strictly positive values in '{feature}'"
            )
        x = np.log(x)
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * left)
    upper_bound = quartile_3 + (iqr * right)
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

In [45]:
# Compare how many rows the IQR rule flags on the raw scale and on the log scale
target_feature = 'DiabetesPedigreeFunction'
outliers1, cleaned1 = outliers_iqr_mod(diabetes, target_feature)
outliers2, cleaned2 = outliers_iqr_mod(diabetes, target_feature, log_scale=True)
display(len(outliers1) - len(outliers2))

29