In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the diabetes dataset from the CSV file into a DataFrame
diabetes = pd.read_csv('data/diabetes_data.csv')
# Preview the first rows of the raw table
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [2]:
# Dimensions of the raw table as (rows, columns)
diabetes.shape

(778, 10)

In [3]:
# A record counts as a duplicate only when it repeats an earlier one in every column
dupl_columns = diabetes.columns.tolist()

# Boolean flag per row: True for each repeated occurrence after the first
mask = diabetes.duplicated(subset=dupl_columns)
diabetes_duplicates = diabetes.loc[mask]
print(f'Число найденных дубликатов: {len(diabetes_duplicates)}')

Число найденных дубликатов: 10


In [4]:
# Keep the first occurrence of every fully duplicated record and discard the rest
diabetes = diabetes.drop_duplicates(subset=dupl_columns, keep='first')
print(f'Результирующее число записей: {len(diabetes)}')

Результирующее число записей: 768


In [5]:
# Features that carry little information: either a single value dominates
# the column or almost every value in it is unique
low_information_cols = []

for col in diabetes.columns:
    column = diabetes[col]
    # relative frequency of the most common value in the feature
    top_freq = column.value_counts(normalize=True).max()
    # number of distinct values divided by the number of non-missing values
    nunique_ratio = column.nunique() / column.count()
    # the two checks are independent: each one reports and records the column on its own
    if top_freq > 0.95:
        low_information_cols.append(col)
        print(f'{col}: {round(top_freq*100, 2)}% одинаковых значений')
    if nunique_ratio > 0.95:
        low_information_cols.append(col)
        print(f'{col}: {round(nunique_ratio*100, 2)}% уникальных значений')

Gender: 100.0% одинаковых значений


In [6]:
# Remove the features flagged as low-information
diabetes = diabetes.drop(columns=low_information_cols)
print(f'Результирующее число признаков: {len(diabetes.columns)}')

Результирующее число признаков: 9


In [7]:
# Dimensions of the table after dropping duplicates and low-information features
diabetes.shape

(768, 9)

In [8]:
# Task: replace every value equal to 0 in the Glucose, BloodPressure, SkinThickness,
# Insulin and BMI columns with the missing-value marker np.nan from numpy.
nan_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# One frame-level replacement covers all listed columns at once
diabetes[nan_cols] = diabetes[nan_cols].replace(0, np.nan)

diabetes.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,190.0,34.0,0.43,43,0
1,2,112.0,75.0,32.0,,35.7,0.148,21,0
2,2,108.0,64.0,,,30.8,0.158,21,0
3,8,107.0,80.0,,,24.6,0.856,34,0
4,7,136.0,90.0,,,29.9,0.21,50,0
5,6,103.0,72.0,32.0,190.0,37.7,0.324,55,0
6,1,71.0,48.0,18.0,76.0,20.4,0.323,22,0
7,0,117.0,,,,33.8,0.932,44,0
8,4,154.0,72.0,29.0,126.0,31.3,0.338,37,0
9,5,147.0,78.0,,,33.7,0.218,65,0


In [9]:
# Task: what share of the Insulin column is missing? Round the answer to hundredths.
is_null = diabetes['Insulin'].isna()
# The mean of a boolean flag equals the fraction of True values
print(round(is_null.mean(), 2))

0.49


In [10]:
# Minimum number of non-missing values a column needs to survive: 70% of the row count
thresh = 0.7 * len(diabetes)
# Drop the columns in which more than 30% of the values are missing
diabetes = diabetes.dropna(axis='columns', thresh=thresh)
# Show the resulting table dimensions
diabetes.shape

(768, 8)

In [11]:
# Preview the first 15 rows after the sparse columns were removed
diabetes.head(15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,34.0,0.43,43,0
1,2,112.0,75.0,32.0,35.7,0.148,21,0
2,2,108.0,64.0,,30.8,0.158,21,0
3,8,107.0,80.0,,24.6,0.856,34,0
4,7,136.0,90.0,,29.9,0.21,50,0
5,6,103.0,72.0,32.0,37.7,0.324,55,0
6,1,71.0,48.0,18.0,20.4,0.323,22,0
7,0,117.0,,,33.8,0.932,44,0
8,4,154.0,72.0,29.0,31.3,0.338,37,0
9,5,147.0,78.0,,33.7,0.218,65,0


In [12]:
# Number of features currently in the table
m = len(diabetes.columns)
print(m)
# A row is kept only if it has at least m-2 non-missing values,
# i.e. rows with more than two gaps are removed
diabetes = diabetes.dropna(axis='index', thresh=m - 2)
print(len(diabetes))

8
761


In [13]:
# Per-column fill values: each listed feature gets its own median
values = {
    col: diabetes[col].median()
    for col in ('Glucose', 'BloodPressure', 'SkinThickness', 'BMI')
}
# Fill the remaining gaps column by column using the mapping above
diabetes = diabetes.fillna(value=values)
diabetes.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,34.0,0.43,43,0
1,2,112.0,75.0,32.0,35.7,0.148,21,0
2,2,108.0,64.0,29.0,30.8,0.158,21,0
3,8,107.0,80.0,29.0,24.6,0.856,34,0
4,7,136.0,90.0,29.0,29.9,0.21,50,0
5,6,103.0,72.0,32.0,37.7,0.324,55,0
6,1,71.0,48.0,18.0,20.4,0.323,22,0
7,0,117.0,72.0,29.0,33.8,0.932,44,0
8,4,154.0,72.0,29.0,31.3,0.338,37,0
9,5,147.0,78.0,29.0,33.7,0.218,65,0


In [14]:
# Mean of SkinThickness after the median fill, rounded to one decimal place
print(round(diabetes['SkinThickness'].mean(), 1))

29.1


In [15]:
def outliers_iqr(data, feature):
    """Split ``data`` into outliers and regular rows using Tukey's 1.5*IQR fences.

    Args:
        data: DataFrame to split.
        feature: Name of the column the fences are computed on.

    Returns:
        A tuple ``(outliers, cleaned)`` of DataFrames: rows whose ``feature``
        value lies strictly outside the fences, and rows whose value lies
        within them (fences included). Rows where ``feature`` is missing fail
        every comparison and therefore appear in neither frame.
    """
    series = data[feature]
    q1, q3 = series.quantile([0.25, 0.75])
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    is_outlier = series.lt(low_fence) | series.gt(high_fence)
    is_regular = series.ge(low_fence) & series.le(high_fence)
    return data[is_outlier], data[is_regular]

In [16]:
# IQR-based outlier search on SkinThickness; report how many rows were flagged
outliers, cleaned = outliers_iqr(data=diabetes, feature='SkinThickness')
print(len(outliers))

87


In [17]:
def outliers_z_score(data, feature, log_scale=False, left=3, right=3):
    """Split ``data`` into outliers and regular rows with the sigma (z-score) rule.

    The bounds are ``mean - left * std`` and ``mean + right * std``, computed on
    the feature itself or, when ``log_scale`` is set, on ``log(feature + 1)``.

    Args:
        data: DataFrame to split.
        feature: Name of the column the bounds are computed on.
        log_scale: If True, the bounds and comparisons use ``np.log(x + 1)``
            instead of the raw values; the returned rows are still taken
            from ``data`` unchanged.
        left: Number of standard deviations below the mean for the lower
            bound. Defaults to 3 (the classic three-sigma rule).
        right: Number of standard deviations above the mean for the upper
            bound. Defaults to 3.

    Returns:
        A tuple ``(outliers, cleaned)`` of DataFrames: rows strictly outside
        the bounds, and rows within them (bounds included). Rows where the
        (possibly transformed) value is missing appear in neither frame.
    """
    if log_scale:
        # the +1 shift keeps zero values finite under the logarithm
        x = np.log(data[feature] + 1)
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()  # pandas default: sample standard deviation (ddof=1)
    lower_bound = mu - left * sigma
    upper_bound = mu + right * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

In [18]:
# Three-sigma outlier search on the raw (non-log) SkinThickness values
outliers, cleaned = outliers_z_score(data=diabetes, feature='SkinThickness', log_scale=False)
print(len(outliers))

4


In [19]:
def outliers_iqr_mod_log(data, feature, left, right, log_scale=False):
    """Split ``data`` into outliers and regular rows using adjustable IQR fences.

    Args:
        data: DataFrame to split.
        feature: Name of the column the fences are computed on.
        left: Multiple of the IQR subtracted from the first quartile.
        right: Multiple of the IQR added to the third quartile.
        log_scale: If True, quartiles and comparisons use ``np.log`` of the
            feature (no shift is applied, so this is presumably intended for
            strictly positive features -- verify before using it on columns
            that contain zeros).

    Returns:
        A tuple ``(outliers, cleaned)`` of DataFrames taken from ``data``:
        rows strictly outside the fences, and rows within them (fences
        included). Rows with a missing value appear in neither frame.
    """
    series = np.log(data[feature]) if log_scale else data[feature]
    q1, q3 = series.quantile([0.25, 0.75])
    spread = q3 - q1
    low_fence = q1 - spread * left
    high_fence = q3 + spread * right
    is_outlier = series.lt(low_fence) | series.gt(high_fence)
    is_regular = series.ge(low_fence) & series.le(high_fence)
    return data[is_outlier], data[is_regular]

In [20]:
# IQR outliers of DiabetesPedigreeFunction on the original scale
outliers_not_log, cleaned = outliers_iqr(data=diabetes, feature='DiabetesPedigreeFunction')
print(len(outliers_not_log))

# Same 1.5*IQR fences, but computed on the log-transformed feature
outliers_log, cleaned = outliers_iqr_mod_log(
    diabetes, 'DiabetesPedigreeFunction', left=1.5, right=1.5, log_scale=True
)
print(len(outliers_log))

29
0
