### Early stage diabetes risk prediction dataset.

In [1]:
import pandas as pd
import numpy as np

random_state = 20

df = pd.read_csv('diabetes_data_upload.csv')
df.head(3)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive


### Feature Engineering

In [2]:
# Preprocess Y
df.replace('Positive', 1, inplace=True)
df.replace('Negative', 0, inplace=True)

### Analysis

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

In [4]:
df.nunique()

Age                   51
Gender                 2
Polyuria               2
Polydipsia             2
sudden weight loss     2
weakness               2
Polyphagia             2
Genital thrush         2
visual blurring        2
Itching                2
Irritability           2
delayed healing        2
partial paresis        2
muscle stiffness       2
Alopecia               2
Obesity                2
class                  2
dtype: int64

In [5]:
target = 'class'
features = df.drop(columns=[target]).columns.tolist()

target, features

('class',
 ['Age',
  'Gender',
  'Polyuria',
  'Polydipsia',
  'sudden weight loss',
  'weakness',
  'Polyphagia',
  'Genital thrush',
  'visual blurring',
  'Itching',
  'Irritability',
  'delayed healing',
  'partial paresis',
  'muscle stiffness',
  'Alopecia',
  'Obesity'])

In [6]:
df[target].value_counts()

1    320
0    200
Name: class, dtype: int64

### Pipeline

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class TruthTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, zero, one):
        self.zero = zero
        self.one = one

    def fit(self, X, *_):
        return self

    def transform(self, X, *_):
        X.replace(self.one, 1, inplace=True)
        X.replace(self.zero, 0, inplace=True)
        return X

In [8]:
def columns_with_word(df, words):
    columns = df[df.isin(words)].any()
    columns = columns[columns == True]
    return columns.index.to_list()

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

truth_transformer = TruthTransformer(zero='No', one='Yes')
truth_columns = columns_with_word(df, ['Yes', 'No'])

gender_transformer = TruthTransformer(zero='Female', one='Male')
gender_columns = columns_with_word(df, ['Female', 'Male'])

preprocessor = ColumnTransformer(
    transformers=[
        ('truth', truth_transformer, truth_columns),
        ('gender', gender_transformer, gender_columns)
    ])

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=random_state)

In [11]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)
                          ])

### Train

In [12]:
from sklearn.model_selection import train_test_split

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=random_state)

In [13]:
pipeline.fit(X_train, y_train)
y_predict = pipeline.predict(X_test)

### Evaluate

In [14]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

In [15]:
evaluate_results(y_test, y_predict)

Classification results:
f1: 97.80%
roc: 97.21%
recall: 97.37%
precision: 98.23%


### Сохранение

In [21]:
import dill

with open("source/model/pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)