>Competition: https://www.kaggle.com/competitions/diabetes-classification

In [76]:
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

data.fillna(0, inplace=True)


In [77]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   p_id                   614 non-null    int64  
 1   no_times_pregnant      614 non-null    int64  
 2   glucose_concentration  614 non-null    int64  
 3   blood_pressure         614 non-null    int64  
 4   skin_fold_thickness    614 non-null    int64  
 5   serum_insulin          614 non-null    int64  
 6   bmi                    614 non-null    float64
 7   diabetes pedigree      614 non-null    float64
 8   age                    614 non-null    int64  
 9   diabetes               614 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 48.1 KB


In [78]:
data.diabetes.value_counts()

diabetes
0    400
1    214
Name: count, dtype: int64

In [79]:
data.columns

Index(['p_id', 'no_times_pregnant', 'glucose_concentration', 'blood_pressure',
       'skin_fold_thickness', 'serum_insulin', 'bmi', 'diabetes pedigree',
       'age', 'diabetes'],
      dtype='object')

Feature Selection

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
data.drop('p_id', axis=1, inplace=True)

X = data.drop('diabetes', axis=1)
y = data['diabetes']

classifier = RandomForestClassifier(n_estimators=1000, random_state=7, n_jobs=-1)
classifier.fit(X, y)

print(f"Importance:\n{data.columns}\n{classifier.feature_importances_}\n")

sfm = SelectFromModel(classifier, threshold=0.1)
sfm.fit(X, y)

X_important = sfm.transform(X)

feature_names = X.columns[sfm.get_support()]

print('Selected features:')
print(feature_names)

Importance:
Index(['no_times_pregnant', 'glucose_concentration', 'blood_pressure',
       'skin_fold_thickness', 'serum_insulin', 'bmi', 'diabetes pedigree',
       'age', 'diabetes'],
      dtype='object')
[0.0826375  0.27092473 0.08921212 0.06663935 0.06824126 0.16278637
 0.12889934 0.13065933]

Selected features:
Index(['glucose_concentration', 'bmi', 'diabetes pedigree', 'age'], dtype='object')


classification(Random Forest) and check accuracy using train-test split

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_important, y, test_size=0.2, random_state=7)

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

kf = KFold(n_splits=10, random_state=7, shuffle=True)
scores = cross_val_score(classifier, X_train, y_train, cv=kf, scoring='accuracy')
print(f"Accuracy for each fold: {scores}")
print(f"average accuracy: {scores.mean()}")

Accuracy: 0.7804878048780488
Accuracy for each fold: [0.74       0.81632653 0.67346939 0.7755102  0.71428571 0.81632653
 0.75510204 0.67346939 0.71428571 0.73469388]
average accuracy: 0.7413469387755103


Descison Tree Classifier

In [82]:
from sklearn.tree import DecisionTreeClassifier

d_classifier = DecisionTreeClassifier(random_state=7, )
d_classifier.fit(X_train, y_train)
y_pred = d_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

k_scores = cross_val_score(d_classifier, X_train, y_train, cv=kf, scoring='accuracy')
print(f"Accuracy for each fold: {k_scores}")
print(f"average accuracy: {k_scores.mean()}")

Accuracy: 0.6910569105691057
Accuracy for each fold: [0.7        0.79591837 0.67346939 0.75510204 0.63265306 0.7755102
 0.75510204 0.67346939 0.63265306 0.6122449 ]
average accuracy: 0.7006122448979591


KNN

In [83]:
from sklearn.neighbors import KNeighborsClassifier

k_classifier = KNeighborsClassifier(n_neighbors=9)

k_classifier.fit(X_train, y_train)

y_pred = k_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

k_scores = cross_val_score(k_classifier, X_train, y_train, cv=kf, scoring='accuracy')
print(f"Accuracy for each fold: {k_scores}")
print(f"average accuracy: {k_scores.mean()}")

Accuracy: 0.8292682926829268
Accuracy for each fold: [0.78       0.75510204 0.57142857 0.71428571 0.75510204 0.79591837
 0.67346939 0.73469388 0.69387755 0.67346939]
average accuracy: 0.714734693877551


Gradient Boost

In [84]:
from sklearn.ensemble import GradientBoostingClassifier

g_classifier = GradientBoostingClassifier(n_estimators=180, learning_rate=0.01, max_depth=1, random_state=7)
g_classifier.fit(X_train, y_train)

y_pred = g_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

k_scores = cross_val_score(g_classifier, X_train, y_train, cv=kf, scoring='accuracy')
print(f"Accuracy for each fold: {k_scores}")
print(f"average accuracy: {k_scores.mean()}")

Accuracy: 0.7804878048780488
Accuracy for each fold: [0.7        0.81632653 0.65306122 0.7755102  0.7755102  0.85714286
 0.71428571 0.73469388 0.65306122 0.71428571]
average accuracy: 0.7393877551020408


generate result.csv

In [85]:
test_data = test[feature_names]

predictions = g_classifier.predict(test_data)

result = pd.DataFrame({
    'p_id': test['p_id'].values,
    'diabetes': predictions
})

result.to_csv('result.csv', index=False)

