<p align="center">
  <img src="https://raw.githubusercontent.com/pachecowillians/svg-icons/d8778eb9218e8548afced046fd447ca4aaa5ff32/img/wine.svg" alt="Wine icon" width="100px">
</p>

In this notebook, I delve into my first experience with classification algorithms, exploring the field of machine learning. Through the study, I aim to gain insights into how these algorithms work and their applications, analyzing datasets and evaluating their performance. Join me on this exciting journey as we unlock the potential of machine learning and classification techniques.

In [14]:
# Importing the pandas library
import pandas as pd

In [15]:
# Read the diabetes prediction CSV into a DataFrame
ds = pd.read_csv(filepath_or_buffer='dataset/diabetes_prediction_dataset.csv')

# Preview the first five rows
ds.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [16]:
# Collect each distinct 'gender' label, in order of first appearance
unique_values = ds.loc[:, 'gender'].unique()

In [17]:
# Pair every distinct label with its position (0, 1, 2, ...) to get a label -> integer code lookup
mapping_gender = dict(zip(unique_values, range(len(unique_values))))

In [18]:
# Swap each 'gender' label for its integer code from mapping_gender, then preview the frame
ds['gender'] = ds.loc[:, 'gender'].map(mapping_gender)
ds.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,never,25.19,6.6,140,0
1,0,54.0,0,0,No Info,27.32,6.6,80,0
2,1,28.0,0,0,never,27.32,5.7,158,0
3,0,36.0,0,0,current,23.45,5.0,155,0
4,1,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Collect each distinct 'smoking_history' label, in order of first appearance
unique_values = ds.loc[:, 'smoking_history'].unique()

In [None]:
# Pair every distinct label with its position (0, 1, 2, ...) to get a label -> integer code lookup
mapping_smoking = dict(zip(unique_values, range(len(unique_values))))

In [None]:
# Swap each 'smoking_history' label for its integer code from mapping_smoking, then preview the frame
ds['smoking_history'] = ds.loc[:, 'smoking_history'].map(mapping_smoking)
ds.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,never,25.19,6.6,140,0
1,0,54.0,0,0,No Info,27.32,6.6,80,0
2,1,28.0,0,0,never,27.32,5.7,158,0
3,0,36.0,0,0,current,23.45,5.0,155,0
4,1,76.0,1,1,current,20.14,4.8,155,0


In [20]:
# Separate the frame into the feature columns (predict) and the target column (goal)
predict = ds.drop(columns=['diabetes'])
goal = ds.loc[:, 'diabetes']

In [21]:
# Importing the train_test_split function
from sklearn.model_selection import train_test_split

In [22]:
# Splitting the dataset into training (70%) and testing (30%) sets.
# random_state pins the shuffle so the same rows land in each set on every
# run; without it the split (and any score computed from it) changes each
# time the notebook is re-executed.
x_train, x_test, y_train, y_test = train_test_split(
    predict, goal, test_size=0.3, random_state=42
)

In [23]:
# Show (rows, columns) for the full frame, the training features and the test features
print(*(frame.shape for frame in (ds, x_train, x_test)))

(100000, 8) (70000, 7) (30000, 7)


In [24]:
# Importing the ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [25]:
# Creating an instance of the ExtraTreesClassifier model.
# The estimator is randomized, so random_state is fixed to make the fitted
# forest repeatable from run to run for the same training data.
model = ExtraTreesClassifier(random_state=42)

# Training the model. fit() returns the estimator itself, so `training`
# is simply another reference to `model`.
training = model.fit(x_train, y_train)

In [26]:
# Score the fitted model on the held-out test set (mean accuracy)
result = model.score(x_test, y_test)

# Report the score
print(f"Accuracy: {result}")

Accuracy: 0.9652666666666667


In [27]:
# Importing the random module
import random

In [28]:
# Generating a random start position for a 10-row window into the test set.
# random.randint includes BOTH endpoints, so the largest valid start is
# len(x_test) - 10. The previous `len(x_test - 10)` subtracted 10 from every
# cell of the DataFrame and then took its length, which is just len(x_test),
# so the window could come back with fewer than 10 rows or none at all.
# max(..., 0) keeps the bound valid if the test set has fewer than 10 rows.
interval_begin = random.randint(0, max(len(x_test) - 10, 0))

In [29]:
# Take the 10 consecutive test rows (by position) starting at interval_begin
test = x_test.iloc[interval_begin:interval_begin + 10]

In [30]:
# Show the true labels for the same 10 positions of the test set
y_test.iloc[interval_begin:interval_begin + 10]

4602     0
54708    0
43359    0
69377    0
57662    0
35376    1
99913    0
26664    0
9039     0
68301    0
Name: diabetes, dtype: int64

In [31]:
# Ask the trained model for its labels on the same 10-row window of test features
predictions = model.predict(
    x_test.iloc[interval_begin:interval_begin + 10]
)

# Show the predicted labels
print(predictions)

[0 0 0 0 0 1 0 0 0 0]


In [32]:
# Element-wise check: True wherever the predicted label equals the true label
print(
    y_test.iloc[interval_begin:interval_begin + 10] == predictions
)

4602     True
54708    True
43359    True
69377    True
57662    True
35376    True
99913    True
26664    True
9039     True
68301    True
Name: diabetes, dtype: bool
