<a href="https://colab.research.google.com/github/meisam210905/Sani/blob/main/pima_indians_diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing required libraries**

In [1]:
import numpy as np
import pandas as pd

**Importing data from Google Drive**

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
data = pd.read_csv('/content/gdrive/My Drive/Datasets/pima-indians-diabetes.csv')

**Head of data**

In [4]:
data.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


**Importing names of column**

In [5]:
data = pd.read_csv('/content/gdrive/My Drive/Datasets/pima-indians-diabetes.csv', names = ['preg', 'gluc', 'pres', 'skin', 'test', 'bmi', 'pedi', 'age', 'class'])
data.head()

Unnamed: 0,preg,gluc,pres,skin,test,bmi,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


**Look for missing data**

In [6]:
data.isnull().sum()

preg     0
gluc     0
pres     0
skin     0
test     0
bmi      0
pedi     0
age      0
class    0
dtype: int64

**Look for NaN data**

In [7]:
data.isna().sum()

preg     0
gluc     0
pres     0
skin     0
test     0
bmi      0
pedi     0
age      0
class    0
dtype: int64

# **But with some accuracy, we encounter data, such as skin thickness, whose value cannot be zero.**

In [8]:
skines = data[data['skin'] == 0]
skines

Unnamed: 0,preg,gluc,pres,skin,test,bmi,pedi,age,class
2,8,183,64,0,0,23.3,0.672,32,1
5,5,116,74,0,0,25.6,0.201,30,0
7,10,115,0,0,0,35.3,0.134,29,0
9,8,125,96,0,0,0.0,0.232,54,1
10,4,110,92,0,0,37.6,0.191,30,0
...,...,...,...,...,...,...,...,...,...
757,0,123,72,0,0,36.3,0.258,52,1
758,1,106,76,0,0,37.5,0.197,26,0
759,6,190,92,0,0,35.5,0.278,66,1
762,9,89,62,0,0,22.5,0.142,33,0


In [9]:
data['skin'].mean()

20.536458333333332

**We replace the data value of the skin column that has zero value with the Mean.**

**Since the zero data is also calculated when acquiring the Mean value, we recalculate the Mean value without taking the zero data into calculate.**

In [10]:
data[data['skin'] != 0]['skin'].mean()

29.153419593345657

 ""20.536458333333332""
"The value with considering zero data"

**Replace the mean value of skin with zero data.**

In [11]:
skin_mean = data[data['skin'] != 0]['skin'].mean()
data.replace({'skin' : 0}, skin_mean, inplace=True)

In [12]:
data

Unnamed: 0,preg,gluc,pres,skin,test,bmi,pedi,age,class
0,6,148,72,35.00000,0,33.6,0.627,50,1
1,1,85,66,29.00000,0,26.6,0.351,31,0
2,8,183,64,29.15342,0,23.3,0.672,32,1
3,1,89,66,23.00000,94,28.1,0.167,21,0
4,0,137,40,35.00000,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48.00000,180,32.9,0.171,63,0
764,2,122,70,27.00000,0,36.8,0.340,27,0
765,5,121,72,23.00000,112,26.2,0.245,30,0
766,1,126,60,29.15342,0,30.1,0.349,47,1


**Similarly, we perform on the test column.**

In [13]:
test_mean = data[data['test'] != 0]['test'].mean()
test_mean

155.5482233502538

In [14]:
data.replace({'test' : 0}, test_mean, inplace=True)
data

Unnamed: 0,preg,gluc,pres,skin,test,bmi,pedi,age,class
0,6,148,72,35.00000,155.548223,33.6,0.627,50,1
1,1,85,66,29.00000,155.548223,26.6,0.351,31,0
2,8,183,64,29.15342,155.548223,23.3,0.672,32,1
3,1,89,66,23.00000,94.000000,28.1,0.167,21,0
4,0,137,40,35.00000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48.00000,180.000000,32.9,0.171,63,0
764,2,122,70,27.00000,155.548223,36.8,0.340,27,0
765,5,121,72,23.00000,112.000000,26.2,0.245,30,0
766,1,126,60,29.15342,155.548223,30.1,0.349,47,1


**Now we'll use the "K nearest neighbour" method to value the skin and test columns, which has zero values based on Euclidean space distance and with rows whose values are not zero.**

In [15]:
data = pd.read_csv('/content/gdrive/My Drive/Datasets/pima-indians-diabetes.csv', names = ['preg', 'gluc', 'pres', 'skin', 'test', 'bmi', 'pedi', 'age', 'class'])
data.head()

Unnamed: 0,preg,gluc,pres,skin,test,bmi,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


**Now we should change 0 values to Np.NAN to use the "K nearest neighbor"**

In [16]:
data.replace({'test' : 0, 'skin' : 0}, np.nan, inplace=True)

In [17]:
data

Unnamed: 0,preg,gluc,pres,skin,test,bmi,pedi,age,class
0,6,148,72,35.0,,33.6,0.627,50,1
1,1,85,66,29.0,,26.6,0.351,31,0
2,8,183,64,,,23.3,0.672,32,1
3,1,89,66,23.0,94.0,28.1,0.167,21,0
4,0,137,40,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48.0,180.0,32.9,0.171,63,0
764,2,122,70,27.0,,36.8,0.340,27,0
765,5,121,72,23.0,112.0,26.2,0.245,30,0
766,1,126,60,,,30.1,0.349,47,1


In [18]:
from sklearn.impute import KNNImputer

In [19]:
imputer = KNNImputer(n_neighbors=3)
data = imputer.fit_transform(data)
data

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [20]:
import sys
np.set_printoptions(threshold=sys.maxsize)

In [21]:
print(data)

[[6.00000000e+00 1.48000000e+02 7.20000000e+01 3.50000000e+01
  1.25333333e+02 3.36000000e+01 6.27000000e-01 5.00000000e+01
  1.00000000e+00]
 [1.00000000e+00 8.50000000e+01 6.60000000e+01 2.90000000e+01
  6.66666667e+01 2.66000000e+01 3.51000000e-01 3.10000000e+01
  0.00000000e+00]
 [8.00000000e+00 1.83000000e+02 6.40000000e+01 3.00000000e+01
  1.95000000e+02 2.33000000e+01 6.72000000e-01 3.20000000e+01
  1.00000000e+00]
 [1.00000000e+00 8.90000000e+01 6.60000000e+01 2.30000000e+01
  9.40000000e+01 2.81000000e+01 1.67000000e-01 2.10000000e+01
  0.00000000e+00]
 [0.00000000e+00 1.37000000e+02 4.00000000e+01 3.50000000e+01
  1.68000000e+02 4.31000000e+01 2.28800000e+00 3.30000000e+01
  1.00000000e+00]
 [5.00000000e+00 1.16000000e+02 7.40000000e+01 1.83333333e+01
  1.09000000e+02 2.56000000e+01 2.01000000e-01 3.00000000e+01
  0.00000000e+00]
 [3.00000000e+00 7.80000000e+01 5.00000000e+01 3.20000000e+01
  8.80000000e+01 3.10000000e+01 2.48000000e-01 2.60000000e+01
  1.00000000e+00]
 [1.00

To increase the accuracy of machine learning and data analysis, it is better to separate the last column, which is the dependent variable, from the independent variables.

**Now we are going to use "MinMax Scaler" method.**

In [22]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
data = scaler.fit_transform(data)
data

array([[0.35294118, 0.74371859, 0.59016393, 0.30434783, 0.1338141 ,
        0.50074516, 0.23441503, 0.48333333, 1.        ],
       [0.05882353, 0.42713568, 0.54098361, 0.23913043, 0.06330128,
        0.39642325, 0.11656704, 0.16666667, 0.        ],
       [0.47058824, 0.91959799, 0.52459016, 0.25      , 0.21754808,
        0.34724292, 0.25362938, 0.18333333, 1.        ],
       [0.05882353, 0.44723618, 0.54098361, 0.17391304, 0.09615385,
        0.41877794, 0.03800171, 0.        , 0.        ],
       [0.        , 0.68844221, 0.32786885, 0.30434783, 0.18509615,
        0.64232489, 0.94363792, 0.2       , 1.        ],
       [0.29411765, 0.58291457, 0.60655738, 0.12318841, 0.11418269,
        0.38152012, 0.05251921, 0.15      , 0.        ],
       [0.17647059, 0.3919598 , 0.40983607, 0.27173913, 0.08894231,
        0.46199702, 0.07258753, 0.08333333, 1.        ],
       [0.58823529, 0.57788945, 0.        , 0.25362319, 0.07451923,
        0.52608048, 0.02391119, 0.13333333, 0.        ],


**and we can use "Standard Score" method.**

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data = scaler.fit_transform(data)
data

array([[ 6.39947260e-01,  8.48323795e-01,  1.49640753e-01,
         6.25703208e-01, -2.69421175e-01,  2.04012771e-01,
         4.68491977e-01,  1.42599540e+00,  1.36589591e+00],
       [-8.44885053e-01, -1.12339636e+00, -1.60545747e-01,
        -4.42144760e-03, -8.50042043e-01, -6.84421946e-01,
        -3.65060778e-01, -1.90671905e-01, -7.32120209e-01],
       [ 1.23388019e+00,  1.94372388e+00, -2.63941247e-01,
         1.00599328e-01,  4.20066105e-01, -1.10325546e+00,
         6.04397318e-01, -1.05584152e-01,  1.36589591e+00],
       [-8.44885053e-01, -9.98207780e-01, -1.60545747e-01,
        -6.34546104e-01, -5.79525502e-01, -4.94043078e-01,
        -9.20762614e-01, -1.04154944e+00, -7.32120209e-01],
       [-1.14185152e+00,  5.04055196e-01, -1.50468724e+00,
         6.25703208e-01,  1.52848546e-01,  1.40974560e+00,
         5.48490910e+00, -2.04963989e-02,  1.36589591e+00],
       [ 3.42980797e-01, -1.53184856e-01,  2.53036252e-01,
        -1.12464306e+00, -4.31071303e-01, -8.113411

# And since I'm sure everyone is familiar with coding, it is the responsibility of the data analyst to try various methodologies in order to achieve a more accurate analysis.