In [1]:
# Linear Algebra
import numpy as np

# Data Processing
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Algorithm 

from sklearn.svm import SVC, LinearSVC

In [2]:
import io
kidney_data = pd.read_csv('dataset.csv')
print(kidney_data.head())

   id  age    bp     sg   al   su     rbc        pc         pcc          ba  \
0   0   48  80.0  1.020  1.0  0.0     NaN    normal  notpresent  notpresent   
1   1    7  50.0  1.020  4.0  0.0     NaN    normal  notpresent  notpresent   
2   2   62  80.0  1.010  2.0  3.0  normal    normal  notpresent  notpresent   
3   3   48  70.0  1.005  4.0  0.0  normal  abnormal     present  notpresent   
4   4   51  80.0  1.010  2.0  0.0  normal    normal  notpresent  notpresent   

   ...   rc  htn   dm  cad  appet   pe  ane classification        eGFR stages  
0  ...  5.2  yes  yes   no   good   no   no            ckd   62.001841      2  
1  ...  NaN   no   no   no   good   no   no            ckd  146.335929      1  
2  ...  NaN   no  yes   no   poor   no  yes            ckd   36.866485      3  
3  ...  3.9  yes   no   no   poor  yes  yes            ckd   16.394927      4  
4  ...  4.6   no   no   no   good   no   no            ckd   51.262906      3  

[5 rows x 28 columns]


In [3]:
kidney_data.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification', 'eGFR', 'stages'],
      dtype='object')

In [4]:
kidney_data.drop(['id','classification','eGFR'],axis=1,inplace=True)

In [5]:
kidney_data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'stages'],
      dtype='object')

In [6]:
# Understanding the Attributes

In [7]:
'''Understanding the Attributes
age - age of patient
bp - blood pressure level of patient
sg - Specific gravity is the ratio of the density of the substance to the density of a reference substance.
al - Albumin ( It is a type of protein the liver makes. It's one of the most abundant proteins in the blood. We need a proper balance of albumin to keep fluid from leaking out of blood vessels.)
su - Sugar level in the blood
rbc - It refers to Red blood cells in the blood.
pc - Pus is the result of the body's natural immune system automatically responding to an infection, usually caused by bacteria or fungi.
pcc - Pyuria is a condition that occurs when excess white blood cells, or pus, are present in the urine.Parasites, kidney stones, tumors and cysts, and interstitial cystitis can also lead to pyuria.
ba - Bacteria
bgr - The reference values for a "normal" random glucose test in an average adult are 79–140mg/dl (4.4–7.8 mmol/l), between 140-200mg/dl (7.8–11.1 mmol/l) is considered pre-diabetes, and ≥ 200 mg/dl is considered diabetes according to ADA guidelines
bu - Nitrogen in the blood that comes from urea (a substance formed by the breakdown of protein in the liver). The kidneys filter urea out of the blood and into the urine. A high level of urea nitrogen in the blood may be a sign of a kidney problem.
sc - Serum Creatinine ( Creatinine is a breakdown product of creatinine phosphate in muscle, and is usually produced ata)
sod - Sodium (sod in mEq/L)
pot - Potassium (pot in mEq/L)
hemo - Hemoglobin (hemo in gms)
pcv - Packed Cell Volume
wc - White Blood Cell Count (wc in cells/cumm)
rc - Red Blood Cell Count(rc in millions/cumm)
htn - Hypertension (It is also known as high blood pressure(HBP) is a long-term medical condition in which the blood pressure in the arteries is persistently elevated.)
dm - Diabetes Mellitus(A disease in which the body’s ability to produce or respond to the hormone insulin is impaired, resulting in abnormal metabolism of carbohydrates and elevated levels of glucose in the blood.)
cad - Coronary Artery Disease (It happens when the arteries that supply blood to heart muscle become hardened and narrowed.)
appet - Appetite (A natural desire to satisfy a bodily need, especially for food)
pe - Pedal Edema( It is the accumulation of fluid in the feet and lower legs. )
ane - Anemia (A condition in which there is a deficiency of red cells or of haemoglobin in the blood, resulting in pallor and weariness.)
classification- It classifies whether a person is suffering from chronic kidney disease or not.

'''

'Understanding the Attributes\nage - age of patient\nbp - blood pressure level of patient\nsg - Specific gravity is the ratio of the density of the substance to the density of a reference substance.\nal - Albumin ( It is a type of protein the liver makes. It\'s one of the most abundant proteins in the blood. We need a proper balance of albumin to keep fluid from leaking out of blood vessels.)\nsu - Sugar level in the blood\nrbc - It refers to Red blood cells in the blood.\npc - Pus is the result of the body\'s natural immune system automatically responding to an infection, usually caused by bacteria or fungi.\npcc - Pyuria is a condition that occurs when excess white blood cells, or pus, are present in the urine.Parasites, kidney stones, tumors and cysts, and interstitial cystitis can also lead to pyuria.\nba - Bacteria\nbgr - The reference values for a "normal" random glucose test in an average adult are 79–140mg/dl (4.4–7.8 mmol/l), between 140-200mg/dl (7.8–11.1 mmol/l) is considere

In [8]:
#Data Types of the data

kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    int64  
 1   bp      388 non-null    float64
 2   sg      353 non-null    float64
 3   al      354 non-null    float64
 4   su      351 non-null    float64
 5   rbc     248 non-null    object 
 6   pc      335 non-null    object 
 7   pcc     396 non-null    object 
 8   ba      396 non-null    object 
 9   bgr     356 non-null    float64
 10  bu      381 non-null    float64
 11  sc      400 non-null    float64
 12  sod     313 non-null    float64
 13  pot     312 non-null    float64
 14  hemo    348 non-null    float64
 15  pcv     330 non-null    object 
 16  wc      295 non-null    object 
 17  rc      270 non-null    object 
 18  htn     398 non-null    object 
 19  dm      398 non-null    object 
 20  cad     398 non-null    object 
 21  appet   399 non-null    object 
 22  pe

In [9]:
# Categorical and Numerical Variables
# Categorical variable : rbc, pc, pcc, ba, pcv, wc, rc, htn, dm, cad, appet, pe, ane and classification
# Numerical variable : age, bp , sg, al, su, bgr, bu, sc, sod, pot and hemo.

In [10]:
kidney_data.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,stages
count,400.0,388.0,353.0,354.0,351.0,356.0,381.0,400.0,313.0,312.0,348.0,400.0
mean,51.5625,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,2.997125,137.528754,4.627244,12.526437,2.7775
std,16.982996,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.628886,10.408752,3.193904,2.912587,1.386651
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,1.0
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3,2.0
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65,3.0
75%,64.0,80.0,1.02,2.0,0.0,163.0,66.0,2.725,142.0,4.9,15.0,4.0
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,5.0


In [11]:
# Count of null values
kidney_data.isnull().sum()

age         0
bp         12
sg         47
al         46
su         49
rbc       152
pc         65
pcc         4
ba          4
bgr        44
bu         19
sc          0
sod        87
pot        88
hemo       52
pcv        70
wc        105
rc        130
htn         2
dm          2
cad         2
appet       1
pe          1
ane         1
stages      0
dtype: int64

In [12]:
# Extracting null cols
null_cols = kidney_data.columns[kidney_data.isnull().any()]


In [13]:
null_cols

Index(['bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sod',
       'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe',
       'ane'],
      dtype='object')

In [14]:
# Replacing columns having null values using pad
for i in null_cols:
    kidney_data[i].fillna(method = 'pad',inplace = True)
    kidney_data[i].fillna(method = 'bfill',inplace = True)

In [15]:
kidney_data.isna().sum().sum()

0

In [16]:
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    int64  
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    float64
 4   su      400 non-null    float64
 5   rbc     400 non-null    object 
 6   pc      400 non-null    object 
 7   pcc     400 non-null    object 
 8   ba      400 non-null    object 
 9   bgr     400 non-null    float64
 10  bu      400 non-null    float64
 11  sc      400 non-null    float64
 12  sod     400 non-null    float64
 13  pot     400 non-null    float64
 14  hemo    400 non-null    float64
 15  pcv     400 non-null    object 
 16  wc      400 non-null    object 
 17  rc      400 non-null    object 
 18  htn     400 non-null    object 
 19  dm      400 non-null    object 
 20  cad     400 non-null    object 
 21  appet   400 non-null    object 
 22  pe

In [17]:
categorical_columns = ['rbc','pc','pcc','ba','pcv','wc','rc','htn','dm','cad','appet','pe','ane']

In [18]:
# Encoding Categorical Features

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for i in categorical_columns:
    kidney_data[i] = le.fit_transform(kidney_data[i])

In [19]:
kidney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    int64  
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    float64
 4   su      400 non-null    float64
 5   rbc     400 non-null    int32  
 6   pc      400 non-null    int32  
 7   pcc     400 non-null    int32  
 8   ba      400 non-null    int32  
 9   bgr     400 non-null    float64
 10  bu      400 non-null    float64
 11  sc      400 non-null    float64
 12  sod     400 non-null    float64
 13  pot     400 non-null    float64
 14  hemo    400 non-null    float64
 15  pcv     400 non-null    int32  
 16  wc      400 non-null    int32  
 17  rc      400 non-null    int32  
 18  htn     400 non-null    int32  
 19  dm      400 non-null    int32  
 20  cad     400 non-null    int32  
 21  appet   400 non-null    int32  
 22  pe

In [20]:
#Normalizing features
kidney_data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'stages'],
      dtype='object')

In [21]:
kidney_data.head(
)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,stages
0,48,80.0,1.02,1.0,0.0,1,1,0,0,121.0,...,32,72,31,1,4,1,0,0,0,2
1,7,50.0,1.02,4.0,0.0,1,1,0,0,121.0,...,26,56,31,0,3,1,0,0,0,1
2,62,80.0,1.01,2.0,3.0,1,1,0,0,423.0,...,19,70,31,0,4,1,1,0,1,3
3,48,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,20,62,18,1,3,1,1,1,1,4
4,51,80.0,1.01,2.0,0.0,1,1,0,0,106.0,...,23,68,25,0,3,1,0,0,0,3


In [22]:
#kidney_data.to_csv('newdataset.csv')

In [23]:
from sklearn.preprocessing import StandardScaler

normalizer = StandardScaler()
# for i in kidney_data.columns:
#     kidney_data[i] = pd.DataFrame(i:fit_transform(kidney_data[i]))
col = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane']

kidney_data[col] = normalizer.fit_transform(kidney_data[col].values)

In [24]:
kidney_data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,stages
0,-0.210031,0.27814,0.489013,-0.042868,-0.415581,0.612022,0.542677,-0.342518,-0.241249,-0.366277,...,0.691518,0.604252,0.734394,1.311903,1.209416,-0.276686,-0.507801,-0.484322,-0.420084,2
1,-2.627234,-1.90335,0.489013,2.193701,-0.415581,0.612022,0.542677,-0.342518,-0.241249,-0.366277,...,0.049738,0.00066,0.734394,-0.762252,-0.518321,-0.276686,-0.507801,-0.484322,-0.420084,1
2,0.615355,0.27814,-1.257462,0.702655,2.354961,0.612022,0.542677,-0.342518,-0.241249,3.486585,...,-0.699005,0.528803,0.734394,-0.762252,1.209416,-0.276686,1.969276,-0.484322,2.380476,3
3,-0.210031,-0.449023,-2.1307,2.193701,-0.415581,0.612022,-1.842717,2.919556,-0.241249,-0.417308,...,-0.592042,0.227007,-0.562329,1.311903,-0.518321,-0.276686,1.969276,2.064742,2.380476,4
4,-0.033163,0.27814,-1.257462,0.702655,-0.415581,0.612022,0.542677,-0.342518,-0.241249,-0.557644,...,-0.271152,0.453354,0.135907,-0.762252,-0.518321,-0.276686,-0.507801,-0.484322,-0.420084,3


In [25]:
#Model Building

kidney_data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'stages'],
      dtype='object')

In [26]:
x = kidney_data[['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane']]
y = kidney_data['stages']

In [27]:
print(len(x))
print(len(y))

400
400


In [28]:
# Splitting data
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)

In [29]:
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

320
80
320
80


In [30]:
y.value_counts()

2    93
1    93
3    90
5    66
4    58
Name: stages, dtype: int64

In [31]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [32]:
classifier = SVC(gamma = 0.01,C=100)
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
print("Accuracy score:",accuracy_score(y_test,y_pred))
print("Confusion Matrix :")
print(confusion_matrix(y_test,y_pred))

Accuracy score: 0.6625
Confusion Matrix :
[[ 8  6  1  0  0]
 [ 5 12  2  0  0]
 [ 2  2  8  1  0]
 [ 0  0  6 10  1]
 [ 0  1  0  0 15]]


In [33]:
classifier.get_params()

{'C': 100,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.01,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [34]:
print(y_pred)

[5 2 2 5 5 4 1 2 5 3 4 5 3 2 4 5 2 3 4 3 2 1 4 2 4 1 4 5 2 1 3 2 2 4 1 5 4
 2 3 2 1 1 5 1 2 4 3 2 3 3 2 3 3 3 1 2 2 3 3 5 5 2 3 1 5 1 5 3 5 2 2 1 3 5
 4 5 1 1 1 2]


In [35]:
classifier = SVC(gamma = 0.01,C=100,kernel='linear')
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
print("Accuracy score:",accuracy_score(y_test,y_pred))
print("Confusion Matrix :")
print(confusion_matrix(y_test,y_pred))

Accuracy score: 0.7875
Confusion Matrix :
[[13  2  0  0  0]
 [ 2 17  0  0  0]
 [ 0  3  9  0  1]
 [ 0  2  3 10  2]
 [ 0  0  1  1 14]]
