In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
import random
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df=pd.read_csv("kidney_disease.csv")

In [3]:
df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
id                400 non-null int64
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
rbc               248 non-null object
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
sod               313 non-null float64
pot               312 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
wc                295 non-null object
rc                270 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe         

In [7]:
df.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [8]:
df[['htn','dm','cad','pe','ane']] = df[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
df[['rbc','pc']] = df[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
df[['pcc','ba']] = df[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
df[['appet']] = df[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})
df['classification'] = df['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})
df.rename(columns={'classification':'class'},inplace=True)

In [9]:
df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,0,48.0,80.0,1.020,1.0,0.0,,0.0,0.0,0.0,...,44,7800,5.2,1.0,1,0,1,0.0,0.0,1
1,1,7.0,50.0,1.020,4.0,0.0,,0.0,0.0,0.0,...,38,6000,,0.0,0,0,1,0.0,0.0,1
2,2,62.0,80.0,1.010,2.0,3.0,0.0,0.0,0.0,0.0,...,31,7500,,0.0,1,0,0,0.0,1.0,1
3,3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,...,32,6700,3.9,1.0,0,0,0,1.0,1.0,1
4,4,51.0,80.0,1.010,2.0,0.0,0.0,0.0,0.0,0.0,...,35,7300,4.6,0.0,0,0,1,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,...,47,6700,4.9,0.0,0,0,1,0.0,0.0,0
396,396,42.0,70.0,1.025,0.0,0.0,0.0,0.0,0.0,0.0,...,54,7800,6.2,0.0,0,0,1,0.0,0.0,0
397,397,12.0,80.0,1.020,0.0,0.0,0.0,0.0,0.0,0.0,...,49,6600,5.4,0.0,0,0,1,0.0,0.0,0
398,398,17.0,60.0,1.025,0.0,0.0,0.0,0.0,0.0,0.0,...,51,7200,5.9,0.0,0,0,1,0.0,0.0,0


In [10]:
df['pe'] = df['pe'].replace(to_replace='good',value=0) # Not having pedal edema is good
df['appet'] = df['appet'].replace(to_replace='no',value=0)
df['cad'] = df['cad'].replace(to_replace='\tno',value=0)
df['dm'] = df['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
df.drop('id',axis=1,inplace=True)

In [11]:
df=df.drop(["su","rbc","rc","wc","pot","sod"],axis=1)
df["pcv"]=df["pcv"].fillna(method="ffill")
df.drop(["pc"],axis=1,inplace=True)
df["hemo"]=df["hemo"].fillna(method="ffill")
df.drop(["sg"],axis=1,inplace=True)
df=df.fillna(method="ffill")


In [12]:
df.drop(["ba"],axis=1,inplace=True)
df.drop(["pe"],axis=1,inplace=True)
df.drop(["cad"],axis=1,inplace=True)
df.drop(["ane"],axis=1,inplace=True)

In [13]:
df=df.replace("\t?",31)
print(df.columns)

Index(['age', 'bp', 'al', 'pcc', 'bgr', 'bu', 'sc', 'hemo', 'pcv', 'htn', 'dm',
       'appet', 'class'],
      dtype='object')


In [14]:
print(df.shape[1])

13


In [15]:
df

Unnamed: 0,age,bp,al,pcc,bgr,bu,sc,hemo,pcv,htn,dm,appet,class
0,48.0,80.0,1.0,0.0,121.0,36.0,1.2,15.4,44,1.0,1,1.0,1.0
1,7.0,50.0,4.0,0.0,121.0,18.0,0.8,11.3,38,0.0,0,1.0,1.0
2,62.0,80.0,2.0,0.0,423.0,53.0,1.8,9.6,31,0.0,1,0.0,1.0
3,48.0,70.0,4.0,1.0,117.0,56.0,3.8,11.2,32,1.0,0,0.0,1.0
4,51.0,80.0,2.0,0.0,106.0,26.0,1.4,11.6,35,0.0,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,0.0,0.0,140.0,49.0,0.5,15.7,47,0.0,0,1.0,0.0
396,42.0,70.0,0.0,0.0,75.0,31.0,1.2,16.5,54,0.0,0,1.0,0.0
397,12.0,80.0,0.0,0.0,100.0,26.0,0.6,15.8,49,0.0,0,1.0,0.0
398,17.0,60.0,0.0,0.0,114.0,50.0,1.0,14.2,51,0.0,0,1.0,0.0


In [16]:
target=df["class"]
source=df.drop(["class"],axis=1)

In [17]:
X_train,X_test,y_train,y_test=train_test_split(source,target,test_size=0.05)


In [18]:
sm=SMOTE()
X_train, y_train =sm.fit_sample(X_train,y_train)

In [19]:
lr=LogisticRegression()
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [20]:
a11=cross_validate(lr,source,target, cv=10)
print(a11)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

{'fit_time': array([0.09177804, 0.07529783, 0.06445861, 0.06138039, 0.08129072,
       0.08124661, 0.08322239, 0.08722138, 0.07132101, 0.06560183]), 'score_time': array([0.01040483, 0.00759554, 0.01145267, 0.00522757, 0.00211096,
       0.00104165, 0.        , 0.00803924, 0.01570773, 0.        ]), 'test_score': array([1.   , 0.975, 1.   , 1.   , 0.975, 0.975, 0.975, 0.975, 0.975,
       0.975])}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [21]:
import pickle
pickle_out=open("kidney.pkl", "wb")
pickle.dump(lr, pickle_out)
pickle_out.close()