In [402]:
# all imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings("ignore")

# import data and shuffle it
data = pd.read_csv('../data/Pima_Indian_diabetes.csv');
data = data.sample(frac=1, random_state=15).reset_index(drop=True)
data_shape = data.shape
data_columns = data.columns

In [403]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,742.0,752.0,768.0,746.0,768.0,757.0,768.0,749.0,768.0
mean,3.866601,119.966097,68.886078,20.309879,79.799479,31.711151,0.471876,33.761336,0.348958
std,3.479971,32.367659,19.427448,15.974523,115.244002,8.544789,0.331329,12.297409,0.476951
min,-5.412815,0.0,-3.496455,-11.94552,0.0,-16.288921,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.1,0.24375,24.0,0.0
50%,3.0,116.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.0,80.0,32.0,127.25,36.5,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [404]:
data_neg_changed = data.assign(
                               Pregnancies = lambda x: x.Pregnancies.where(x.Pregnancies.ge(0)),
                               Glucose = lambda x: x.Glucose.where(x.Glucose.gt(0)),
                               BloodPressure = lambda x: x.BloodPressure.where(x.BloodPressure.gt(0)),
                               SkinThickness = lambda x: x.SkinThickness.where(x.SkinThickness.gt(0)),
                               Insulin = lambda x: x.Insulin.where(x.Insulin.gt(0)),
                               BMI = lambda x: x.BMI.where(x.BMI.gt(0))
                              )

In [405]:
print(data_neg_changed.shape)
print(data_neg_changed.isnull().sum())

(768, 9)
Pregnancies                  29
Glucose                      21
BloodPressure                35
SkinThickness               242
Insulin                     374
BMI                          25
DiabetesPedigreeFunction      0
Age                          19
Outcome                       0
dtype: int64


In [406]:
data_neg_changed.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,739.0,747.0,733.0,526.0,394.0,743.0,768.0,749.0,768.0
mean,3.902135,120.769083,72.185074,28.874152,155.548223,32.359164,0.471876,33.761336,0.348958
std,3.44181,30.944811,12.50221,10.598375,118.775855,7.152753,0.331329,12.297409,0.476951
min,0.0,42.974768,15.372031,7.0,14.0,5.317899,0.078,21.0,0.0
25%,1.0,99.0,64.0,21.0,76.25,27.45,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,140.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [407]:
data_neg_changed.drop(['SkinThickness', 'Insulin'], axis=1, inplace=True)
data_neg_changed.shape

(768, 7)

In [408]:
data_neg_changed.dropna(axis=0, how='any', inplace=True)
data_neg_changed.shape

(650, 7)

In [409]:
data_neg_changed.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
count,650.0,650.0,650.0,650.0,650.0,650.0,650.0
mean,3.910291,120.894412,72.120981,32.386063,0.477469,33.632595,0.341538
std,3.439186,30.837428,12.290524,7.054841,0.341095,11.821947,0.474591
min,0.0,42.974768,15.372031,6.699051,0.078,21.0,0.0
25%,1.0,99.0,64.0,27.5,0.245,24.0,0.0
50%,3.0,117.0,72.0,32.4,0.377,29.0,0.0
75%,6.0,140.0,80.0,36.6,0.6285,41.0,1.0
max,17.0,199.0,110.0,67.1,2.42,69.312541,1.0


In [410]:
data_neg_changed['Outcome'].value_counts()

0    428
1    222
Name: Outcome, dtype: int64

In [411]:
X = data_neg_changed.iloc[:, :-1]
y = data_neg_changed['Outcome']

In [412]:
feature_cols = X.columns
mm_scaler = MinMaxScaler()
X = mm_scaler.fit_transform(X)
X = pd.DataFrame(X, columns=feature_cols)

In [413]:
X.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age
count,650.0,650.0,650.0,650.0,650.0,650.0
mean,0.230017,0.499404,0.599706,0.425275,0.170568,0.261477
std,0.202305,0.197644,0.129883,0.1168,0.145643,0.244697
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.058824,0.359078,0.513886,0.344381,0.071307,0.062096
50%,0.176471,0.474444,0.598427,0.425506,0.127669,0.165588
75%,0.352941,0.621856,0.682969,0.495041,0.235056,0.413971
max,1.0,1.0,1.0,1.0,1.0,1.0


In [414]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=10)

In [415]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
param = {'C':[1,2,3,4,5,6,7,8,9,10]}
lrcv = LogisticRegression()
clf = GridSearchCV(lrcv, param, cv=5)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7384615384615385

In [416]:
from sklearn import svm, datasets
# from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
param = {'cv':[1,2,3,4,5], 'Cs':[1,2,3,4,5,6,7,8,9,10]}
lrcv = LogisticRegressionCV()
clf = GridSearchCV(lrcv, param, cv=5)
clf.fit(iris.data, iris.target)

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=1.

In [None]:
# # replacing with median of that column
# for col in data.columns:
#     if(col != 'Outcome'):
#         data[col].fillna(data[col].median(), inplace=True)

In [None]:
# x = data.iloc[:, :-1]
# Y = data['Outcome']

In [None]:
# # ensuring that datatype of all columns is float64
# x.dtypes

In [None]:
# # scale the feature columns
# cols = x.columns
# # using minmax scaler as units of feature columns are different and this brings them between 0 and 1
# # mm_scaler = MinMaxScaler() 
# std_scaler = StandardScaler()
# x = std_scaler.fit_transform(x)
# x = pd.DataFrame(x, columns=cols)
# x.describe()

In [None]:
# # split training and testing data
# X_train, X_test, y_train, y_test = train_test_split(x, Y, test_size=0.3, random_state=15, shuffle=True)

In [None]:
# y_train.value_counts()

In [None]:
# # logistic regression
# clf = LogisticRegressionCV(cv=5, random_state=15).fit(X_train, y_train)
# clf.score(X_test, y_test)

# # SVM
# # clf = SVC()
# # clf.fit(X_train, y_train)
# # clf.score(X_test, y_test)

# # SGDClassifier
# # clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
# # clf.fit(X_train, y_train)
# # clf.score(X_test, y_test)

# # KNN classifier
# # neigh = KNeighborsClassifier(n_neighbors=3)
# # neigh.fit(X_train, y_train)
# # neigh.score(X_test, y_test)