This notebook uses the Pima Indians Diabetes Dataset to practice beginner-level pandas.

In [1]:
import pandas as pd

In [3]:
# read the Pima Indians Diabetes CSV into a DataFrame
csv_path = 'datasets/pima_diabetes.csv'
df = pd.read_csv(csv_path)

In [4]:
# peek at the first 5 records (head() shows 5 rows by default)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# peek at the last 5 records (tail() shows 5 rows by default)
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [7]:
# df.shape is a (rows, columns) tuple; unpack it once and report both
n_rows, n_cols = df.shape
print(f'Number of rows: {n_rows}')
print(f'Number of columns: {n_cols}')

Number of rows: 768
Number of columns: 9


In [8]:
# data type of each column (displayed as a Series indexed by column name)
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [10]:
# grab the current column labels so they can be transformed and reassigned
columns = df.columns

In [13]:
# build a plain list of lowercased column names with a list comprehension
columns = [name.lower() for name in columns]

In [17]:
# renaming all columns at once by assigning the lowercased names back
df.columns = columns

In [19]:
# rename a single column via an {old_name: new_name} mapping;
# columns that are not listed in the mapping keep their names
new_names = {'pregnancies': 'pregs'}
df.rename(columns=new_names, inplace=True)

In [20]:
# checking the column names after renaming
# (a bare `df.loc` would only display the indexer object, not the column labels)
df.columns

Index(['pregs', 'glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi',
       'diabetespedigreefunction', 'age', 'outcome'],
      dtype='object')

In [28]:
# indexing using iloc (integer positions)
# iloc[row_position, column_position] -> here: first row, first column
df.iloc[0,0]

6

In [31]:
# indexing using loc (labels)
# loc[row_label, column_label]
df.loc[0,'pregs']

6

In [39]:
# Selecting rows
# getting all columns of the 0th through 9th row (the first 10 rows);
# an iloc slice excludes its stop position, so the stop must be 10, not 11
df.iloc[0:10, :]

Unnamed: 0,pregs,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [40]:
# selecting specific rows by passing a list of integer positions
df.iloc[[1,4,5]]

Unnamed: 0,pregs,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
1,1,85,66,29,0,26.6,0.351,31,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0


In [42]:
# counting the missing (NaN) values in each column
df.isna().sum()

pregs                       0
glucose                     0
bloodpressure               0
skinthickness               0
insulin                     0
bmi                         0
diabetespedigreefunction    0
age                         0
outcome                     0
dtype: int64

In [44]:
# descriptive statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,pregs,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [45]:
# remove the 'pregs' column from df in place
# (columns=... is the keyword form of passing the label with axis=1)
df.drop(columns='pregs', inplace=True)

In [46]:
# confirm the column is gone by previewing the first 5 rows
df.head()

Unnamed: 0,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,32,1
3,89,66,23,94,28.1,0.167,21,0
4,137,40,35,168,43.1,2.288,33,1


In [52]:
# boolean mask: True for rows whose glucose reading is above 89
high_glucose = df['glucose'] > 89
# keep only the masked rows and return their 'outcome' values
df.loc[high_glucose, 'outcome']

0      1
2      1
4      1
5      0
7      0
      ..
763    0
764    0
765    0
766    1
767    0
Name: outcome, Length: 664, dtype: int64

In [56]:
# derive a new column from two existing ones
# the +1 keeps the denominator non-zero: some blood-pressure readings are 0
denominator = df['bloodpressure'] + 1
df['glucose_blood_pressure_ratio'] = df['glucose'] / denominator

In [57]:
# confirming the new column was appended to the column labels
df.columns

Index(['glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi',
       'diabetespedigreefunction', 'age', 'outcome',
       'glucose_blood_pressure_ratio'],
      dtype='object')

In [58]:
# order the rows by blood pressure, lowest first (ascending is the default);
# this returns a sorted copy and leaves df itself unchanged
df.sort_values('bloodpressure')

Unnamed: 0,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome,glucose_blood_pressure_ratio
347,116,0,0,0,23.5,0.187,23,0,116.000000
494,80,0,0,0,0.0,0.174,22,0,80.000000
222,119,0,0,0,25.2,0.209,37,0,119.000000
81,74,0,0,0,0.0,0.102,22,0,74.000000
78,131,0,0,0,43.2,0.270,26,1,131.000000
...,...,...,...,...,...,...,...,...,...
549,189,110,31,0,28.5,0.680,37,0,1.702703
43,171,110,24,240,45.4,0.721,54,1,1.540541
177,129,110,46,130,67.1,0.319,26,1,1.162162
691,158,114,0,0,42.3,0.257,44,1,1.373913


In [69]:
# grouping by outcome and summarising glucose within each group:
#   mean_glucose    -> average glucose
#   min_max_glucose -> spread between the highest and lowest glucose value
by_outcome = df.groupby('outcome')
by_outcome.agg(
    mean_glucose=('glucose', 'mean'),
    min_max_glucose=('glucose', lambda values: values.max() - values.min()),
)

Unnamed: 0_level_0,mean_glucose,min_max_glucose
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
0,109.98,197
1,141.257463,199
