In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



## Load Data


In [5]:
# Read the insurance CSV; the path is relative to the notebook's working directory.
insurance_data = pd.read_csv("../dataset/insurance.csv")
# Bare last expression: show the loaded frame as the cell output.
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [6]:
# Preview the first rows of the raw frame (head() defaults to 5 rows).
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
# Preview the last rows of the raw frame (tail() defaults to 5 rows).
insurance_data.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


## Number of Records and Columns

In [8]:
# (rows, columns) of the raw frame, before any cleaning.
insurance_data.shape

(1338, 7)

In [11]:
# Column labels of the raw frame.
insurance_data.columns


Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [12]:
# Per-column dtypes; the `object` columns are the string-valued fields.
insurance_data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

### Data Information

In [13]:
# One-stop summary: index range, non-null count and dtype per column, memory usage.
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Checking for Duplicate Records

In [14]:
# True if at least one row repeats an earlier row across all columns.
insurance_data.duplicated().any()

np.True_

In [16]:
# Show only the rows that duplicated() flags as repeats of an earlier row.
insurance_data.loc[insurance_data.duplicated()]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


In [17]:
# Drop repeated rows, keeping the first occurrence of each. The result is bound to a
# new name, so `insurance_data` itself is left unchanged. No index reset is requested
# here, so the surviving rows keep their original index labels.
insurance_data1 = insurance_data.drop_duplicates()

In [18]:
# (rows, columns) after de-duplication, for comparison with the raw frame's shape.
insurance_data1.shape

(1337, 7)

### Checking for Missing Values

In [19]:
# Per-column flag: True where the column has at least one missing value.
insurance_data1.isnull().any()

age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool

In [20]:
# Per-column count of missing values.
insurance_data1.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

### List of Categorical and Numeric Columns

In [21]:
# Partition the column labels by dtype: numeric columns vs. everything else.
Numerical_columns = list(insurance_data1.select_dtypes(include="number").columns)
Categorical_columns = list(insurance_data1.select_dtypes(exclude="number").columns)

# sep="" keeps the list directly after the colon, exactly as the labels are written.
print("Numerical columns in the data:", Numerical_columns, sep="")
print("Categorical columns in the data:", Categorical_columns, sep="")

Numerical columns in the data:['age', 'bmi', 'children', 'charges']
Categorical columns in the data:['sex', 'smoker', 'region']


#### Statistical Measures of Numeric Columns

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
insurance_data1.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


#### Statistical Measures of Categorical Columns

In [23]:
# Summary of the non-numeric columns: count, number of distinct values,
# most frequent value (top) and how often it occurs (freq).
insurance_data1.describe(exclude="number")

Unnamed: 0,sex,smoker,region
count,1337,1337,1337
unique,2,2,4
top,male,no,southeast
freq,675,1063,364


In [24]:
# Distinct values present in the `sex` column.
insurance_data1.sex.unique()

array(['female', 'male'], dtype=object)

In [26]:
# Number of distinct values in the `sex` column.
insurance_data1.sex.nunique()

2

In [25]:
# Number of rows per `sex` value, most frequent first.
insurance_data1.sex.value_counts()

sex
male      675
female    662
Name: count, dtype: int64

In [28]:
# Distinct values, their number, and per-value row counts for `smoker`.
# A single print call: sep/end put one blank line after each of the three summaries.
print(
    insurance_data1["smoker"].unique(),
    insurance_data1["smoker"].nunique(),
    insurance_data1["smoker"].value_counts(),
    sep="\n\n",
    end="\n\n",
)


['yes' 'no']

2

smoker
no     1063
yes     274
Name: count, dtype: int64



In [29]:
# Distinct values, their number, and per-value row counts for `region`.
# A single print call: sep/end put one blank line after each of the three summaries.
print(
    insurance_data1["region"].unique(),
    insurance_data1["region"].nunique(),
    insurance_data1["region"].value_counts(),
    sep="\n\n",
    end="\n\n",
)


['southwest' 'southeast' 'northwest' 'northeast']

4

region
southeast    364
southwest    325
northwest    324
northeast    324
Name: count, dtype: int64



### Data Visualization

##### Histograms of Numeric Columns

In [30]:
# Recall which columns were classified as numeric; these are the ones to plot.
Numerical_columns

['age', 'bmi', 'children', 'charges']

In [None]:
# Histogram of the `age` column. The Axes is created explicitly so the figure gets
# a title and axis labels and can be read on its own, and plt.show() renders it
# instead of leaving the Axes repr as the cell output.
fig, ax = plt.subplots()
insurance_data1["age"].plot(kind="hist", ax=ax)
ax.set(title="Distribution of age", xlabel="age", ylabel="Number of records")
plt.show()