In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
# Read the insurance dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="insurance.csv")
# Preview the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Continuous data
We use a histogram to visualize it.

In [39]:
# Histogram of the `charges` column.
fig = px.histogram(data, x="charges")
# Set the figure title and format the x-axis ticks with ".0f" (no decimals).
fig.update_layout(title="Insurance Charges", xaxis_tickformat=".0f")
fig.show()

In [41]:
# Histogram of charges with the bars colored by the `smoker` column.
fig = px.histogram(
    data,
    x="charges",
    color="smoker",
)
fig.show()


In [42]:
# Histogram of charges with the bars colored by the `sex` column.
fig = px.histogram(
    data,
    x="charges",
    color="sex",
)
fig.show()


In [51]:
# One histogram panel of charges per value of `smoker` (faceted by column).
fig = px.histogram(
    data,
    x="charges",
    facet_col="smoker",
)
fig.show()

In [52]:
# Histogram of charges colored by `smoker`, with one panel per value of `sex`.
fig = px.histogram(
    data,
    x="charges",
    color="smoker",
    facet_col="sex",
)
fig.show()

# Detect outliers for Continuous Variables:
We use a box plot to visualize the quartiles and the IQR-based outlier bounds.

Study these links for a deeper understanding:

In [55]:
# Horizontal box plot of the `charges` column.
fig = px.box(data, x="charges")
# Format the x-axis ticks with ".0f" (no decimals).
fig.update_layout(xaxis_tickformat=".0f")
fig.show()

We use the `y` argument to draw one box plot per value of the `y` variable.

This is useful when the `y` variable is categorical.

In [59]:
# One horizontal box of charges for each value of `smoker`.
fig = px.box(data, x="charges", y="smoker")
# Format the x-axis ticks with ".0f" (no decimals).
fig.update_layout(xaxis_tickformat=".0f")
fig.show()

In [61]:
# One horizontal box of charges for each value of `sex`.
fig = px.box(
    data,
    x="charges",
    y="sex",
)
# Format the x-axis ticks with ".0f" (no decimals).
fig.update_layout(xaxis_tickformat=".0f")
fig.show()

In [64]:
# Boxes of charges grouped by `smoker` on the y axis and colored by `sex`.
fig = px.box(
    data,
    x="charges",
    y="smoker",
    color="sex",
)
# Format the x-axis ticks with ".0f" (no decimals).
fig.update_layout(xaxis_tickformat=".0f")
fig.show()

In [65]:
# Vertical boxes of charges for each value of `children`, colored by `sex`.
fig = px.box(
    data,
    x="children",
    y="charges",
    color="sex",
)
fig.show()

# Violin chart: to embed [Histplot, Distribution] in one figure

In [None]:
# Violin chart: continuous distribution of `charges` on the y axis.
fig = px.violin(data, y="charges")
fig.show()

In [69]:
# One violin of charges per value of `smoker`, with an inner box plot drawn.
fig = px.violin(
    data,
    x="smoker",
    y="charges",
    box=True,
)
fig.show()

In [73]:
# Violins of charges per `smoker` value, split by `sex` through color.
# Every DataFrame column is passed as hover data; the inner box plot is drawn.
fig = px.violin(
    data,
    x="smoker",
    y="charges",
    color="sex",
    box=True,
    hover_data=data.columns,
)
fig.show()

In [87]:
# Names of the columns whose dtype is `object`.
data.select_dtypes(include=["object"]).columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [97]:
# Two side-by-side violin plots of charges: by region (left) and by smoker (right).
fig = make_subplots(rows=1, cols=2)

# Each trace is given a name so that the legend requested with showlegend=True
# shows a meaningful label instead of plotly's default "trace 0" / "trace 1".
trace0 = go.Violin(
    x=data['region'],
    y=data['charges'],
    name="Region",
    box_visible=True,
    showlegend=True,
)
trace1 = go.Violin(
    x=data['smoker'],
    y=data['charges'],
    name="Smoker",
    box_visible=True,
    showlegend=True,
)

fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=1, col=2)

# xaxis / xaxis2 are the x axes of the left and right subplot respectively.
fig.update_layout(
    title_text="Violin Plots for Charges with Region, Smoker",
    xaxis_title="Region",
    xaxis2_title="Smoker",
    yaxis_title="Charges",
)
fig.show()

# Scatter Plot: to visualize the relationship between two continuous variables

In [99]:
# Scatter plot of charges against age.
fig = px.scatter(
    data,
    x="age",
    y="charges",
)
fig.show()

In [101]:
# Scatter plot of charges against age, with points colored by `smoker`.
fig = px.scatter(
    data,
    x="age",
    y="charges",
    color="smoker",
)
fig.show()

In [102]:
# Scatter plot of charges against bmi.
fig = px.scatter(
    data,
    x="bmi",
    y="charges",
)
fig.show()

In [103]:
# Scatter plot of charges against bmi, with points colored by `smoker`.
fig = px.scatter(
    data,
    x="bmi",
    y="charges",
    color="smoker",
)
fig.show()

In [108]:
# Charges against bmi: color encodes `smoker`, marker symbol encodes `sex`.
fig = px.scatter(data, x="bmi", y="charges", color="smoker", symbol="sex")
fig.show()

In [109]:
# Charges against bmi colored by `smoker`, with one panel per value of `sex`.
fig = px.scatter(data, x="bmi", y="charges", color="smoker", facet_col="sex")
fig.show()

# make_subplots with graph_objects to embed more than one chart in one figure

In [None]:
# Marker-only scatter traces: charges vs. bmi and charges vs. age.
trace1 = go.Scatter(mode="markers", x=data["bmi"], y=data["charges"])
trace2 = go.Scatter(mode="markers", x=data["age"], y=data["charges"])

# Place the two traces side by side in a 1x2 grid, left to right.
fig = make_subplots(rows=1, cols=2)
for column_number, trace in enumerate((trace1, trace2), start=1):
    fig.add_trace(trace, row=1, col=column_number)

# xaxis / xaxis2 are the x axes of the left and right subplot respectively.
fig.update_layout(
    {
        "title_text": "Scatter Plots for Charges with bmi, Age",
        "xaxis_title": "bmi",
        "xaxis2_title": "Age",
        "yaxis_title": "Charges",
    }
)
fig.show()

# Visualizing Categorical Columns:
The most convenient plot is a count plot and its attributes:

In [119]:
# Find the categorical columns (those stored with dtype `object`).
data.select_dtypes(include=["object"]).columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [120]:
# Number of rows for each distinct value of `sex`.
data.loc[:, "sex"].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [121]:
# Count plot: number of rows per value of `sex`.
px.histogram(data, x="sex")

In [126]:
# Count plot of `sex`, with side-by-side bars for each value of `smoker`.
px.histogram(
    data_frame=data,
    x="sex",
    color="smoker",
    barmode="group",
)

In [128]:
# Count plot of `region`, with side-by-side bars for each value of `smoker`.
px.histogram(
    data_frame=data,
    x="region",
    color="smoker",
    barmode="group",
)

In [131]:
# Count plot of `smoker`, with side-by-side bars for each value of `region`.
px.histogram(
    data_frame=data,
    x="smoker",
    color="region",
    barmode="group",
)

In [None]:
# Columns with dtype `object` are the categorical features plotted here
# (the previous variable name called them "continuous", which they are not).
categorical_features = data.select_dtypes(include='object').columns
n_features = len(categorical_features)

# One subplot per categorical feature; each panel is titled with its column
# name so the reader can tell which feature it shows.
fig = make_subplots(
    rows=1,
    cols=n_features,
    subplot_titles=list(categorical_features),
)

for position, column in enumerate(categorical_features, start=1):
    fig.add_trace(
        # Naming the trace replaces the default "trace N" legend entry.
        go.Histogram(x=data[column], name=column),
        row=1,
        col=position,
    )

fig.update_layout(title_text="Histograms of Categorical Features",
                  yaxis_title="Count")
fig.show()

# Visualize the effect of Log Scaling Transformation

In [None]:
# Distribution plot: charges before vs. after a log10 transform.
fig = make_subplots(rows=1, cols=2)

# (trace name, values) for the left and right panel, in that order.
panels = (
    ("Before", data['charges']),
    ("After", np.log10(data['charges'])),
)
for col_number, (label, values) in enumerate(panels, start=1):
    fig.add_trace(go.Histogram(x=values, name=label), row=1, col=col_number)

# xaxis / xaxis2 are the x axes of the left and right subplot respectively.
fig.update_layout(
    title_text="Before and After Log Transformation",
    xaxis_title="Charges",
    xaxis2_title="log10 (Charges)",
    yaxis_title="Count",
)

fig.show()

In [161]:
# Box plot: charges before vs. after a log10 transform.
fig = make_subplots(rows=1, cols=2)
fig.add_trace(
    go.Box(x=data['charges'], name='Before'),
    row=1,
    col=1,
)
fig.add_trace(
    go.Box(x=np.log10(data['charges']), name="After"),
    row=1,
    col=2,
)

# No y-axis title is set: a box trace has no count axis (the former "Count"
# label was wrong), and the trace name already labels each box.
fig.update_layout(title_text="Before and After Transformation ",
                  xaxis_title="Charges",
                  xaxis2_title=" log10 (Charges)")

# log10 is a non-linear transform: large values are compressed more than small
# ones, so the shape of the distribution changes, not only its range.
fig.show()

# Visualize The effect of StandardScaler Scaling:

In [155]:
from sklearn.preprocessing import StandardScaler

# Charges reshaped to a single-column 2-D array via reshape(-1, 1).
charges_2d = data['charges'].values.reshape(-1, 1)

scaler = StandardScaler()
# fit_transform fits the scaler on the column and returns the scaled values.
target_scaler = scaler.fit_transform(X=charges_2d)

In [None]:
# Distribution plot: charges before vs. after StandardScaler.
fig = make_subplots(rows=1, cols=2)

# (trace name, values) for the left and right panel, in that order.
# `target_scaler` holds the scaled values; ravel() flattens them to 1-D.
panels = (
    ("Before", data['charges']),
    ("After", target_scaler.ravel()),
)
for col_number, (label, values) in enumerate(panels, start=1):
    fig.add_trace(go.Histogram(x=values, name=label), row=1, col=col_number)

fig.update_layout(
    title_text="Before and After StandardScaler",
    xaxis_title="Charges",
    xaxis2_title=" Scaled Charges",
    yaxis_title="Count",
)

# The distribution does not change, but the range is compressed.
fig.show()

In [157]:
# Box plot: charges before vs. after StandardScaler.
fig = make_subplots(rows=1, cols=2)
fig.add_trace(
    go.Box(x=data['charges'], name='Before'),
    row=1,
    col=1,
)
fig.add_trace(
    # `target_scaler` holds the scaled values; reshape(-1) flattens them to 1-D.
    go.Box(x=target_scaler.reshape(-1), name="After"),
    row=1,
    col=2,
)

# No y-axis title is set: a box trace has no count axis (the former "Count"
# label was wrong), and the trace name already labels each box.
fig.update_layout(title_text="Before and After StandardScaler",
                  xaxis_title="Charges",
                  xaxis2_title=" Scaled Charges")

# The distribution does not change, but the range is compressed.
fig.show()

# Visualize the effect of Min-max transformation

In [158]:
# Min-max transformation:
from sklearn.preprocessing import MinMaxScaler

# Charges reshaped to a single-column 2-D array via reshape(-1, 1).
charges_2d = data['charges'].values.reshape(-1, 1)

scaler = MinMaxScaler()
# fit_transform fits the scaler on the column and returns the scaled values.
target_scaler = scaler.fit_transform(charges_2d)

In [159]:
# Distribution plot: charges before vs. after MinMaxScaler.
fig = make_subplots(rows=1, cols=2)

# (trace name, values) for the left and right panel, in that order.
# `target_scaler` holds the scaled values; ravel() flattens them to 1-D.
panels = (
    ("Before", data['charges']),
    ("After", target_scaler.ravel()),
)
for col_number, (label, values) in enumerate(panels, start=1):
    fig.add_trace(go.Histogram(x=values, name=label), row=1, col=col_number)

fig.update_layout(
    title_text="Before and After MinMax Scaler",
    xaxis_title="Charges",
    xaxis2_title=" Scaled Charges",
    yaxis_title="Count",
)

# The distribution does not change, but the range is compressed.
fig.show()

In [160]:
# Box plot: charges before vs. after MinMaxScaler.
fig = make_subplots(rows=1, cols=2)
fig.add_trace(
    go.Box(x=data['charges'], name='Before'),
    row=1,
    col=1,
)
fig.add_trace(
    # `target_scaler` holds the scaled values; reshape(-1) flattens them to 1-D.
    go.Box(x=target_scaler.reshape(-1), name="After"),
    row=1,
    col=2,
)

# No y-axis title is set: a box trace has no count axis (the former "Count"
# label was wrong), and the trace name already labels each box.
fig.update_layout(title_text="Before and After MinMax Scaler",
                  xaxis_title="Charges",
                  xaxis2_title=" Scaled Charges")

# The distribution does not change, but the range is compressed.
fig.show()

# Visualize the effect of Robust Scaler transformation

In [3]:
from sklearn.preprocessing import RobustScaler

# Charges reshaped to a single-column 2-D array via reshape(-1, 1).
charges_2d = data['charges'].values.reshape(-1, 1)

scaler = RobustScaler()
# fit_transform fits the scaler on the column and returns the scaled values.
target_scaler = scaler.fit_transform(X=charges_2d)

In [4]:
# Distribution plot: charges before vs. after RobustScaler.
fig = make_subplots(rows=1, cols=2)

# (trace name, values) for the left and right panel, in that order.
# `target_scaler` holds the scaled values; ravel() flattens them to 1-D.
panels = (
    ("Before", data['charges']),
    ("After", target_scaler.ravel()),
)
for col_number, (label, values) in enumerate(panels, start=1):
    fig.add_trace(go.Histogram(x=values, name=label), row=1, col=col_number)

fig.update_layout(
    title_text="Before and After Robust Scaler",
    xaxis_title="Charges",
    xaxis2_title=" Scaled Charges",
    yaxis_title="Count",
)

# The distribution does not change, but the range is compressed.
fig.show()

In [5]:
# Box plot: charges before vs. after RobustScaler.
fig = make_subplots(rows=1, cols=2)
fig.add_trace(
    go.Box(x=data['charges'], name='Before'),
    row=1,
    col=1,
)
fig.add_trace(
    # `target_scaler` holds the scaled values; reshape(-1) flattens them to 1-D.
    go.Box(x=target_scaler.reshape(-1), name="After"),
    row=1,
    col=2,
)

# No y-axis title is set: a box trace has no count axis (the former "Count"
# label was wrong), and the trace name already labels each box.
fig.update_layout(title_text="Before and After Robust Scaler",
                  xaxis_title="Charges",
                  xaxis2_title=" Scaled Charges")

# The distribution does not change, but the range is compressed.
fig.show()

In [36]:
# Histogram + KDE:
import plotly.figure_factory as ff

fig = ff.create_distplot(
    hist_data=[data['charges'].values],
    group_labels=['charges'],
    # Charges span tens of thousands (see the preview rows), so the previous
    # 50-unit bins produced hundreds of sliver-thin bars; 1000-unit bins keep
    # the histogram readable.
    bin_size=1000,
    show_rug=False,
)
fig.update_layout(
    title="Insurance Charges",
    xaxis_title="Charges",
    # create_distplot normalizes the histogram as a probability density by
    # default (histnorm is not overridden here), so the y axis is a density,
    # not a count.
    yaxis_title="Density",
    xaxis=dict(tickformat=".0f"),
)
fig.show()