In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Bank Additional/bank_additional_new.csv')
display(df.head())
df.shape

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,y
0,28,blue-collar,single,secondary,no,0.0,yes,no,cellular,4,jul,3,-1,0,1
1,55,blue-collar,married,secondary,no,49.0,yes,no,cellular,7,jul,4,-1,0,1
2,51,technician,married,secondary,no,216.0,no,no,cellular,7,jul,2,-1,0,1
3,39,technician,single,secondary,no,506.0,yes,no,cellular,7,jul,2,-1,0,1
4,42,blue-collar,divorced,primary,no,213.0,yes,no,cellular,7,jul,3,-1,0,1


(8393, 15)

In [10]:
def EDA_num(DF, col, target, alpha_l=1.5, alpha_u=1.5, bins=50, basic_eda=True):
    """
    EDA for Numerical Data using Plotly

    Args:
        DF (pandas.DataFrame): DataFrame containing the data.
        col (str): Name of the numerical column to analyze.
        target (str): Name of the target column.
        basic_eda (bool, optional): Flag to control basic EDA output. Defaults to True.

    Returns:
        tuple: lower_cap, upper_cap
    """
    import plotly.express as px
    import plotly.graph_objects as go
    import pandas as pd
    import numpy as np
    from scipy import stats

    # -Basic EDA
    if basic_eda:
        print('Basic EDA:')
        display(DF[col].describe())
        print('-' * 100)
        display(DF[col].value_counts())
        print('-' * 100)
        print('Skewness:', round(DF[col].skew(), 2))
        print('Kurtosis:', round(DF[col].kurt(), 2))
        print('-' * 100)
        print('Number of Values:', DF[col].nunique())
        print('-' * 100)

    # -Normality Test
    stat, p = stats.shapiro(DF[col])
    normality_result = 'Sample looks Gaussian (fail to reject H0)' if p > 0.05 else 'Sample does not look Gaussian (reject H0)'
    print(f'Is the data distribution normal?: {normality_result}')
    print('-' * 100)

    # -Calculate Lower and Upper Caps
    Q1, Q2, Q3 = DF[col].quantile([0.25, 0.5, 0.75])
    IQR = Q3 - Q1
    lower_cap, upper_cap = Q1 - (alpha_l * IQR), Q3 + (alpha_u * IQR)

    # -Histogram
    hist_fig = px.histogram(DF, x=col, nbins=bins, title=f'Histogram of {col}', color_discrete_sequence=['skyblue'])
    hist_fig.update_layout(xaxis_title=col, yaxis_title='Count', height=400, width=600)

    # -Box and Violin Plot
    box_violin_fig = go.Figure()
    box_violin_fig.add_trace(go.Box(y=DF[col], name='Box Plot', marker=dict(color='blue'), boxmean=True))
    box_violin_fig.add_trace(go.Violin(y=DF[col], name='Violin Plot', box_visible=True, meanline_visible=True, line_color='cornflowerblue'))
    box_violin_fig.update_layout(title=f'Box and Violin Plot of {col}', yaxis_title=col, height=400, width=600)
    # -Scatter Plot and Outliers
    outliers = DF[(DF[col] < lower_cap) | (DF[col] > upper_cap)]
    normal_data = DF[(DF[col] >= lower_cap) & (DF[col] <= upper_cap)]

    # -Combine outliers and normal data into one trace, coloring outliers in red and others in blue
    scatter_fig = go.Figure()

    # -Normal data (blue)
    scatter_fig.add_trace(go.Scatter(x=normal_data[col], y=normal_data[target], mode='markers',
                                     marker=dict(color='blue', size=8), name='Normal Data'))

    # -Outliers (red)
    scatter_fig.add_trace(go.Scatter(x=outliers[col], y=outliers[target], mode='markers',
                                     marker=dict(color='red', size=8), name='Outliers'))

    scatter_fig.add_vline(x=lower_cap, line_dash='dash', line_color='orange', annotation_text='Lower Cap', annotation_position='top left')
    scatter_fig.add_vline(x=upper_cap, line_dash='dash', line_color='green', annotation_text='Upper Cap', annotation_position='top right')
    scatter_fig.update_layout(title=f'Scatter Plot of {col} vs {target}', height=400, width=600)

    # -Bar Plot with Improved Color
    bar_fig = px.bar(DF, x=target, y=col, title=f'Bar Plot of {col} vs {target}', color=target, color_discrete_sequence=px.colors.qualitative.Bold)
    bar_fig.update_layout(height=400, width=600)

    # -Count Plot
    count_fig = px.histogram(DF, x=col, color=target, barmode='group', title=f'Count Plot of {col} by {target}')
    count_fig.update_layout(xaxis_title=col, yaxis_title='Count', height=400, width=600)

    # -Display All Plots
    print("Histogram:")
    hist_fig.show()

    print("Box and Violin Plot:")
    box_violin_fig.show()

    print("Scatter Plot with Outliers:")
    scatter_fig.show()

    # -Now printing outlier stats after scatter plot display
    outlier_count = outliers.shape[0]
    count_outliers_class_0 = DF[DF.index.isin(outliers.index) & (DF[target] == 0)].shape[0]
    count_outliers_class_1 = DF[DF.index.isin(outliers.index) & (DF[target] == 1)].shape[0]

    # -Print the number of outliers
    print(f"Number of outliers in Class 0: {count_outliers_class_0}")
    print(f"Number of outliers in Class 1: {count_outliers_class_1}")
    print(f"Number of outliers: {outlier_count}")
    print('-' * 100)

    print("Bar Plot:")
    bar_fig.show()

    print("Count Plot:")
    count_fig.show()

    return lower_cap, upper_cap


In [17]:
import plotly.io as pio
import plotly.graph_objects as go
def EDA_num_save_plots(DF, col, target, alpha_l=1.5, alpha_u=1.5, bins=50, basic_eda=True, save_dir='./plots'):
    """
    EDA for Numerical Data using Plotly, but with saving the plots as HTML files.
    """
    # Create save directory if it doesn't exist
    import os
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Call original function to create the plots
    lower_cap, upper_cap = EDA_num(DF, col, target, alpha_l, alpha_u, bins, basic_eda)

    # Save each plot as HTML
    hist_fig = px.histogram(DF, x=col, nbins=bins, title=f'Histogram of {col}', color_discrete_sequence=['skyblue'])
    hist_fig.update_layout(xaxis_title=col, yaxis_title='Count', height=400, width=600)
    pio.write_html(hist_fig, file=f'{save_dir}/histogram_{col}.html')

    box_violin_fig = go.Figure()
    box_violin_fig.add_trace(go.Box(y=DF[col], name='Box Plot', marker=dict(color='blue'), boxmean=True))
    box_violin_fig.add_trace(go.Violin(y=DF[col], name='Violin Plot', box_visible=True, meanline_visible=True, line_color='cornflowerblue'))
    box_violin_fig.update_layout(title=f'Box and Violin Plot of {col}', yaxis_title=col, height=400, width=600)
    pio.write_html(box_violin_fig, file=f'{save_dir}/box_violin_{col}.html')

    scatter_fig = go.Figure()
    scatter_fig.add_trace(go.Scatter(x=DF[col], y=DF[target], mode='markers', name='Data'))
    scatter_fig.update_layout(title=f'Scatter Plot of {col} vs {target}', height=400, width=600)
    pio.write_html(scatter_fig, file=f'{save_dir}/scatter_{col}_{target}.html')

    bar_fig = px.bar(DF, x=target, y=col, title=f'Bar Plot of {col} vs {target}', color=target, color_discrete_sequence=px.colors.qualitative.Bold)
    bar_fig.update_layout(height=400, width=600)
    pio.write_html(bar_fig, file=f'{save_dir}/bar_{col}_{target}.html')

    count_fig = px.histogram(DF, x=col, color=target, barmode='group', title=f'Count Plot of {col} by {target}')
    count_fig.update_layout(xaxis_title=col, yaxis_title='Count', height=400, width=600)
    pio.write_html(count_fig, file=f'{save_dir}/count_{col}_{target}.html')

    # Return lower and upper caps
    return lower_cap, upper_cap


In [18]:
lower_cap, upper_cap = EDA_num_save_plots(df, 'campaign', 'y', alpha_l=1, alpha_u=7)

Basic EDA:


Unnamed: 0,campaign
count,8393.0
mean,2.432742
std,2.411896
min,1.0
25%,1.0
50%,2.0
75%,3.0
max,27.0


----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,count
campaign,Unnamed: 1_level_1
1,3651
2,2285
3,963
4,590
5,290
6,210
7,101
8,97
9,43
10,31


----------------------------------------------------------------------------------------------------
Skewness: 4.16
Kurtosis: 26.55
----------------------------------------------------------------------------------------------------
Number of Values: 27
----------------------------------------------------------------------------------------------------
Is the data distribution normal?: Sample does not look Gaussian (reject H0)
----------------------------------------------------------------------------------------------------
Histogram:


Box and Violin Plot:


Scatter Plot with Outliers:


Number of outliers in Class 0: 31
Number of outliers in Class 1: 4
Number of outliers: 35
----------------------------------------------------------------------------------------------------
Bar Plot:


Count Plot:


<iframe src="/content/plots/bar_campaign_y.html" width="600" height="400" frameborder="0"></iframe>

In [11]:
lower_cap, upper_cap = EDA_num(df, 'campaign', 'y', alpha_l=1, alpha_u=7)

Basic EDA:


Unnamed: 0,campaign
count,8393.0
mean,2.432742
std,2.411896
min,1.0
25%,1.0
50%,2.0
75%,3.0
max,27.0


----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,count
campaign,Unnamed: 1_level_1
1,3651
2,2285
3,963
4,590
5,290
6,210
7,101
8,97
9,43
10,31


----------------------------------------------------------------------------------------------------
Skewness: 4.16
Kurtosis: 26.55
----------------------------------------------------------------------------------------------------
Number of Values: 27
----------------------------------------------------------------------------------------------------
Is the data distribution normal?: Sample does not look Gaussian (reject H0)
----------------------------------------------------------------------------------------------------
Histogram:


Box and Violin Plot:


Scatter Plot with Outliers:


Number of outliers in Class 0: 31
Number of outliers in Class 1: 4
Number of outliers: 35
----------------------------------------------------------------------------------------------------
Bar Plot:


Count Plot:


In [None]:
import plotly.offline as offline

fig = go.Figure(data=[go.Bar(y=[2, 3, 1])])
offline.plot(fig, filename='my_plot.html', auto_open=False)  # Save as HTML


In [13]:
# دریافت کد HTML نمودار
import plotly.io as pio
html_code = pio.to_html(hist_fig, full_html=False)

# ذخیره به صورت فایل
with open("histogram_embed.html", "w") as f:
    f.write(html_code)


NameError: name 'hist_fig' is not defined