In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
import warnings
import json
import re
import matplotlib.patches as mpatches
from itertools import combinations
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
import plotly.graph_objects as go


### Univariate Plots

In [None]:
def describe_numerical_col(df, col_name):
    """Print summary statistics and plot the distribution of a numerical column.

    Left side of the figure: histogram with KDE above a horizontal box plot,
    with an IQR-based outlier summary in the legend. Right side: a normalised
    10-bin histogram ("PDF") and its cumulative sum ("CDF"). After the figure,
    a table is printed with the ``describe()`` statistics, the Shapiro-Wilk
    p-value, the missing-value count and the skewness.

    Missing values are reported in ``missing`` and excluded from every
    statistic and plot, so NaN entries neither break the histogram nor turn
    the Shapiro/skew results into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Name of the numerical column to describe.

    Raises
    ------
    ValueError
        If the column has no non-missing values.
    """
    series = df[col_name]
    values = series.dropna()
    if values.empty:
        raise ValueError(f'Column "{col_name}" has no non-missing values to describe')

    info = df[[col_name]].describe().to_dict()[col_name]

    # Shapiro-Wilk needs at least 3 observations; report NaN below that.
    shapiro_p = stats.shapiro(values)[1] if len(values) >= 3 else float('nan')
    skewness = stats.skew(values)

    info['shapiro'] = f'{shapiro_p: .5f}'
    # Decide on the raw p-value, not on the rounded string shown in the table.
    info['normal'] = bool(shapiro_p > 0.05)
    info['missing'] = series.isna().sum()
    info['skew'] = f'{skewness:.5f}'
    info['type'] = ('slight ' if info['normal'] else '') + \
                   ('right(positive)' if skewness > 0 else 'left(negative)') + '-skew'

    fig, ax = plt.subplots(2, 2, figsize=(16, 7), gridspec_kw={'height_ratios': (.85, .15)})
    sns.histplot(values, kde=True, ax=ax[0, 0], color='#55A868')
    sns.boxplot(x=values, orient='h', ax=ax[1, 0], color="#5583A8")

    # Outliers by the 1.5 * IQR rule.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    lower_outliers = values[values < lower_bound]
    upper_outliers = values[values > upper_bound]
    total_outliers = len(lower_outliers) + len(upper_outliers)
    # Percentage is relative to the observed (non-missing) values.
    percent_outliers = 100 * total_outliers / len(values)
    label_text = (
        f"Lower Outliers: {len(lower_outliers)}\n"
        f"Upper Outliers: {len(upper_outliers)}\n"
        f"Total: {total_outliers} ({percent_outliers:.1f}%)"
    )
    # The patch only serves as a carrier for the text in the legend box.
    patch = mpatches.Patch(color='skyblue', label=label_text)
    ax[1, 0].legend(handles=[patch], fontsize=12, loc='upper left', bbox_to_anchor=(1.05, 1))

    # Normalised histogram ("PDF") and its running sum ("CDF").
    counts, bin_edges = np.histogram(values, bins=10, density=True)
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)

    # Merge the two right-hand cells into one axes instead of laying a new
    # subplot over them, which can leave empty axes behind.
    grid = ax[0, 1].get_gridspec()
    ax[0, 1].remove()
    ax[1, 1].remove()
    ax_dist = fig.add_subplot(grid[:, 1])
    ax_dist.plot(bin_edges[1:], pdf, label='PDF')
    ax_dist.plot(bin_edges[1:], cdf, label='CDF')
    ax_dist.legend()
    ax_dist.tick_params(axis='x', rotation=45)

    ax[0, 0].set_xticklabels([])
    ax[1, 0].set_yticklabels([])
    ax[0, 0].set_xlabel('')
    ax[0, 0].set_ylabel('Count')
    fig.suptitle(col_name, fontsize=30)
    plt.tight_layout()
    plt.show()

    info_df = pd.DataFrame.from_dict(info, orient='index', columns=[''])
    print('=' * 18 + ' ' + col_name + ' ' + '=' * 18)
    print(info_df)
    print('=' * 40)

In [None]:
def categorize_numerical_col(df, col_name, bins, bins_name):
    """Bin a numerical column into labelled categories.

    The binned values are written to a new column named
    ``<col_name>_categorized`` on ``df`` itself (the frame is modified in
    place), and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Column to bin.
    bins
        Bin specification forwarded to ``pd.cut``.
    bins_name
        Labels forwarded to ``pd.cut`` as ``labels``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra categorized column.
    """
    binned = pd.cut(df[col_name], bins=bins, labels=bins_name)
    df[f'{col_name}_categorized'] = binned
    return df

In [None]:
def describe_categorical_col(df, col_name):
    """Show a donut chart and a frequency table for a categorical column.

    Counts come from ``value_counts()``; the printed table has the columns
    ``Group``, ``Count`` and ``%`` (share of the counted values, rounded to
    two decimals).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Name of the categorical column to describe.
    """
    # Frequency table: one row per distinct value.
    counts = df[col_name].value_counts().rename_axis('Group').reset_index(name='Count')
    grand_total = counts['Count'].sum()
    counts['%'] = (counts['Count'] / grand_total * 100).round(2)

    # Donut chart with one slice per group.
    pie_options = dict(
        names='Group',
        values='Count',
        title=f'<b>Distribution of {col_name} in each group</b>',
        color='Group',
        hole=0.3,
    )
    fig = px.pie(counts, **pie_options)
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.update_layout(
        title_x=0.5,
        legend_title_text='Groups',
        font=dict(family="Arial, sans-serif", size=14),
    )
    fig.show()

    # Text report framed by separator lines.
    rule = '=' * 18
    print(' '.join([rule, col_name, rule]))
    print(counts)
    print('=' * 40)

### Bivariate Plots

In [None]:
def _is_categorical_col(series):
    """Return True when a column holds categorical / text data rather than numbers."""
    dtype = series.dtype
    # Use pandas' dtype predicates instead of comparing the dtype with strings:
    # this also recognises the pandas string extension dtype, which is neither
    # 'object' nor 'category'.
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


def describe_target_relationship(df, feature_col, target_col):
    """Dispatch to the relationship report that matches the two column types.

    * categorical feature, categorical target -> ``describe_cat_cat_relationship``
    * one categorical, one numerical column   -> ``describe_cat_num_relationship``
      (always called with the categorical column first)
    * numerical feature, numerical target     -> ``describe_num_num_relationship``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    feature_col : str
        Name of the feature column.
    target_col : str
        Name of the target column.
    """
    # NOTE(review): describe_cat_num_relationship and describe_num_num_relationship
    # are not defined in the visible part of this notebook, which only has
    # stattest_cat_num_relationship / stattest_num_num_relationship with matching
    # signatures - confirm these names exist before relying on this dispatcher.
    feature_is_cat = _is_categorical_col(df[feature_col])
    target_is_cat = _is_categorical_col(df[target_col])

    if feature_is_cat and target_is_cat:
        describe_cat_cat_relationship(df, feature_col, target_col)
    elif feature_is_cat:
        describe_cat_num_relationship(df, feature_col, target_col)
    elif target_is_cat:
        describe_cat_num_relationship(df, target_col, feature_col)
    else:
        describe_num_num_relationship(df, feature_col, target_col)

In [None]:
def describe_cat_cat_relationship(df, col1, col2):
    """Report the association between two categorical columns.

    Runs a chi-square test of independence on the ``col1`` x ``col2``
    contingency table, derives Cramér's V, draws four panels (count table,
    100% stacked bars, row-normalised table, grouped count bars) and prints
    the table together with the test results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    col1 : str
        Column used for the rows of the contingency table.
    col2 : str
        Column used for the columns of the contingency table.
    """
    # Contingency table (raw counts and row-wise proportions).
    contingency_table = pd.crosstab(df[col1], df[col2])
    normalized = pd.crosstab(df[col1], df[col2], normalize='index')

    # Chi-square test of independence.
    chi2, p_value, dof, _ = stats.chi2_contingency(contingency_table)

    # Cramér's V; undefined when either variable has a single level, because
    # the denominator is then zero.
    n = contingency_table.sum().sum()
    min_dim = min(contingency_table.shape) - 1
    cramers_v = np.sqrt(chi2 / (n * min_dim)) if n > 0 and min_dim > 0 else float('nan')

    if np.isnan(cramers_v):
        strength = 'undefined'
    elif cramers_v > 0.3:
        strength = 'strong'
    elif cramers_v > 0.1:
        strength = 'moderate'
    else:
        strength = 'weak'

    info = {
        'chi2_statistic': f'{chi2:.5f}',
        'p_value': f'{p_value:.5f}',
        'degrees_of_freedom': dof,
        'cramers_v': f'{cramers_v:.5f}',
        'association_strength': strength,
        'significant_association': p_value < 0.05
    }

    # Draw the plots.
    fig, axs = plt.subplots(2, 2, figsize=(14, 10))

    # Heatmap of contingency table (single white colour: it is used as a table).
    sns.heatmap(contingency_table, annot=True, cbar=False, fmt='d', cmap=['White'], ax=axs[0, 0],
                linecolor='lightgray', linewidths=0.7)
    axs[0, 0].set_title(f'Contingency Table: {col1} vs {col2}')

    # Stacked bar chart 100%. The legend entries come from the actual col2
    # categories instead of a fixed No/Yes pair.
    normalized.plot.bar(stacked=True, ax=axs[0, 1])
    axs[0, 1].set_title(f'Stacked Bar: {col1} vs {col2}')
    axs[0, 1].legend(title=col2)
    # Rotate the ticks of this panel explicitly; plt.xticks would act on
    # whichever axes happens to be current.
    axs[0, 1].tick_params(axis='x', rotation=45)

    # Normalized heatmap (percentages)
    sns.heatmap(normalized, annot=True, fmt='.2%', cbar=False, cmap=['White'], ax=axs[1, 0],
                linecolor='lightgray', linewidths=0.7)
    axs[1, 0].set_title('Normalized Contingency Table')

    # Count plots side by side
    contingency_table.plot(kind='bar', ax=axs[1, 1])
    axs[1, 1].set_title('Count Comparison')
    axs[1, 1].legend(title=col2)
    axs[1, 1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print('=' * 15 + f' {col1} vs {col2} ' + '=' * 15)
    print("Contingency Table:")
    print(contingency_table)
    print("\nChi-square Test Results:")
    info_df = pd.DataFrame.from_dict(info, orient='index', columns=[''])
    print(info_df)
    print('=' * 50)


### Statistical test functions

In [None]:
def stattest_num_num_relationship(df, col1, col2):
    """Test and plot the correlation between two numerical columns.

    Each column is checked for normality with Shapiro-Wilk. If both look
    normal (p > 0.05) Pearson's correlation is used, otherwise Spearman's.
    A scatter plot and a hexbin plot are drawn and a result table is printed.

    Rows where either column is missing are dropped pairwise before testing
    and plotting, so NaN does not propagate into the statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    col1 : str
        First numerical column (x axis).
    col2 : str
        Second numerical column (y axis).

    Raises
    ------
    ValueError
        If fewer than 3 complete pairs remain (Shapiro-Wilk needs at least 3).
    """
    # Keep only rows where both values are present.
    complete = df[col1].notna() & df[col2].notna()
    x = df[col1][complete]
    y = df[col2][complete]
    if len(x) < 3:
        raise ValueError(
            f'Need at least 3 complete ({col1}, {col2}) pairs, got {len(x)}'
        )

    # Shapiro-Wilk normality test.
    shapiro1_stat, shapiro1_p = stats.shapiro(x)
    shapiro2_stat, shapiro2_p = stats.shapiro(y)

    # p > 0.05 is treated as normal.
    is_normal1 = shapiro1_p > 0.05
    is_normal2 = shapiro2_p > 0.05
    both_normal = is_normal1 and is_normal2

    # Choose the appropriate test: Pearson for normal data, Spearman otherwise.
    if both_normal:
        correlation, p_value = stats.pearsonr(x, y)
        test_used = 'Pearson'
    else:
        correlation, p_value = stats.spearmanr(x, y)
        test_used = 'Spearman'

    r2_score = correlation ** 2

    info = {
        'shapiro_p_col1': f'{shapiro1_p:.5f}',
        'shapiro_p_col2': f'{shapiro2_p:.5f}',
        'col1_normal': is_normal1,
        'col2_normal': is_normal2,
        'test_used': test_used,
        'correlation': f'{correlation:.5f}',
        'r_squared': f'{r2_score:.5f}',
        'p_value': f'{p_value:.5f}',
        'significant': p_value < 0.05,
        'relationship_strength': 'strong' if abs(correlation) > 0.7 else 'moderate' if abs(
            correlation) > 0.3 else 'weak',
        'relationship_direction': 'positive' if correlation > 0 else 'negative'
    }

    # Draw the plots.
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    fig.suptitle(f'{col1} vs {col2} - {test_used} Correlation', fontsize=14)

    # Scatter Plot
    axes[0].scatter(x, y,
                    s=60, alpha=0.7, color='blue', edgecolors='black', linewidth=0.5)
    axes[0].set_title('Scatter Plot')
    axes[0].set_xlabel(col1)
    axes[0].set_ylabel(col2)
    axes[0].grid(True, alpha=0.3)

    # Hexbin Plot
    axes[1].hexbin(x, y, gridsize=20, cmap='Blues', mincnt=1)
    axes[1].set_title('Hexbin Plot')
    axes[1].set_xlabel(col1)
    axes[1].set_ylabel(col2)
    axes[1].yaxis.set_label_position("right")

    plt.tight_layout()
    plt.show()

    info_df = pd.DataFrame.from_dict(info, orient='index', columns=[''])
    print('=' * 15 + f' {col1} vs {col2} ' + '=' * 15)
    print(info_df)
    print('=' * 50)

In [None]:
def stattest_cat_num_relationship(df, cat_col, num_col):
    """Test and plot how a numerical column differs across categories.

    Every group of ``cat_col`` is checked for normality with Shapiro-Wilk.
    If all groups look normal (p > 0.05) a one-way ANOVA is run; otherwise the
    Alexander-Govern test is used when SciPy provides it, with Kruskal-Wallis
    as the fallback. Box, violin, mean-bar and count plots are drawn and the
    group statistics, normality results and test results are printed.

    Missing values of ``num_col`` are excluded from the tests, and groups
    without any observed value are left out of the comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    cat_col : str
        Categorical (grouping) column.
    num_col : str
        Numerical column compared across the groups.
    """
    # Per-group descriptive statistics.
    groups = df.groupby(cat_col)[num_col].describe()

    # Shapiro-Wilk normality test for every group.
    normality_results = {}
    group_data = []
    all_normal = True

    for name, group in df.groupby(cat_col):
        values = group[num_col].dropna()
        if len(values) >= 3:  # Shapiro-Wilk needs at least 3 observations
            shapiro_stat, shapiro_p = stats.shapiro(values)
            is_normal = shapiro_p > 0.05
            normality_results[name] = {
                'shapiro_p': f'{shapiro_p:.5f}',
                'is_normal': is_normal
            }
            if not is_normal:
                all_normal = False
        else:
            normality_results[name] = {
                'shapiro_p': 'N/A (too few data)',
                'is_normal': False
            }
            all_normal = False

        # An empty sample cannot take part in any of the tests below.
        if len(values) > 0:
            group_data.append(values.values)

    # Choose the appropriate test.
    if all_normal:
        f_stat, p_value = stats.f_oneway(*group_data)
        test_used = 'ANOVA'
    else:
        # alexandergovern is missing from older SciPy releases; fall back to
        # the non-parametric Kruskal-Wallis test in that case.
        alexandergovern = getattr(stats, 'alexandergovern', None)
        if alexandergovern is not None:
            result = alexandergovern(*group_data)
            f_stat, p_value = result.statistic, result.pvalue
            # Name the test that is actually run (it is not Welch's ANOVA).
            test_used = 'Alexander-Govern'
        else:
            f_stat, p_value = stats.kruskal(*group_data)
            test_used = 'Kruskal-Wallis (non-parametric)'

    info = {
        'test_used': test_used,
        'all_groups_normal': all_normal,
        'f_statistic': f'{f_stat:.5f}',
        'p_value': f'{p_value:.5f}',
        'significant_difference': p_value < 0.05,
        'num_categories': df[cat_col].nunique(),
        # Number of observations that actually entered the test.
        'total_observations': sum(len(sample) for sample in group_data)
    }

    # Draw the plots on explicit axes (one 2x2 grid, created once).
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Box plot
    sns.boxplot(data=df, x=cat_col, y=num_col, ax=axes[0, 0])
    axes[0, 0].set_title(f'{num_col} by {cat_col}')
    axes[0, 0].tick_params(axis='x', rotation=45)

    # Violin plot
    sns.violinplot(data=df, x=cat_col, y=num_col, ax=axes[0, 1])
    axes[0, 1].set_title(f'{num_col} Distribution by {cat_col}')
    axes[0, 1].tick_params(axis='x', rotation=45)

    # Bar plot of means
    means = df.groupby(cat_col)[num_col].mean().sort_values(ascending=False)
    sns.barplot(x=means.index, y=means.values, ax=axes[1, 0])
    axes[1, 0].set_title(f'Mean {num_col} by {cat_col}')
    axes[1, 0].tick_params(axis='x', rotation=45)

    # Count plot of categories
    sns.countplot(data=df, x=cat_col, ax=axes[1, 1])
    axes[1, 1].set_title(f'Count of {cat_col}')
    axes[1, 1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print('=' * 15 + f' {cat_col} vs {num_col} ' + '=' * 15)
    print("Group Statistics:")
    print(groups)
    print(f"\nNormality Test Results (Shapiro-Wilk):")
    for group_name, result in normality_results.items():
        print(f"{group_name}: p-value = {result['shapiro_p']}, Normal = {result['is_normal']}")
    print(f"\n{test_used} Test Results:")
    info_df = pd.DataFrame.from_dict(info, orient='index', columns=[''])
    print(info_df)
    print('=' * 50)



In [None]:
def categorical_tests(test_type="vs-target", data=None, column=None, target=None, alpha=0.05):
    """Run chi-square tests between a categorical column and a target column.

    Parameters
    ----------
    test_type : {"vs-target", "vs-categories"}
        ``"vs-target"``: one chi-square test on the full ``column`` x ``target``
        contingency table; the result is printed as JSON.
        ``"vs-categories"``: one chi-square test per pair of categories of
        ``column`` (rows restricted to that pair), printing for each pair
        whether the target distribution differs at level ``alpha``.
    data : pandas.DataFrame
        Frame that contains ``column`` and ``target``.
    column : str
        Categorical column whose categories are compared.
    target : str
        Target column.
    alpha : float, default 0.05
        Significance level used in the ``"vs-categories"`` mode.

    Raises
    ------
    ValueError
        If ``test_type`` is not one of the supported modes.
    """
    if test_type == "vs-target":
        con_table = pd.crosstab(data[column], data[target])
        stat, p_value, degree, _ = stats.chi2_contingency(con_table)
        result = {
            "categories vs target": test_type, "chi2_stat": stat, "p_value": p_value,
            "degrees_of_freedom": degree,
            "contingency_table": con_table.to_dict()
        }
        print('=' * 48)
        # json.dumps is used only for prettier printing of the nested dict.
        print(json.dumps(result, indent=3, sort_keys=False, default=str))
        print('=' * 40)

    elif test_type == "vs-categories":
        # Work on the frame that was passed in, not on a global `df`.
        # Missing values are not a category to compare against.
        categories = data[column].dropna().unique()
        for group1, group2 in combinations(categories, 2):
            subset = data[data[column].isin([group1, group2])]
            contingency_table = pd.crosstab(subset[column], subset[target])
            chi2, p_value, degree, _ = stats.chi2_contingency(contingency_table)

            verdict = "Significant" if p_value < alpha else "No significant"
            print(f"{verdict} difference between '{group1}' and '{group2}': \nChi2={chi2:.4f}, p-value={p_value:.4f}")

    else:
        raise ValueError('test_type should either be "vs-target" or "vs-categories"')

### KPI Plots

In [None]:
def draw_heatmap_groups(df, group1, group2, kpi='Conversion Rate'):
    """Draw a heatmap of a KPI for every combination of two grouping columns.

    The data is aggregated with ``get_grouped_data(df, [group1, group2])``,
    pivoted to ``group1`` (rows) x ``group2`` (columns), and combinations
    without a value are shown as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data passed to ``get_grouped_data``.
    group1 : str
        Grouping column shown on the rows.
    group2 : str
        Grouping column shown on the columns.
    kpi : {'Conversion Rate', 'CPC'}, default 'Conversion Rate'
        KPI column of the grouped data to display.

    Raises
    ------
    ValueError
        If ``kpi`` is not one of the supported values.
    """
    if kpi not in ['Conversion Rate', 'CPC']:
        raise ValueError('The KPI value should either be "Conversion Rate" or "CPC"')

    # presumably returns one row per (group1, group2) with a column per KPI -
    # get_grouped_data is defined outside this view; verify against it.
    tmp = get_grouped_data(df, [group1, group2])

    heatmap_data = tmp.pivot_table(
        index=group1,
        columns=group2,
        values=kpi
    ).fillna(0)

    plt.figure(figsize=(18, 8))
    sns.heatmap(
        heatmap_data,
        cmap='YlGnBu',
        annot=True,
        linewidths=0.7,
        linecolor='lightgray'
    )
    # Title names the KPI that is actually plotted.
    plt.title(f'Heatmap of {kpi} by {group1}/{group2}', fontsize=18, weight='bold')
    plt.show()

In [None]:
def kpi_barplot(df, group_col, kpi = 'Conversion Rate'):
    """Draw an annotated bar chart of a KPI per group.

    The data is aggregated with ``get_grouped_data(df, group_col)`` and each
    bar is labelled with its height: two decimals followed by ``%`` for
    'Conversion Rate', four decimals followed by ``$`` for 'CPC'.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data passed to ``get_grouped_data``.
    group_col : str
        Grouping column shown on the x axis.
    kpi : {'Conversion Rate', 'CPC'}, default 'Conversion Rate'
        KPI column of the grouped data shown on the y axis.

    Raises
    ------
    ValueError
        If ``kpi`` is not one of the supported values.
    """
    supported_kpis = ('Conversion Rate', 'CPC')
    if kpi not in supported_kpis:
        raise ValueError('The KPI value should either be "Conversion Rate" or "CPC"')

    grouped = get_grouped_data(df, group_col)

    plt.figure(figsize=(12, 7))
    axis = sns.barplot(x=group_col, y=kpi, data=grouped, palette='viridis')
    axis.set_title(f'{kpi} by {group_col}')

    # Label every bar with its value, centred slightly above the bar top.
    show_as_percent = kpi == 'Conversion Rate'
    for bar in axis.patches:
        height = bar.get_height()
        if show_as_percent:
            label = f'{height:.2f}%'
        else:
            label = f'{height:.4f}$'
        anchor = (bar.get_x() + bar.get_width() / 2., height)
        axis.annotate(
            label,
            anchor,
            ha='center',
            va='center',
            fontsize=11,
            color='black',
            xytext=(0, 10),
            textcoords='offset points',
        )
    plt.show()

