In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import warnings
from scipy.stats import chi2_contingency

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides warnings raised by the libraries
# used below - confirm that this is intended rather than a narrower filter.
warnings.filterwarnings('ignore')
# Use seaborn's "whitegrid" style for the plots drawn later in the notebook.
sns.set_style("whitegrid")

In [23]:
class Analyzer:
    """
        Two-sample comparison helper.

        Typical flow:
            1. ``fit(a, b)`` converts both inputs to numpy arrays and stores them.
            2. ``analyze()`` runs a Shapiro-Wilk normality check on each group and
               then an independent t-test (both groups look normal) or a
               Mann-Whitney U test (otherwise).
            3. ``interpret()`` prints whether H0 is rejected.
    """

    def __init__(self):
        # Prepared samples; both stay None until fit() is called.
        self.A = None
        self.B = None
        # Name of the test selected by analyze(): 't-test' or 'mann-whitney-u'.
        self.testtype = None
        # Result returned by the scipy test; None until analyze() has run.
        self.result = None

    def _require_fitted(self):
        """
            Raise RuntimeError when fit() has not supplied both groups yet.
        """
        if self.A is None or self.B is None:
            raise RuntimeError("A and B are empty. You must specify data group a and b with object.fit(a, b).")

    def fit(self, a, b, column_a=None, column_b=None):
        """
            Convert both inputs with format_prepare, check that they have the same
            length and store them as groups A and B.

            Parameters:
                a, b: list, tuple, numpy array, pandas Series or pandas DataFrame.
                column_a, column_b: column to take when the matching input is a
                    DataFrame with more than one column.

            Raises:
                ValueError: the prepared groups differ in length, or a DataFrame
                    column is missing / not specified.
                TypeError: an input has an unsupported type.
        """
        format_checked_a = self.format_prepare(a, column_a)
        format_checked_b = self.format_prepare(b, column_b)

        if len(format_checked_a) != len(format_checked_b):
            raise ValueError("x and y must have the same length.")
        self.A = format_checked_a
        self.B = format_checked_b
        # A result computed for previously fitted data must not be reported for
        # the new data, so any earlier analysis is discarded here.
        self.testtype = None
        self.result = None

    def summary_check(self):
        """
            Return a dict with the shape and dtype of both stored groups.

            Raises:
                RuntimeError: fit() has not been called yet.
        """
        self._require_fitted()
        return {
            "x_shape": self.A.shape,
            "y_shape": self.B.shape,
            "x_dtype": self.A.dtype,
            "y_dtype": self.B.dtype
        }

    def format_prepare(self, data, column=None):
        """
            Convert the supported input types to a numpy array.

            - numpy array: returned as is, or flattened when it has more than one
              dimension.
            - pandas Series: its underlying values.
            - pandas DataFrame: the values of ``column``; when ``column`` is None
              the frame must have exactly one column.
            - list or tuple: converted with np.array.

            Raises:
                ValueError: DataFrame with several columns and no ``column``, or
                    ``column`` not present in the DataFrame.
                TypeError: any other input type.
        """
        if isinstance(data, np.ndarray):
            if data.ndim > 1:
                return data.flatten()
            return data

        elif isinstance(data, pd.Series):
            return data.values

        elif isinstance(data, pd.DataFrame):
            if column is None:
                if data.shape[1] == 1:
                    return data.iloc[:, 0].values
                else:
                    raise ValueError("DataFrame has multiple columns. Please specify 'column'.")
            else:
                if column not in data.columns:
                    raise ValueError(f"Column '{column}' not found in DataFrame.")
                return data[column].values
        elif isinstance(data, (list, tuple)):
            return np.array(data)

        else:
            raise TypeError("Unsupported data type. Use list, numpy array, pandas Series, or DataFrame.")

    def analyze(self, alpha=0.05):
        """
            Check both groups for normality and run the matching two-sample test.

            A group is treated as normal when its Shapiro-Wilk p-value is greater
            than ``alpha``. When both groups are normal an independent t-test is
            run, otherwise a Mann-Whitney U test. The outcome is stored in
            ``self.result`` and the chosen test name in ``self.testtype``.

            Parameters:
                alpha: significance level for the normality check (default 0.05).

            Raises:
                RuntimeError: fit() has not been called yet.
                Errors raised by scipy for unusable samples are not caught here.
        """
        self._require_fitted()

        # shapiro returns (statistic, pvalue); only the p-value drives the choice.
        _, p_normal_a = stats.shapiro(self.A)
        _, p_normal_b = stats.shapiro(self.B)
        normal = bool(p_normal_a > alpha and p_normal_b > alpha)
        if normal:
            self.__initial_test('t-test')
        else:
            self.__initial_test('mann-whitney-u')
        #TODO: Check for independent or paired (usually user should specify that).

    def __initial_test(self, test):
        """
            Run the named test on the stored groups and record result and test name.

            Raises:
                ValueError: ``test`` is neither 't-test' nor 'mann-whitney-u'.
        """
        if test == 't-test':
            self.result = stats.ttest_ind(self.A, self.B)
        elif test == 'mann-whitney-u':
            self.result = stats.mannwhitneyu(self.A, self.B)
        else:
            raise ValueError(f"Unknown test '{test}'.")
        self.testtype = test

    def interpret(self, alpha=0.05):
        """
            Print the decision about H0 for the stored test result.

            Parameters:
                alpha: significance level compared against the p-value (default 0.05).

            Raises:
                RuntimeError: fit() or analyze() has not been called yet.
        """
        self._require_fitted()
        if self.result is None:
            raise RuntimeError("No test result available. Call object.analyze() before object.interpret().")
        # Both scipy results unpack as (statistic, pvalue).
        _, p_val = self.result
        if p_val < alpha:
            print('Rejecting H0 hypothesis')
        else:
            print('Cannot reject H0.')

    @staticmethod
    def __plot_single_numeric(data, name):
        """
            Draw a histogram with KDE, a horizontal box plot and PDF/CDF curves
            for one numeric sample, using ``name`` as the figure title.

            Declared as a staticmethod because it takes no ``self``; without the
            decorator a call through an instance would pass the instance as ``data``.
        """
        fig, ax = plt.subplots(2, 2, figsize=(16, 7), gridspec_kw={'height_ratios':(.85, .15)})
        sns.histplot(data, kde=True, ax=ax[0, 0], color='#55A868')
        sns.boxplot(data, orient='h', ax=ax[1, 0], color="#5583A8")
        counts, bin_edges = np.histogram(data, bins=10, density = True)
        # Rescale the bin heights so they sum to 1, then accumulate them for the CDF.
        pdf = counts / (sum(counts))
        cdf = np.cumsum(pdf)
        # Position 2 of a 1x2 layout; the pyplot calls below draw on these axes.
        ax[1, 1] = plt.subplot(122)
        plt.plot(bin_edges[1:], pdf, label='PDF')
        plt.plot(bin_edges[1:], cdf, label='CDF')
        plt.legend()
        ax[0, 0].set_xticklabels([])
        ax[1, 0].set_yticklabels([])
        ax[0, 0].set_xlabel('')
        ax[0, 0].set_ylabel('Count')
        fig.suptitle(name, fontsize=30)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    def __plot_groups_numerical(self):
        """
            Placeholder for the group comparison plots.

            Only the 2x2 figure is created; the plotting code below is still
            commented out because the data layout it needs is not decided yet.
        """
        fig, ax = plt.subplots(2, 2, figsize=(15, 10))
        #TODO: the structure of data must be specified for this function.
        # # First plot: A histogram of two groups
        # sns.histplot(data=[self.A, self.B]],
        #             x=num_col, hue=target_col,
        #             kde=True,
        #             ax=ax[0, 0],
        #             element='step',
        #             palette='Pastel1',
        #             alpha=0.7)
        # ax[0, 0].set_xlabel('')
        # legend = ax[0, 0].get_legend()
        # handles = legend.legend_handles
        # ax[0, 0].legend(handles, ['No', 'Yes'], title=target_col)

        # # Second plot: Box plots of two Groups
        # sns.boxplot(data = df, x=target_col, y=num_col, palette='Pastel1', ax=ax[0, 1])
        # ax[0, 1].set_xticklabels([])
        # ax[0, 1].set_xlabel('')
        # # Third plot: Violin plot
        # sns.violinplot(data=df, x=target_col, y=num_col, palette='Pastel1', ax=ax[1, 1])
        # ax[1, 1].set_xticks([0, 1], ['No', 'Yes'])
        # # Fourth plot: Q-Q plot
        # stats.probplot(df[num_col], plot=ax[1, 0])
        # fig.tight_layout()
        # plt.show()

    def visual_description(self):
        """
            Intended to draw plots describing the two groups, either alongside
            interpret() or on its own.

            Currently it only checks that data has been fitted; no plot is drawn.

            Raises:
                RuntimeError: fit() has not been called yet.
        """
        self._require_fitted()



In [34]:
# First sample: a plain Python list.
x = [22, 23, 24, 25]
# Second sample lives in a DataFrame; a later cell passes its "score" column.
df = pd.DataFrame({
    "age": [20, 22, 25, 30],
    "score": [70, 80, 90, 85]
})

# Analyzer instance shared by the following cells.
stat_ob = Analyzer()

In [36]:
# Store the list `x` and the "score" column (a pandas Series) as the two groups.
stat_ob.fit(x, df["score"])

In [37]:
# Display the shape and dtype of both stored groups as the cell output.
stat_ob.summary_check()

{'x_shape': (4,),
 'y_shape': (4,),
 'x_dtype': dtype('int64'),
 'y_dtype': dtype('int64')}