# In [None]:
#!/usr/bin/env python3
"""
analysers.py

A module providing object-oriented classes for statistical analysis:
 - Correlation (Pearson, Spearman)
 - Mendelian Randomization (2-stage least squares)
 - Causality tests (Conditional Mutual Information, Transfer Entropy, Granger Causality)

Usage (from shell):
    python src/analysers.py --help

Or import in a notebook:
    from src.analysers import CorrelationAnalyser, RandomizationAnalyser, CausalityAnalyser
"""

import numpy as np                                  # Numerical operations
import pandas as pd                                 # DataFrame handling
from sklearn.preprocessing import KBinsDiscretizer   # Discretize continuous data
from statsmodels.tsa.stattools import grangercausalitytests  # Granger causality tests


class BaseAnalyser:
    """
    Common parent of every analyser in this module.

    An instance only keeps a reference to the object it was constructed with,
    exposed as ``self.df``; subclasses read their data from that attribute.
    """
    def __init__(self, df: pd.DataFrame):
        """
        Attach the data set to the analyser.

        :param df: pandas DataFrame holding the columns to analyse
                   (stored by reference, not copied).
        """
        # Keep the caller's object as-is so later changes to it stay visible here
        self.df = df


class CorrelationAnalyser(BaseAnalyser):
    """
    Pairwise correlation between two columns of the stored DataFrame.

    Both public methods build the 2x2 correlation matrix of the selected
    columns with ``DataFrame.corr`` and return its off-diagonal entry.
    """
    def _pair_coefficient(self, first: str, second: str, method: str) -> float:
        """
        Return the correlation of two columns under the given pandas method.

        :param first: name of the first column
        :param second: name of the second column
        :param method: value passed to ``DataFrame.corr(method=...)``
        :return: entry (row 0, column 1) of the 2x2 correlation matrix
        """
        pair = self.df[[first, second]]
        matrix = pair.corr(method=method)
        # Row 0 / column 1 is the cross term between the two selected columns
        return matrix.iloc[0, 1]

    def pearson(self, x: str, y: str) -> float:
        """
        Pearson (linear) correlation coefficient of columns x and y.

        :param x: name of the first numeric column
        :param y: name of the second numeric column
        :return: Pearson r (float)
        """
        return self._pair_coefficient(x, y, 'pearson')

    def spearman(self, x: str, y: str) -> float:
        """
        Spearman rank correlation coefficient of columns x and y.

        :param x: name of the first column
        :param y: name of the second column
        :return: Spearman rho (float)
        """
        return self._pair_coefficient(x, y, 'spearman')


class RandomizationAnalyser(BaseAnalyser):
    """
    Instrumental-variable (Mendelian randomization) analysis on the stored
    DataFrame, carried out as two consecutive ordinary least squares fits.
    """
    def mendelian_randomization(self, exposure: str, outcome: str, instrument: str):
        """
        Run a manual two-stage least squares estimate.

        Stage 1 fits ``exposure ~ const + instrument``; stage 2 fits
        ``outcome ~ const + predicted exposure`` using the stage-1 predictions.

        NOTE: stage 2 is a plain OLS on the fitted values, so the standard
        errors it reports treat the predicted exposure as if it were observed.

        :param exposure: name of the exposure column
        :param outcome: name of the outcome column
        :param instrument: name of the genetic instrument column
        :return: fitted statsmodels results object of the stage-2 regression
        """
        import statsmodels.api as sm

        # Only rows where all three variables are present take part in the fit
        complete = self.df.dropna(subset=[exposure, outcome, instrument])

        # First stage: explain the exposure by the instrument (plus intercept)
        first_design = sm.add_constant(complete[instrument])
        first_stage = sm.OLS(complete[exposure], first_design).fit()
        predicted_exposure = first_stage.predict(first_design)

        # Second stage: explain the outcome by the predicted exposure (plus intercept)
        second_design = sm.add_constant(predicted_exposure)
        return sm.OLS(complete[outcome], second_design).fit()


class CausalityAnalyser(BaseAnalyser):
    """
    Analyser for various causality metrics:
     - Conditional Mutual Information (CMI)
     - Transfer Entropy (TE)
     - Granger Causality (GC)
    """
    @staticmethod
    def _cmi_from_frame(data: pd.DataFrame, n_bins: int) -> float:
        """
        Plug-in estimate of I(X; Y | Z) from a frame whose columns are (X, Y, Z).

        Each column is cut into ``n_bins`` equal-width bins and the estimate is
        computed from the joint bin counts, using the natural logarithm.

        :param data: three-column DataFrame, ordered X, Y, Z, without missing values
        :param n_bins: number of bins for discretization
        :return: estimated conditional mutual information
        :raises ValueError: if ``data`` has no rows
        """
        from collections import Counter

        if data.empty:
            raise ValueError(
                "no complete observations available to estimate conditional mutual information"
            )

        # Discretize each variable into integer bin codes
        disc = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
        codes = disc.fit_transform(data.to_numpy()).astype(int)
        x_codes, y_codes, z_codes = (column.tolist() for column in codes.T)
        n = len(z_codes)

        # Joint and marginal occurrence counts
        n_xyz = Counter(zip(x_codes, y_codes, z_codes))
        n_xz = Counter(zip(x_codes, z_codes))
        n_yz = Counter(zip(y_codes, z_codes))
        n_z = Counter(z_codes)

        # sum_{x,y,z} p(x,y,z) * log( p(x,y,z) p(z) / (p(x,z) p(y,z)) ).
        # The 1/n factors cancel inside the log, so the ratio is formed from raw
        # counts. Only observed cells are visited, hence every count is positive
        # and no epsilon is needed to protect the logarithm.
        cmi = 0.0
        for (xi, yi, zi), count in n_xyz.items():
            ratio = (count * n_z[zi]) / (n_xz[(xi, zi)] * n_yz[(yi, zi)])
            cmi += (count / n) * np.log(ratio)
        return float(cmi)

    def conditional_mutual_information(self, x: str, y: str, z: str, n_bins: int = 10) -> float:
        """
        Estimate I(X; Y | Z) by discretizing X, Y, Z into bins.

        :param x: name of variable X
        :param y: name of variable Y
        :param z: name of conditioning variable Z
        :param n_bins: number of bins for discretization
        :return: estimated conditional mutual information
        :raises ValueError: if no row has all three values present
        """
        # Select the three columns and keep only complete rows
        data = self.df[[x, y, z]].dropna()
        return self._cmi_from_frame(data, n_bins)

    def transfer_entropy(self, source: str, target: str, lag: int = 1, n_bins: int = 10) -> float:
        """
        Estimate Transfer Entropy TE(source→target) ≈ I(source_{t-lag}; target_t | target_{t-lag})

        :param source: name of source time series
        :param target: name of target time series
        :param lag: lag order (number of rows to look back), must be >= 1
        :param n_bins: number of bins for discretization
        :return: estimated transfer entropy
        :raises ValueError: if ``lag`` is smaller than 1 or no complete lagged row remains
        """
        if lag < 1:
            raise ValueError("lag must be a positive integer")

        source_series = self.df[source]
        target_series = self.df[target]

        # Build the lagged variables in a fresh frame with fixed internal names,
        # so the estimate does not depend on (or collide with) columns of self.df.
        # Shifting happens before incomplete rows are dropped, so "lag" always
        # refers to `lag` rows back in the stored DataFrame.
        lagged = pd.DataFrame({
            'source_lag': source_series.shift(lag),
            'target': target_series,
            'target_lag': target_series.shift(lag),
        }).dropna()

        # Column order is (X, Y, Z) = (source_{t-lag}, target_t, target_{t-lag})
        return self._cmi_from_frame(lagged, n_bins)

    def granger_causality(self, source: str, target: str, maxlag: int = 1, **kwargs):
        """
        Perform Granger causality test: does `source` help predict `target`?

        :param source: name of source series
        :param target: name of target series
        :param maxlag: maximum lag to test
        :param kwargs: extra keyword arguments forwarded to
                       ``grangercausalitytests`` (``verbose`` defaults to False)
        :return: dictionary of test results per lag
        """
        data = self.df[[target, source]].dropna()
        # Column order matters: the array is laid out as [[target, source], ...]
        arr = data.values
        # Stay quiet unless the caller explicitly asks for printed output
        kwargs.setdefault('verbose', False)
        return grangercausalitytests(arr, maxlag=maxlag, **kwargs)


if __name__ == "__main__":
    # Command-line entry point: load a CSV file and run one analysis on it.
    # Example CLI: python src/analysers.py --input data.csv --mode pearson --x col1 --y col2
    import argparse

    parser = argparse.ArgumentParser(description="Run statistical analysers on a CSV file")
    parser.add_argument("--input", "-i", required=True,
                        help="Path to input CSV file")
    parser.add_argument("--mode", "-m", required=True,
                        choices=["pearson", "spearman", "mr", "cmi", "te", "gc"],
                        help="Analysis mode")
    parser.add_argument("--x", help="Column X (for correlation, CMI, TE, GC)")
    parser.add_argument("--y", help="Column Y (for correlation, CMI, TE, GC)")
    parser.add_argument("--z", help="Column Z (for CMI)")
    parser.add_argument("--instrument", help="Instrument column (for MR)")
    parser.add_argument("--exposure", help="Exposure column (for MR)")
    parser.add_argument("--outcome", help="Outcome column (for MR)")
    parser.add_argument("--lag", type=int, default=1, help="Lag for TE/GC")
    parser.add_argument("--bins", type=int, default=10, help="Bins for discretization")
    args = parser.parse_args()

    # Column options each mode cannot run without. Checking them here gives a
    # usage error instead of a KeyError on a None column name further down.
    required_by_mode = {
        "pearson": ("x", "y"),
        "spearman": ("x", "y"),
        "mr": ("exposure", "outcome", "instrument"),
        "cmi": ("x", "y", "z"),
        "te": ("x", "y"),
        "gc": ("x", "y"),
    }
    needed = required_by_mode[args.mode]
    missing = [f"--{name}" for name in needed if getattr(args, name) is None]
    if missing:
        parser.error(f"--mode {args.mode} requires: {', '.join(missing)}")

    # Load data
    df = pd.read_csv(args.input)

    # Every requested column must exist in the file
    unknown = [getattr(args, name) for name in needed
               if getattr(args, name) not in df.columns]
    if unknown:
        parser.error(f"column(s) not found in {args.input}: {', '.join(unknown)}")

    if args.mode in ["pearson", "spearman"]:
        corr = CorrelationAnalyser(df)
        func = corr.pearson if args.mode == "pearson" else corr.spearman
        print(f"{args.mode}({args.x}, {args.y}) =", func(args.x, args.y))

    elif args.mode == "mr":
        rnd = RandomizationAnalyser(df)
        model = rnd.mendelian_randomization(args.exposure, args.outcome, args.instrument)
        print(model.summary())

    elif args.mode == "cmi":
        caus = CausalityAnalyser(df)
        print("CMI:", caus.conditional_mutual_information(args.x, args.y, args.z, n_bins=args.bins))

    elif args.mode == "te":
        # --x is the source series, --y the target series
        caus = CausalityAnalyser(df)
        print("TE:", caus.transfer_entropy(args.x, args.y, lag=args.lag, n_bins=args.bins))

    elif args.mode == "gc":
        # --x is the source series, --y the target series; --lag is used as maxlag
        caus = CausalityAnalyser(df)
        res = caus.granger_causality(args.x, args.y, maxlag=args.lag)
        print("Granger Causality results:", res)



# Example usage:
# df = pd.read_csv('your_data.csv')
# corr = CorrelationAnalyser(df)
# print("Pearson r:", corr.pearson('X', 'Y'))
# rnd = RandomizationAnalyser(df)
# mr_model = rnd.mendelian_randomization('exposure', 'outcome', 'instrument')
# print(mr_model.summary())
# caus = CausalityAnalyser(df)
# print("Conditional MI:", caus.conditional_mutual_information('X','Y','Z'))
# print("Transfer Entropy:", caus.transfer_entropy('X','Y'))
# print("Granger Causality:", caus.granger_causality('X','Y', maxlag=3))

from src.analysers import (
    EDAAnalyser, FeatureEngineer, ModelTrainer,
    CorrelationAnalyser, RandomizationAnalyser, CausalityAnalyser
)

# Notebook walkthrough: EDA, feature preparation, model training and a
# correlation check, all run on `df_sample`.
# NOTE(review): `df_sample` is not defined anywhere in this view — presumably
# created in an earlier cell; confirm before running.
# NOTE(review): EDAAnalyser, FeatureEngineer and ModelTrainer are imported from
# src.analysers but no definition of them is visible here — confirm they exist.

# 1) EDA
# Print whatever eda.summary() and eda.missing_summary() return for df_sample
eda = EDAAnalyser(df_sample)
print(eda.summary())
print(eda.missing_summary())

# 2) Features
# 'outcome' is passed as the target name; 'instrument' is one-hot encoded and
# 'exposure' is scaled — presumably these are columns of df_sample (TODO confirm).
fe = FeatureEngineer(df_sample)
X, y = fe.get_features_and_target('outcome')
X = fe.one_hot_encode(X, ['instrument'])
X = fe.scale_numeric(X, ['exposure'])
X_train, X_test, y_train, y_test = fe.train_test_split(X, y)

# 3) Modeling
# Random forest with a fixed random_state and a 2x2 hyper-parameter grid
from sklearn.ensemble import RandomForestClassifier
mt = ModelTrainer(RandomForestClassifier(random_state=42),
                  param_grid={'n_estimators': [50,100], 'max_depth': [3,5]})
print("CV scores:", mt.cross_validate(X_train, y_train))
# The object returned by mt.fit exposes best_params_ (read on the next line)
grid = mt.fit(X_train, y_train)
print("Best params:", grid.best_params_)
print("Test eval:", mt.evaluate(X_test, y_test))

# 4) Stats
# Pearson correlation between columns 'X' and 'Y' of df_sample
corr = CorrelationAnalyser(df_sample)
print("Pearson X/Y:", corr.pearson('X','Y'))
