In [1]:
import numpy as np
import pandas as pd

from fklearn.causal.effects import linear_effect
from fklearn.causal.validation.curves import cumulative_effect_curve

In [2]:
from typing import Any, List

from toolz import curry
from fklearn.types import EffectFnType

In [3]:
df = pd.DataFrame(dict(
    t=[1, 1, 1, 2, 2, 2, 3, 3, 3],
    x=[1, 2, 3, 1, 2, 3, 1, 2, 3],
    y=[1, 1, 1, 2, 3, 4, 3, 5, 7],
))

result = cumulative_effect_curve(df, prediction="x", outcome="y", treatment="t", min_rows=3, steps=df.shape[0],
                                 effect_fn=linear_effect)

In [4]:
result

array([3.        , 3.        , 2.92857143, 2.5       , 2.5       ,
       2.46153846, 2.        ])

In [9]:
# def linear_ci(df, y, t, z=1.96):
#     n = df.shape[0]
#     t_bar = df[t].mean()
#     beta1 = linear_effect(df, y, t)
#     beta0 = df[y].mean() - beta1 * t_bar
#     e = df[y] - (beta0 + beta1*df[t])
#     se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))
#     return np.array([beta1 - z*se, beta1 + z*se])

def linear_ci(df, t, y):
    n = df.shape[0]
    t_bar = df[t].mean()
    beta1 = linear_effect(df, t, y)
    beta0 = df[y].mean() - beta1 * t_bar
    e = df[y] - (beta0 + beta1*df[t])
    se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))
    return se

In [10]:
linear_ci(ordered_df.head(3), "t", "y")

0.0

In [7]:
ordered_df = df.sort_values(by="x", ascending=False)
ordered_df.head(3)

Unnamed: 0,t,x,y
2,1,3,1
5,2,3,4
8,3,3,7


In [11]:

def confidence_interval_curve(
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    prediction: str,
    min_rows: int = 30,
    steps: int = 100,
    error_fn: EffectFnType = linear_standard_error,
    **kwargs,
) -> np.ndarray:
    
    size = df.shape[0]
    ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)
    n_rows = list(range(min_rows, size, size // steps)) + [size]
    
    return np.array([error_fn(ordered_df.head(rows), treatment, outcome, **kwargs) for rows in n_rows])

In [19]:
@curry
def effect_curves(
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    prediction: str,
    min_rows: int = 30,
    steps: int = 100,
    effect_fn: EffectFnType = linear_effect,
    error_fn: EffectFnType = None,
    **kwargs,
) -> pd.DataFrame:

    size: int = df.shape[0]
    n_rows: List[int] = list(range(min_rows, size, size // steps)) + [size]

    cum_effect: np.ndarray = cumulative_effect_curve(
        df=df,
        treatment=treatment,
        outcome=outcome,
        prediction=prediction,
        min_rows=min_rows,
        steps=steps,
        effect_fn=effect_fn,
    )
    ate: float = cum_effect[-1]
    
    effect_curves = pd.DataFrame({"samples_count": n_rows, "cumulative_effect_curve": cum_effect}).assign(
        samples_fraction=lambda x: x["samples_count"] / size,
        cumulative_gain_curve=lambda x: x["samples_fraction"] * x["cumulative_effect_curve"],
        random_model_cumulative_gain_curve=lambda x: x["samples_fraction"] * ate,
        relative_cumulative_gain_curve=lambda x: (
            x["samples_fraction"] * x["cumulative_effect_curve"] - x["random_model_cumulative_gain_curve"]
        ),
    )
    
    if ci_fn is not None:
        
        effect_errors: np.ndarray = confidence_interval_curve(
            df=df,
            treatment=treatment,
            outcome=outcome,
            prediction=prediction,
            min_rows=min_rows,
            steps=steps,
            error_fn=error_fn,
            **kwargs,
        )
        
        effect_curves = effect_curves.assign(
            cumulative_effect_curve_error=effect_errors,
            cumulative_gain_curve_error=lambda x: x["samples_fraction"] * x["cumulative_effect_curve_error"],
        )

    return effect_curves


In [13]:
df = pd.DataFrame(dict(
    t=[1, 1, 1, 2, 2, 2, 3, 3, 3],
    x=[1, 2, 3, 1, 2, 3, 1, 2, 3],
    y=[1, 1, 1, 2, 3, 4, 3, 5, 7],
))

In [24]:
effect_curves(
    df=df,
    treatment="t",
    outcome="y",
    prediction="x",
    min_rows = 3,
    steps = df.shape[0],
    effect_fn = linear_effect,
    ci_fn = linear_ci,
    z = 1.96
)

Unnamed: 0,samples_count,cumulative_effect_curve,samples_fraction,cumulative_gain_curve,random_model_cumulative_gain_curve,relative_cumulative_gain_curve,cumulative_effect_curve_error,cumulative_gain_curve_error
0,3,3.0,0.333333,1.0,0.666667,0.333333,0.0,0.0
1,4,3.0,0.444444,1.333333,0.888889,0.444444,0.0,0.0
2,5,2.928571,0.555556,1.626984,1.111111,0.515873,0.599444,0.333025
3,6,2.5,0.666667,1.666667,1.333333,0.333333,0.774758,0.516505
4,7,2.5,0.777778,1.944444,1.555556,0.388889,0.628855,0.48911
5,8,2.461538,0.888889,2.188034,1.777778,0.410256,0.765483,0.680429
6,9,2.0,1.0,2.0,2.0,0.0,0.956382,0.956382


In [64]:
df.sort_values(by="x", ascending=True)

Unnamed: 0,t,x,y
0,1,1,1
3,2,1,2
6,3,1,3
1,1,2,1
4,2,2,3
7,3,2,5
2,1,3,1
5,2,3,4
8,3,3,7


In [16]:
cumulative_elast_curve_ci(
    dataset=df,
    prediction="x",
    y="y",
    t="t",
    min_periods=3,
    steps=df.shape[0]
)

array([0.        , 0.        , 0.59944419, 0.77475803, 0.62885517,
       0.76548284, 0.95638207])

In [17]:
linear_effect(df, "y", "t")

0.35294117647058826

In [14]:
def elast_ci(df, y, t, z=1.96):
    n = df.shape[0]
    t_bar = df[t].mean()
#     beta1 = elast(df, y, t)
    beta1 = linear_effect(df, t, y)
    beta0 = df[y].mean() - beta1 * t_bar
    e = df[y] - (beta0 + beta1*df[t])
    se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))
#     return np.array([beta1 - z*se, beta1 + z*se])
    return z*se

@curry
def elast(data, y, t):
        # line coeficient for the one variable linear regression
        return (np.sum((data[t] - data[t].mean())*(data[y] - data[y].mean())) /
                np.sum((data[t] - data[t].mean())**2))

In [18]:
def cumulative_elast_curve_ci(dataset, prediction, y, t, min_periods=30, steps=100):
    size = dataset.shape[0]
    ordered_df = dataset.sort_values(prediction, ascending=False).reset_index(drop=True)
    n_rows = list(range(min_periods, size, size // steps)) + [size]
    
    # just replacing a call to `elast` by a call to `elast_ci`
    return np.array([elast_ci(ordered_df.head(rows), y, t)  for rows in n_rows])

In [None]:
# siguientes tasks:
# - Agregar nuevos argumentos a las funciones y documentación de las funciones
# - Modificar cumulative effect curves con nuevos cambios
# - Crear archivo aparte de "confidence intervals/errors" (effects.py) y ahí poner linear_ci
# - Crear archivo aparte de "curves" (curves.py) y ahí poner confidence_interval_curve
# - Crear un nuevo tipo de variable (?, ErrorFnType) con el mismo signature de EffectFnType
# - Agregar tests con los ejemplos de este notebook
# - Modificar índice de la documentación
# - Abrir PR
#   - agregar comentarios:
#     1. discusión de los tipos de las funciones: tienen el mismo signature, podemos pensar en algo más genérico?
#     2. confidence_interval_curve y cumulative_effect_curve hacen lo mismo, podemos pensar en algo más genérico?
#     3. cómo atacar la vulnerabilidad de que las curvas y los errores son arreglos que tienen que ser del mismo tamaño.
#        Ahorita lo estamos hard-coding pero existirá alguna mejor manera de lidiar con esto?