In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import geom
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import LogisticRegression as LogR
from sklearn.tree import DecisionTreeRegressor as DT
# Seed NumPy's legacy global RNG so the np.random draws below repeat on a fresh run.
np.random.seed(12)

# Problem Statement

### We have a population of 20,000 people. We want to predict when they might have surgery.

## Challenges

Everyone is different. There are ten distinct risk factors (e.g., age, weight, blood pressure, cholesterol, etc.). No two people have the same combination of risk factors.

We don't know how much each factor increases (or decreases) probability of surgery.

No one has surgery exactly when expected.

We have very limited data about people.


In [None]:
# Simulated cohort: one row per person, one uniform [0, 1) column per risk factor.
features = 10
df = pd.DataFrame(np.random.random((20000, features)))
# True (hidden) weight of each risk factor, uniform on [-0.5, 0.5).
w = np.random.random(features) - 0.5

In [None]:
# Constant term of the log-odds; it is added to every person's x·w in the probability formula.
intercept = -6

The probability of getting surgery (per day) is determined by the risk factors according to:

$$\Large p = \frac {1}{1+e^{-\left (x_{0}+\vec x \cdot \vec w\right )}}$$

Where $\vec x$ is the set of risk factors for each person and $\vec w$ is the relative hazard associated with each risk factor.

We don't know what $\vec w$ is. We want to be able to discover this from the data.

In [None]:
# Per-day surgery probability: logistic function of intercept + x·w.
# One vectorised matrix-vector product replaces the row-by-row apply, and
# slicing to the first `features` columns keeps this cell re-runnable after
# later cells append non-feature columns to df.
p = 1/(1+np.exp(-intercept-df.iloc[:, :features].dot(w)))

## Here are the risk factors for each person:

In [None]:
# Display the risk-factor frame (one row per person, one column per factor).
df

$N$ is the actual number of days until surgery for each person.
We don't get to know this because we would have to observe everyone for a very long time.

In [None]:
# Draw a geometric waiting time (in days) for each person from their own per-day probability p.
N = geom(p).rvs()

## We only have 180 days of data for each person

In [None]:
# Length of the observation window, in days.
timeout = 180

In [None]:
# Every true surgery day against the first risk factor; the red line marks
# the end of the observation window.
ax = plt.gca()
ax.scatter(df[0], N, alpha=.1)
ax.axhline(timeout, color='red')
ax.set_xlabel('Risk Factor 0')
ax.set_ylabel('Days to Surgery (Observed and Unobserved)');

## Any surgery that happens after 180 days is not observed, so we mark it as NaN

In [None]:
# Censor at the window: surgery days past `timeout` become NaN, the rest are kept.
df['observation'] = np.where(N > timeout, np.nan, N)

## About 77% of our members do not have surgery within the observation window

In [None]:
# Fraction of people whose observation is NaN, i.e. no surgery seen inside the window.
df['observation'].isna().mean()

## Plots of the number of days to surgery vs. the risk factors show no discernible pattern

In [None]:
# One stacked panel per risk factor: observed days-to-surgery against that factor.
fig, axs = plt.subplots(features, figsize=(10,20))

for i, ax in enumerate(axs):
    ax.scatter(df[i],df['observation'], s=1)
    # Label every panel so each one can be read on its own, in line with the
    # labelled Risk Factor 0 scatter plot earlier in the notebook.
    ax.set_xlabel(f'Risk Factor {i}')
    ax.set_ylabel('Days to Surgery')

# Keep each panel's x-label from colliding with the panel below it.
fig.tight_layout()

## First, we identify the probability that a person has surgery within the observation window, based on their risk factors.

In [None]:
# Logistic model of "was this person censored?" on the risk factors.
# The fitted labels are False/True, and predict_proba orders its columns by
# sorted class label, so column 0 is the probability that surgery WAS observed.
X_risk = df.iloc[:, :features]
is_censored = df['observation'].isna()
p_observed = LogR().fit(X_risk, is_censored).predict_proba(X_risk)[:, 0]
# Only censored people need the estimate; rows with an observed surgery stay NaN.
S = np.where(is_censored, p_observed, np.nan)

## Next, we create a place-holder value, an estimate of the number of days in the future when we think they might have surgery:

$$Λ = k + \frac {1}{1-\left (1-S \right )^{\frac {1}{k}}}  $$

Where $k$ is our timeout (180 days) and $S$ is the estimated probability that a surgery was observed for each member.

In [None]:
# Per-day probability implied by a within-window probability of S, then the
# placeholder: window length plus the reciprocal of that daily probability.
daily_p = 1 - (1 - S) ** (1 / timeout)
Λ = timeout + 1 / daily_p

In [None]:
# Working target: the real surgery day where one was observed, the placeholder Λ otherwise.
df['x'] = df['observation'].where(df['observation'].notna(), Λ)
df

## We can now use linear regression to use these *anticipated* observations to estimate the average time to surgery ($\hat x$) for any combination of risk factors.

In [None]:
# Linear fit of the working target x on the risk factors, evaluated in-sample.
risk_factors = df.iloc[:, :features]
time_model = LR().fit(risk_factors, df['x'])
df['x_pred'] = time_model.predict(risk_factors)

In [None]:
# Turn the predicted mean wait into a per-day probability (1 / days), take its
# log-odds, and regress that on the risk factors; the fitted values are stored.
risk_factors = df.iloc[:, :features]
daily_rate = 1 / df['x_pred']
raw_log_odds = np.log(daily_rate / (1 - daily_rate))
df['log_odds'] = LR().fit(risk_factors, raw_log_odds).predict(risk_factors)
df

## Let's see how our predicted probability (derived from some really crummy data) corresponds to the ideal probability.

In [None]:
# Estimated per-day probability (inverse logit of the fitted log-odds) against
# the true one; points on the red diagonal are perfect estimates.
estimated_p = 1/(1+1/np.exp(df['log_odds']))
p_lo, p_hi = p.min(), p.max()

plt.scatter(p, estimated_p, alpha=.25, s=10)
plt.xticks(rotation=45)
plt.plot([p_lo, p_hi], [p_lo, p_hi], color='red')
plt.xlabel('Ideal Probability')
plt.ylabel('Estimated Probability');

## Evaluation of Risk Factors

We can evaluate how much each feature contributes to a person's risk from the observed data by fitting an equation between the features and the log odds.

In [None]:
# Linear fit of the log-odds on the risk factors; its intercept and
# coefficients are compared with the generating values in the cells below.
lr = LR()
lr = lr.fit(df.iloc[:, :features], df['log_odds'])

We can compare these empirical values to the values used to create the dataset:

In [None]:
# Fitted intercept next to the intercept used to generate the data.
lr.intercept_, intercept

In [None]:
# Fitted coefficient for each risk factor.
lr.coef_

In [None]:
# Weights used to generate the data, for comparison with the fitted coefficients.
w

In [None]:
# Fitted coefficients against the generating weights; the red diagonal marks
# perfect recovery. Axis labels now spell "Coefficients" correctly.
plt.scatter(w, lr.coef_)
plt.plot([w.min(), w.max()], [w.min(),w.max()], color = 'red')
plt.xlabel('Actual Hazard Coefficients')
plt.ylabel('Measured Hazard Coefficients');

## Question:

### How many people in our data set *would* have surgery in 6-18 months?

In [None]:
# True count: surgery strictly after day 180 and strictly before day 540.
np.logical_and(N > 180, N < 540).sum()

### How many people were *observed* to have surgery in 6-18 months?

In [None]:
# Same window applied to the censored column. Every surgery past `timeout` was
# stored as NaN, and comparisons against NaN are False.
(df['observation'].gt(180) & df['observation'].lt(540)).sum()

### How many people are *predicted* to have surgery in 6-18 months?

In [None]:
# Expected number of people in the same window as the "would have" count,
# 180 < N < 540: P(181 <= N <= 539) = cdf(539) - cdf(180) under each person's
# fitted geometric distribution. cdf(540) would also count day 540, which the
# strict `N < 540` comparison excludes.
p_predicted = 1/(1+1/np.exp(df['log_odds']))
surgery_time = geom(p_predicted)
(surgery_time.cdf(539)-surgery_time.cdf(180)).sum()