In [None]:
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

## Introduction - What is Survival Analysis?
Example taken from:
https://lifelines.readthedocs.io/en/latest/Survival%20analysis%20with%20lifelines.html

In [None]:
#Load the example regime dataset bundled with lifelines (load_dd) and preview its first rows

from lifelines.datasets import load_dd
data = load_dd()
data.head()

In [None]:
#Pull out the two inputs for the estimator: the "duration" column (T)
#and the matching "observed" event column (E)
T, E = data["duration"], data["observed"]

#Create the Kaplan-Meier estimator, then fit it to those durations
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)

In [None]:
#Draw the fitted survival function and title the axes it was drawn on
survival_ax = kmf.plot(figsize=(10,10))
survival_ax.set_title('Survival function of political regimes');

##### What this graph is telling you
* x-axis: years in office
* y-axis: probability of a leader still around after x years in office
* For Example: _There's a 20% chance that a leader will be in office more than 8 years._

##### However, that's not the whole story . . .
There are many different types of government, do we see differences?  Let's create another plot but this time filter out Democratic vs. Non-Democratic regimes.

In [None]:
#Compare survival curves for Democratic vs. Non-Democratic regimes on shared axes
dem = (data["democracy"] == "Democracy") #True where the regime is a democracy
ax = plt.subplot(111)

#Refit the same estimator once per group and overlay each curve
for regime_mask, regime_label in ((dem, "Democratic Regimes"), (~dem, "Non-democratic Regimes")):
    kmf.fit(T[regime_mask], event_observed=E[regime_mask], label=regime_label)
    kmf.plot(ax=ax, figsize=(10,10))

#Fix the probability axis to 0-1 and add the title
ax.set_ylim(0, 1)
ax.set_title("Lifespans of different global regimes");

##### Now we can see differences that allow us to be more predictive

## Exercise - Survival Analysis of Days Drilling in the Mississippi Canyon Protraction, GOM.

The Mississippi Canyon Protraction Area in the Gulf of Mexico is one of the most prolific parts of the basin with some of its largest fields (Mars/Ursa, Thunderhorse).  Thousands of wells by different operators have been drilled here and likely many more will be.  A common question is: how many days does it take to drill a well here?

## Step 1.  Load Data and Generate Calculated Columns
We will load these data from an Excel sheet and then calculate the number of days drilled for each well from its spud date and the date it reached total depth.

In [None]:
#Load all wells drilled in the protraction area from the Excel workbook and preview the first rows
df = pd.read_excel('../data/BoreholeMC.xlsx')
df.head()

In [None]:
#New column "drill_days": time from spud to total depth, expressed as a number of days
#(dividing the date difference by a one-day timedelta turns it into plain days)
df['drill_days'] = (df['Total Depth Date'] - df['Spud Date']) / np.timedelta64(1, 'D')

#Smaller dataframe with just the drilling days and well type, dropping rows missing either
days = df.loc[:, ['drill_days', 'Type Code']].dropna()
days.head()

In [None]:
#Start from a fresh Kaplan-Meier estimator and fit it to every well's drilling days
#(event_observed is left at its default of None)
kmf = KaplanMeierFitter()
kmf.fit(days['drill_days'])

In [None]:
#Plot the survival function for the unfiltered drilling days,
#with labelled axes so the figure can be read on its own
kmf.plot(figsize=(10,10))
plt.title('Drilling Days Mississippi Canyon Protraction Area')
plt.xlabel('Days drilling')
plt.ylabel('Fraction of wells still drilling');

## Step 2. Clean and Filter Data

The plot above is wrong, there's no way a well would have been drilling for 13 years! There must be some spurious data.  Let's investigate and clean.

In [None]:
#Summary statistics for the days dataframe, transposed (.T) so each column becomes a row
days.describe().T

In [None]:
#Histogram of drilling days restricted to the 0-5000 range; log=True puts the counts on a log scale
plt.hist(days.drill_days, range=(0,5000), log=True);

Clearly something is askew: it is unlikely that an offshore well can be drilled in <7 days, and no one plans for a well to take over a year to drill.  There is an array of methods to narrow these data down, but for practicality let's filter the data to wells that took between 7 and 150 days to drill.<br />  _If you would like to experiment with different numbers, go ahead and update the code block below._

In [None]:
#Filter data to 7 < drill_days < 150 (both bounds exclusive), then summarize the result
days_filtered = days.query("drill_days<150 & drill_days>7")
days_filtered.describe().T

In [None]:
#Refit the estimator on the filtered wells, then redraw the survival function
kmf.fit(days_filtered['drill_days'])
kmf.plot(figsize=(10,10));

##  Step 3. Breaking out the data.
The graph above makes more sense but just like in the intro we aren't taking account of the differences in the data.  One simple division we can make is to separate Exploration and Development wells.

In [None]:
#Filter the original data to the same arbitrary 7 < drill_days < 150 window
df_filtered = df.query("drill_days<150 & drill_days>7")

#Split the drilling days into Exploration wells (Type Code "E") and everything else
#NOTE(review): rows with a missing Type Code also compare != "E" and land in dev_days - confirm intended
expl_days = df_filtered.loc[df_filtered["Type Code"] == "E", 'drill_days'].dropna()
dev_days = df_filtered.loc[df_filtered["Type Code"] != "E", 'drill_days'].dropna()

In [None]:
#Overlay the Exploration and Development survival curves on one set of axes
ax = plt.subplot(111)
for well_days, well_label in ((expl_days, "Exploration Wells"), (dev_days, "Development Wells")):
    #Refit the shared estimator for this group, then draw its curve
    kmf.fit(well_days, label=well_label)
    kmf.plot(ax=ax, figsize=(10,10))
ax.set_ylim(0, 1)
ax.set_title("Drilling Days for Exploration vs. Development Wells");

This is more informative and it makes sense.  Development wells (orange) should take less time to drill than Exploration wells (blue). 

## Step 4. Functions and Exploring the data

There's an adage that goes: __"If you've repeated a workflow, it's time to write a function."__ Functions in Python allow us to simplify our work by writing a single block of code that runs through it all.

In [None]:
#function to compare Exploration and Development wells for a particular company

def company_expl_dev_lifelines(df, company, mindays=5, maxdays=365):
    """Plot Kaplan-Meier curves of drilling days for one company's Exploration vs. Development wells.

    Parameters
    ----------
    df : pandas.DataFrame
        Well table with 'Company Name', 'drill_days' and 'Type Code' columns.
    company : str
        Pattern matched against 'Company Name' with ``str.contains``, so a
        partial name such as 'Shell' is enough.
    mindays, maxdays : number
        Inclusive bounds on 'drill_days'; wells outside the range are ignored.

    Raises
    ------
    ValueError
        If no well of ``company`` falls inside the requested range.

    Notes
    -----
    Wells whose 'Type Code' is anything other than "E" (including a missing
    code) are plotted as Development wells. A group with no wells is left off
    the plot instead of aborting it.
    """
    #Filter Data
    #Both masks are built from the same frame so their indexes line up;
    #na=False treats a missing company name as "no match"
    is_company = df['Company Name'].str.contains(company, na=False)
    in_range = df['drill_days'].between(mindays, maxdays)
    wells = df.loc[is_company & in_range]

    groups = [
        ("Exploration Wells", wells.loc[wells["Type Code"] == "E", 'drill_days'].dropna()),
        ("Development Wells", wells.loc[wells["Type Code"] != "E", 'drill_days'].dropna()),
    ]
    groups = [(label, durations) for label, durations in groups if not durations.empty]
    if not groups:
        raise ValueError(
            f"No wells found for company {company!r} with {mindays} <= drill_days <= {maxdays}"
        )

    #Make Plot
    _, ax = plt.subplots(figsize=(10, 10))
    for label, durations in groups:
        #A fresh fitter per curve keeps the function independent of notebook-level state
        fitter = KaplanMeierFitter()
        fitter.fit(durations, event_observed=None, label=label)
        fitter.plot(ax=ax)

    plt.ylim(0, 1)
    plt.title(f"Drilling Days for {company} - Exploration vs. Development Wells")
    plt.show()

In [None]:
#Shell wells, keeping only those drilled in 5 (the default mindays) to 150 days
company_expl_dev_lifelines(df, 'Shell', maxdays=150)

Now it's your turn to pick companies to plot.  To help you find names, below is a bar chart of the most prolific drillers in the MC Protraction.  Note how the confidence intervals expand as there are fewer datapoints (e.g. Taylor Energy).

In [None]:
#Count wells per operator, keep operators with more than 50 wells, and draw a horizontal bar chart
comp_counts = df['Company Name'].value_counts().loc[lambda counts: counts > 50]
comp_counts.plot.barh(figsize=(5,5), title='Top Operators in MC (>50 Wells)', label='# Wells');

In [None]:
#Taylor wells with the default 5-365 day window
company_expl_dev_lifelines(df, 'Taylor')

Another interesting comparison is to compare companies.

In [None]:
def company_compare_lifelines(df, company1, company2, mindays=5, maxdays=365):
    """Plot Kaplan-Meier curves of drilling days for two companies on shared axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Well table with 'Company Name' and 'drill_days' columns.
    company1, company2 : str
        Patterns matched against 'Company Name' with ``str.contains``; each
        is also used as its curve's legend label.
    mindays, maxdays : number
        Inclusive bounds on 'drill_days'; wells outside the range are ignored.

    Raises
    ------
    ValueError
        If either company has no wells inside the requested range.
    """
    #Filter Data
    #Both masks come from the same frame so their indexes line up; the range
    #test already excludes missing drill_days, so no row-wide dropna() is
    #needed (it would discard wells with a gap in any unrelated column)
    in_range = df['drill_days'].between(mindays, maxdays)
    company_names = df['Company Name']

    curves = []
    for company in (company1, company2):
        #na=False treats a missing company name as "no match"
        is_company = company_names.str.contains(company, na=False)
        durations = df.loc[in_range & is_company, 'drill_days']
        if durations.empty:
            raise ValueError(
                f"No wells found for company {company!r} with {mindays} <= drill_days <= {maxdays}"
            )
        curves.append((company, durations))

    #Make Plot
    _, ax = plt.subplots(figsize=(10, 10))
    for company, durations in curves:
        #A fresh fitter per curve keeps the function independent of notebook-level state
        fitter = KaplanMeierFitter()
        fitter.fit(durations, event_observed=None, label=company)
        fitter.plot(ax=ax)

    plt.ylim(0, 1)
    plt.title(f"Drilling Days for {company1} vs. {company2}")
    plt.show()

In [None]:
#Compare Shell and LLOG wells using the default 5-365 day window
company_compare_lifelines(df, 'Shell', 'LLOG')