In [1]:
# Import 3rd party libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Configure Notebook
# Render matplotlib figures inline in the cell output and apply the plotting theme.
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
# NOTE(review): this silences every warning for the whole session, including the pandas
# warnings about chained / in-place assignment that are relevant to the cleaning cells
# further down. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the drilling records and keep only the columns used in this analysis
keep_cols = ['UWI', 'WA NUM', 'Td Depth (m)', 'Spud Date', 'Rig Rels Date']
drill = pd.read_csv('drill_data.csv')[keep_cols]
drill.head()

Unnamed: 0,UWI,WA NUM,Td Depth (m),Spud Date,Rig Rels Date
0,200D080H092G0300,1,568.5,19480709,19481211.0
1,200B050D082G0100,2,1097.3,19490809,19500726.0
2,200D055A082G0200,3,533.4,19480709,19490725.0
3,100131108117W600,4,1816.2,19490807,19521223.0
4,200A065E093I1600,5,2780.1,19491117,19500813.0


In [3]:
# Keep only wells whose spud date (stored as a YYYYMMDD integer) lies between
# 2019-01-01 and 2021-12-31, both ends included, then order them oldest first
in_window = drill['Spud Date'].between(20190101, 20211231)
drill_date = drill.loc[in_window]
drill_date_sorted = drill_date.sort_values(by='Spud Date', ascending=True)

In [4]:
# Count missing (NaN) values per column.
# Only real missing values are counted here: cells holding a blank string such as ' '
# are not reported by isna(), which is why 'Td Depth (m)' shows 0 at this point even
# though blanks are converted to NaN and dropped in the following cells.
drill_date_sorted.isna().sum()

UWI              0
WA NUM           0
Td Depth (m)     0
Spud Date        0
Rig Rels Date    4
dtype: int64

In [5]:
# Turn blank depth entries (the single-space string ' ') into NaN so that the rows can
# be dropped afterwards.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that in-place call operates on a
# temporary object, is not guaranteed to update the DataFrame, and does nothing when
# pandas Copy-on-Write is enabled.
drill_date_sorted['Td Depth (m)'] = drill_date_sorted['Td Depth (m)'].replace(' ', np.nan)
drill_date_sorted

Unnamed: 0,UWI,WA NUM,Td Depth (m),Spud Date,Rig Rels Date
29201,200D021C094G0902,35478,4425,20190101,20190126.0
29200,200A054C094G0900,35478,1875,20190101,20190126.0
28770,202D088G094B0900,33784,4367,20190103,20190121.0
28117,200A054D094H0500,31830,4319,20190103,20190218.0
29197,100053508324W600,35456,3933,20190104,20190122.0
...,...,...,...,...,...
30223,103091507818W600,41828,5504,20211221,20220109.0
29128,202A088I094A1300,35130,,20211223,20220131.0
29127,200A088I094A1300,35129,,20211224,20220125.0
29126,200C087I094A1300,35128,,20211225,20220116.0


In [6]:
# Discard every row that has no total depth recorded (other columns are not checked)
drill_date_sorted = drill_date_sorted.dropna(subset=['Td Depth (m)'])
drill_date_sorted

Unnamed: 0,UWI,WA NUM,Td Depth (m),Spud Date,Rig Rels Date
29201,200D021C094G0902,35478,4425,20190101,20190126.0
29200,200A054C094G0900,35478,1875,20190101,20190126.0
28770,202D088G094B0900,33784,4367,20190103,20190121.0
28117,200A054D094H0500,31830,4319,20190103,20190218.0
29197,100053508324W600,35456,3933,20190104,20190122.0
...,...,...,...,...,...
30229,203B084A094B0900,41928,4495,20211217,20220110.0
30230,202B084A094B0900,41929,4660,20211218,20220101.0
30218,100012207818W600,41823,5415,20211220,20220105.0
30223,103091507818W600,41828,5504,20211221,20220109.0


In [7]:
# Re-count missing values per column after dropping the rows without a total depth;
# every column is expected to report 0 before the dtype casts that follow.
drill_date_sorted.isna().sum()

UWI              0
WA NUM           0
Td Depth (m)     0
Spud Date        0
Rig Rels Date    0
dtype: int64

In [28]:
# Check the column types in the dataframe.
# 'Td Depth (m)' is still object dtype here; it is cast to float in the next cell.
# NOTE(review): the saved output already lists 'Rig Rels Date' as int64 although the
# cast to int64 only appears in the following cell, and the execution counter jumps
# from 7 to 28. The cells were presumably run out of order; confirm with
# Restart Kernel -> Run All.
drill_date_sorted.dtypes

UWI              object
WA NUM            int64
Td Depth (m)     object
Spud Date         int64
Rig Rels Date     int64
dtype: object

In [29]:
# Cast the rig release date to a 64-bit integer and the total depth to float.
# The integer cast requires the column to be free of NaN values.
drill_date_sorted = drill_date_sorted.astype({'Rig Rels Date': 'int64', 'Td Depth (m)': 'float'})
drill_date_sorted

Unnamed: 0,UWI,WA NUM,Td Depth (m),Spud Date,Rig Rels Date
29201,200D021C094G0902,35478,4425.0,20190101,20190126
29200,200A054C094G0900,35478,1875.0,20190101,20190126
28770,202D088G094B0900,33784,4367.0,20190103,20190121
28117,200A054D094H0500,31830,4319.0,20190103,20190218
29197,100053508324W600,35456,3933.0,20190104,20190122
...,...,...,...,...,...
30229,203B084A094B0900,41928,4495.0,20211217,20220110
30230,202B084A094B0900,41929,4660.0,20211218,20220101
30218,100012207818W600,41823,5415.0,20211220,20220105
30223,103091507818W600,41828,5504.0,20211221,20220109


In [31]:
# Summary statistics for the numeric columns of the cleaned data.
# NOTE(review): 'Spud Date' and 'Rig Rels Date' hold YYYYMMDD-style integers and
# 'WA NUM' looks like an identifier, so their mean / std / quartiles are not meaningful
# quantities. Presumably only 'Td Depth (m)' is a real measurement here; confirm, and
# consider parsing the dates before summarising them.
drill_date_sorted.describe()

Unnamed: 0,WA NUM,Td Depth (m),Spud Date,Rig Rels Date
count,1195.0,1195.0,1195.0,1195.0
mean,37079.343096,4602.647197,20201240.0,20202080.0
std,3048.984743,926.623402,8282.623,8277.855
min,27147.0,607.0,20190100.0,20190120.0
25%,34968.5,4090.0,20191020.0,20191120.0
50%,37343.0,4629.0,20200820.0,20201010.0
75%,39367.0,5162.0,20210410.0,20210510.0
max,42395.0,7168.0,20211230.0,20220110.0
