#### Libraries

In [1]:
# Only install the following libraries if you dont have it, otherwise leave it commented out

#!conda install -c anaconda natsort --yes
#!conda install -c anaconda xlrd --yes

#!pip install natsort --user
#!pip install xlrd --user
#!pip install pycaret[full] --user
#!pip install mlflow --user
#!pip install tune-sklearn ray[tune] --user
#!pip install optuna -- user
#!pip install hyperopt --user
#!pip install redis --user

# General Libraries
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import NullFormatter
import time
import re
import requests
import pickle
import seaborn as sns
import os
import glob
import sys
from natsort import natsorted
sns.set()

import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Sklearn Liraries
from sklearn import preprocessing

import datetime
from datetime import timedelta, date 
start = time.time()
%matplotlib inline

import ray
from ray import tune

# Forces the print statement to show everything and not truncate
# np.set_printoptions(threshold=sys.maxsize) 
print('Libraries imported')

Libraries imported


In [2]:
#Receive Data
#dir_name = r'C:\Users\kswaminathan\OneDrive\01_KannaLibrary\15_Analogs'
dir_name = r'C:\Users\mkumar\Documents\GitHub\@Papers\SPE2022\Final\1_TORIS_MODEL'
filename_suffix = 'csv'

In [3]:
skiprows = 0
#Means read in the ',' as thousand seperator. Also drops all columns which are unnamed.
df = pd.read_excel("dfssoil.xlsx", thousands=',', skiprows = skiprows)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')] 
df.head()

Unnamed: 0,Lithology Code,Well Spacing,Net Pay Pay,Gross Pay,Porosity,Swi,Oil FVFi,Temp,Permeability,API Gravity,...,Fractured Faulted,Shale Breaks,Major Gas Cap,Geologic Play,Deposition System,Diagenetic Overprint,Structural Comp,Heterogeniety,Trap Type,URF
0,1,120.0,320.0,600.0,14.0,40.0,1.586,153,10.0,41.0,...,0,0,0,303,132,1,10,1,2,0.352836
1,1,160.0,100.0,160.0,18.1,35.0,1.23,163,65.0,35.6,...,0,0,0,303,132,1,10,1,2,0.32157
2,1,160.0,100.0,300.0,15.7,51.6333,1.19,185,102.0,32.9,...,0,1,0,303,132,1,10,1,2,0.277514
3,1,80.0,350.0,1400.0,13.0,30.0,1.23,153,3.0,35.0,...,1,1,0,303,131,1,10,1,2,0.38
4,1,160.0,194.0,550.0,23.0,35.0,1.38,200,450.0,27.0,...,1,1,1,302,131,7,10,1,3,0.425909


In [4]:
print(df.columns.values)

['Lithology Code' 'Well Spacing' 'Net Pay Pay' 'Gross Pay' 'Porosity'
 'Swi' 'Oil FVFi' 'Temp' 'Permeability' 'API Gravity' 'Viscosity' 'OOIP'
 'Initial GOR' 'Pressure Initial' 'Fractured Faulted' 'Shale Breaks'
 'Major Gas Cap' 'Geologic Play' 'Deposition System'
 'Diagenetic Overprint' 'Structural Comp' 'Heterogeniety' 'Trap Type'
 'URF']


### Feature Selection

In [6]:
#Create a copy
df_train_test_set=df.copy()

Feature = df_train_test_set[[
    'Lithology Code', 
    'Well Spacing',
    'Net Pay Pay',
    'Gross Pay',
    'Porosity', 
    'Swi',
    'Oil FVFi',
    'Temp',
    'Permeability', 
    'API Gravity', 
    'Viscosity',
    'OOIP',
    'Initial GOR',
    'Pressure Initial',
    'Fractured Faulted',
    'Shale Breaks',
    'Major Gas Cap',
    'Geologic Play',
    'Deposition System',
    'Diagenetic Overprint',
    'Structural Comp',
    'Heterogeniety',
    'Trap Type'
]]
x=Feature

y = df_train_test_set['URF'].values

print(x.head())
print(y[0:5])
print(x.shape, y.shape)

   Lithology Code  Well Spacing  Net Pay Pay  Gross Pay  Porosity      Swi  \
0               1         120.0        320.0      600.0      14.0  40.0000   
1               1         160.0        100.0      160.0      18.1  35.0000   
2               1         160.0        100.0      300.0      15.7  51.6333   
3               1          80.0        350.0     1400.0      13.0  30.0000   
4               1         160.0        194.0      550.0      23.0  35.0000   

   Oil FVFi  Temp  Permeability  API Gravity  ...  Pressure Initial  \
0     1.586   153          10.0         41.0  ...            4251.0   
1     1.230   163          65.0         35.6  ...            4009.0   
2     1.190   185         102.0         32.9  ...            4457.0   
3     1.230   153           3.0         35.0  ...            3900.0   
4     1.380   200         450.0         27.0  ...            4300.0   

   Fractured Faulted  Shale Breaks  Major Gas Cap  Geologic Play  \
0                  0             0  

### Train-Test Split 70-30

In [8]:
from sklearn.model_selection import train_test_split

random_state = 42
test_size = 0.3

x_train, x_test, y_train, y_test  = train_test_split(
            x, y, test_size = test_size, random_state = random_state
)

print('Train Set: ', x_train.shape, y_train.shape)
print(x_train['Permeability'][0:5])
print('Test Set: ', x_test.shape, y_test.shape)
print(x_test['Permeability'][0:5])

Train Set:  (221, 23) (221,)
180      15.0
303      46.0
266       5.0
157       3.0
66     1000.0
Name: Permeability, dtype: float64
Test Set:  (96, 23) (96,)
73      50.0
280     38.0
25      70.0
255      5.0
9      400.0
Name: Permeability, dtype: float64


### Normalization as per Pycaret z-score

In [9]:
#https://towardsdatascience.com/data-normalization-with-pandas-and-scikit-learn-7c1cc6ed6475

from sklearn.preprocessing import StandardScaler

X_train = preprocessing.StandardScaler().fit(x_train).transform(x_train)
X_test = preprocessing.StandardScaler().fit(x_test).transform(x_test)
print('Standardization X Training Set: ', X_train[0:5])
print('Standardization X Testing Set: ', X_test[0:5])

Standardization X Training Set:  [[-7.12836926e-01 -4.93495232e-01 -6.22497137e-01 -3.73078637e-01
  -1.02154633e+00  2.07406413e+00 -2.33349946e-01 -8.84660361e-01
  -2.31496028e-01  6.56055401e-01 -7.40229581e-02 -1.98355456e-01
  -3.32463525e-01 -7.61729773e-01 -7.38548946e-01 -1.27708746e+00
  -4.98585570e-01 -5.54445821e-01 -9.23021962e-01 -6.09101686e-01
   2.81501824e+00 -6.71477276e-01 -4.10992682e-01]
 [-7.12836926e-01  7.02447893e-01 -6.27744681e-01 -6.53129649e-01
  -5.12532727e-01 -2.63073147e-01 -1.10331477e+00 -1.67543575e-01
  -2.15112018e-01  4.98652572e-01 -7.41421078e-02 -1.81534412e-01
  -9.12356315e-01  1.81604030e-01 -7.38548946e-01 -1.27708746e+00
  -4.98585570e-01  3.32071741e-01 -1.45396380e+00 -6.09101686e-01
  -4.54851202e-01 -6.71477276e-01  1.12848838e+00]
 [ 2.86754854e+00 -3.56816018e-01  3.89529241e-01 -2.17494741e-01
  -7.84795819e-01 -1.58406378e+00  3.29568472e-01 -7.65140897e-01
  -2.36781193e-01  9.39024380e-02 -7.42329595e-02 -1.13603274e-01
   1.67

### Transformation as per Pycaret 'yeo johnson'

In [10]:
from sklearn.preprocessing import PowerTransformer

Xt_train = preprocessing.PowerTransformer(method='yeo-johnson', standardize=False).fit(X_train).transform(X_train)
Xt_test = preprocessing.PowerTransformer(method='yeo-johnson', standardize=False).fit(X_test).transform(X_test)
print('Transformed X Training Set: ', Xt_train[0:5])
print('Transformed X Testing Set: ', Xt_test[0:5])

Transformed X Training Set:  [[-1.12159764e+00 -7.78502666e-01 -1.00040509e+00 -4.83416109e-01
  -1.19896066e+00  1.83426420e+00 -2.57120401e-01 -1.05550717e+00
  -4.29107361e-01  7.22299208e-01 -4.29824130e-01 -3.68773577e-01
  -3.83742718e-01 -9.51313530e-01 -1.16627752e+00 -7.73246180e-01
  -1.11245467e+00 -7.61731931e-01 -9.89205543e-01 -1.02231682e+00
   3.86086628e-01 -1.10652962e+00 -3.98464977e-01]
 [-1.12159764e+00  4.11856079e-01 -1.01243752e+00 -1.00453755e+00
  -5.60496483e-01 -2.69148777e-01 -1.62094243e+00 -1.74278723e-01
  -3.81688156e-01  5.37917878e-01 -4.31891241e-01 -3.20034305e-01
  -1.29475800e+00  1.71162059e-01 -1.16627752e+00 -7.73246180e-01
  -1.11245467e+00  2.75283494e-01 -1.60481639e+00 -1.02231682e+00
  -9.78308734e-01 -1.10652962e+00  1.21426564e+00]
 [ 1.00618954e+00 -4.99938710e-01  2.93419690e-01 -2.54141306e-01
  -8.92856999e-01 -1.75796147e+00  2.89747411e-01 -8.94733966e-01
  -4.45129311e-01  9.54035991e-02 -4.33473577e-01 -1.61673070e-01
   1.558850

## Note that Ignore Low Variance and Remove Outliers is not implemented as it is assumed it will not make a significant difference to model

### Create a confusion matrix

In [11]:
def plot_conf_mat(cnf_matrix, classes, normalize, cmap, width, height):
    plt.figure(figsize=(width, height))
    if normalize == True:
        # np.newaxis - make it as column vector by inserting an axis 
        # along second dimension
        cnf_matrix = cnf_matrix.astype('float')/ cnf_matrix.sum(
            axis=1)[:,np.newaxis]
        print("Normalized Confusion Matrix")
    else:
        print("Confusion Matrix, non-normalized")
    
    #imshow() - creates image from 2D numpy array.
    plt.imshow(cnf_matrix, interpolation = 'nearest', cmap=cmap)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks (tick_marks, classes, rotation=45)
    plt.yticks (tick_marks, classes)
    
    fmt = '.2f'if normalize else 'd'
    thres = cnf_matrix.max()/2
    for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, format(cnf_matrix[i, j], fmt),
                 horizontalalignment='center',
                 fontsize=20,
                 color = 'yellow' if cnf_matrix[i, j] > thres else 'white'
                )
    # plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.grid(None)
    return

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Regressor
win_rf=RandomForestRegressor(n_estimators=100, criterion="mse", max_depth=None, min_samples_split=2)
win_rf.fit(Xt_train, y_train)
yhat_rf = win_rf.predict(Xt_test)

# # accuracy_score(y_true, y_pred)
# mean_acc_rf = accuracy_score(y_test, yhat_rf)
# conf_mat_rf = confusion_matrix(y_test, yhat_rf)

# print('Random Forest')
# print('==============================================\n')
# print("True values:", y_test[0:5].round(1))
# print("Pred values:", yhat_rf[0:5].round(1))
# print('\n')
# print('Mean Accuracy:', mean_acc_rf)
# print('\n')
# print('F1 Score:\n',classification_report(y_test, yhat_rf))

# plot_conf_mat(conf_mat_rf, 
#               classes=['Wins (0)', 
#                        'Losses (1)'],
#               normalize=normalize, cmap=cmap, width=width, height=height)

In [30]:
errors = abs(yhat_rf - y_test)
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'V/V')
mape = np.mean(100 * (errors / y_test))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.1 degrees.
Accuracy: 47.85 %.
