In [1]:
import openpyxl
from openpyxl import load_workbook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel("dataset.xlsx", sheet_name=None, header=0)

# Section 4.3

In [3]:
all_df = {}
for i, d in enumerate(df.items()):
    name, data = d
    data['wellid'] = i
    all_df[name] = data
all_data = pd.concat(df.values())

In [4]:
all_data = all_data[[ "DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB", "Log_Facies", "Fluid", "wellid"]]
all_data = all_data.dropna(axis=0, how='any')
data = all_data

In [5]:
data = data[data['DEPTH'] == data["DEPTH"]]   # drop NaN value row

In [6]:
data['Log_Facies'] = data['Log_Facies'].apply(lambda x: int(x))

In [7]:
comb = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)]
def label_facies(row):
    f = row['Fluid']
    lf = row['Log_Facies']
    return comb.index((f, lf))
data['MixLabel'] = data.apply(label_facies, axis=1)

In [8]:
data.shape

(4695, 10)

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

In [10]:
X = data[["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB",]]
ss = StandardScaler()
X = ss.fit_transform(X)

In [11]:
Y1 = data['Log_Facies'].values
Y2 = data['Fluid'].values
Y3 = data['MixLabel'].values
Y_Label = ['Log_Facies', 'Fluid', 'MixLabel']

# Cross validation

In [12]:
num_of_shuffler = 10
shuffler1 = StratifiedShuffleSplit(n_splits=num_of_shuffler, test_size=0.1).split(X, Y1)
shuffler2 = StratifiedShuffleSplit(n_splits=num_of_shuffler, test_size=0.1).split(X, Y2)
shuffler3 = StratifiedShuffleSplit(n_splits=num_of_shuffler, test_size=0.1).split(X, Y3)

In [13]:
s1 = [x for x in shuffler1]
s2 = [x for x in shuffler2]
s3 = [x for x in shuffler3]

In [14]:
indices1 = [(train_idx, validation_idx) for train_idx, validation_idx in s1]
indices2 = [(train_idx, validation_idx) for train_idx, validation_idx in s2]
indices3 = [(train_idx, validation_idx) for train_idx, validation_idx in s3]

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [16]:
CLF = LogisticRegression


In [17]:
count = []
shape = 1
ALL_Y = [Y1, Y2, Y3]
all_indices = [indices1, indices2, indices3]
seq = 0
for p in [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]:
    for (j, indices) in enumerate([indices1, indices2, indices3]):
        all_total, all_tested, all_correct = 0, 0, 0
        for i in range(num_of_shuffler):
            Y = ALL_Y[j]
            X_train = X[indices[i][0]]
            Y_train = Y[indices[i][0]]
            X_test = X[indices[i][1]]
            Y_test = Y[indices[i][1]]
            shape = Y_test.shape[0]
            model = CLF()
            model.fit(X_train, Y_train)
            pred = model.predict(X_test)
            pred_p = model.predict_proba(X_test)

            y_pred_filter = pred[pred_p.max(axis=1)>p]
            y_test_filter = Y_test[pred_p.max(axis=1)>p]
            
            total = pred.shape[0]
            tested = y_pred_filter.shape[0]
            correct = y_pred_filter[y_pred_filter==y_test_filter].shape[0]
            all_total += total
            all_correct += correct
            all_tested += tested
        all_total /= 10
        all_correct /= 10
        all_tested /= 10
        seq += 1
        print("seq: {} label: {} prob: {} total: {} tested: {} tested percent: {:.2f} correct: {} correct percent {:.2f}".format(seq, Y_Label[j], p, all_total, all_tested, all_tested/all_total, all_correct, all_correct/all_tested ))
#             print(correct, tested, total, i, j, p)
    #         print(correct, total, correct/total)
#         ave = sum(count)/len(count)
#         print("average:", ave, ave/shape)

seq: 1 label: Log_Facies prob: 0 total: 470.0 tested: 470.0 tested percent: 1.00 correct: 380.7 correct percent 0.81
seq: 2 label: Fluid prob: 0 total: 470.0 tested: 470.0 tested percent: 1.00 correct: 446.7 correct percent 0.95
seq: 3 label: MixLabel prob: 0 total: 470.0 tested: 470.0 tested percent: 1.00 correct: 368.8 correct percent 0.78
seq: 4 label: Log_Facies prob: 0.5 total: 470.0 tested: 457.9 tested percent: 0.97 correct: 376.1 correct percent 0.82
seq: 5 label: Fluid prob: 0.5 total: 470.0 tested: 470.0 tested percent: 1.00 correct: 446.7 correct percent 0.95
seq: 6 label: MixLabel prob: 0.5 total: 470.0 tested: 405.7 tested percent: 0.86 correct: 334.2 correct percent 0.82
seq: 7 label: Log_Facies prob: 0.6 total: 470.0 tested: 404.8 tested percent: 0.86 correct: 348.1 correct percent 0.86
seq: 8 label: Fluid prob: 0.6 total: 470.0 tested: 456.6 tested percent: 0.97 correct: 440.5 correct percent 0.96
seq: 9 label: MixLabel prob: 0.6 total: 470.0 tested: 318.6 tested percen

# Cross Wells

In [18]:
len(all_df)

5

In [22]:
count = []
shape = 1
ALL_Y = [Y1, Y2, Y3]
all_indices = [indices1, indices2, indices3]
seq = 0
for p in [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]:
    for j in range(len(ALL_Y)):
        all_total, all_tested, all_correct = 0, 0, 0
        for i in range(len(all_df)):
            Y = ALL_Y[j]
            X_train = X[data['wellid']!=i]
            Y_train = Y[data['wellid']!=i]
            X_test = X[data['wellid']==i]
            Y_test = Y[data['wellid']==i]
            shape = Y_test.shape[0]
#             print(X_train.shape, X_test.shape, i, j, p)
            model = CLF()
            model.fit(X_train, Y_train)
            pred = model.predict(X_test)
            pred_p = model.predict_proba(X_test)

            y_pred_filter = pred[pred_p.max(axis=1)>p]
            y_test_filter = Y_test[pred_p.max(axis=1)>p]
            
            total = pred.shape[0]
            tested = y_pred_filter.shape[0]
            correct = y_pred_filter[y_pred_filter==y_test_filter].shape[0]
            all_total += total
            all_correct += correct
            all_tested += tested
        all_total /= 10
        all_correct /= 10
        all_tested /= 10
        seq += 1
        print("seq: {} label: {} prob: {} total: {} tested: {} tested percent: {:.4f} correct: {} correct percent {:.4f}".format(seq, Y_Label[j], p, all_total, all_tested, all_tested/all_total, all_correct, all_correct/all_tested ))
#             print(correct, tested, total, i, j, p)
    #         print(correct, total, correct/total)
#         ave = sum(count)/len(count)

seq: 1 label: Log_Facies prob: 0 total: 469.5 tested: 469.5 tested percent: 1.0000 correct: 373.6 correct percent 0.7957
seq: 2 label: Fluid prob: 0 total: 469.5 tested: 469.5 tested percent: 1.0000 correct: 417.4 correct percent 0.8890
seq: 3 label: MixLabel prob: 0 total: 469.5 tested: 469.5 tested percent: 1.0000 correct: 332.4 correct percent 0.7080
seq: 4 label: Log_Facies prob: 0.5 total: 469.5 tested: 457.3 tested percent: 0.9740 correct: 368.2 correct percent 0.8052
seq: 5 label: Fluid prob: 0.5 total: 469.5 tested: 469.5 tested percent: 1.0000 correct: 417.4 correct percent 0.8890
seq: 6 label: MixLabel prob: 0.5 total: 469.5 tested: 398.8 tested percent: 0.8494 correct: 293.3 correct percent 0.7355
seq: 7 label: Log_Facies prob: 0.6 total: 469.5 tested: 396.7 tested percent: 0.8449 correct: 335.2 correct percent 0.8450
seq: 8 label: Fluid prob: 0.6 total: 469.5 tested: 461.3 tested percent: 0.9825 correct: 413.4 correct percent 0.8962
seq: 9 label: MixLabel prob: 0.6 total: 4

# Single Well

In [33]:
import itertools
count = []
shape = 1
ALL_Y = [Y1, Y2, Y3]
all_indices = [indices1, indices2, indices3]
seq = 0
for p in [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]:
    for j in range(len(ALL_Y)):
        for (m, n) in itertools.permutations([0, 1, 2, 3, 4], 2):
            all_total, all_tested, all_correct = 0, 0, 0
    #         for i in range(len(all_df)):
            Y = ALL_Y[j]
            X_train = X[data['wellid']==m]
            Y_train = Y[data['wellid']==m]
            X_test = X[data['wellid']==n]
            Y_test = Y[data['wellid']==n]
            shape = Y_test.shape[0]
    #         print(X_train.shape, X_test.shape, i, j, p)
            model = CLF()
            model.fit(X_train, Y_train)
            pred = model.predict(X_test)
            pred_p = model.predict_proba(X_test)

            y_pred_filter = pred[pred_p.max(axis=1)>p]
            y_test_filter = Y_test[pred_p.max(axis=1)>p]

            total = pred.shape[0]
            tested = y_pred_filter.shape[0]
            correct = y_pred_filter[y_pred_filter==y_test_filter].shape[0]
            all_total += total
            all_correct += correct
            all_tested += tested
            all_total /= 10
            all_correct /= 10
            all_tested /= 10
            seq += 1
            print("seq: {} train: {} test: {}  label: {} prob: {} total: {} tested: {} tested percent: {:.4f} correct: {} correct percent {:.4f}".format(seq, m, n, Y_Label[j], p, all_total, all_tested, all_tested/all_total, all_correct, all_correct/(all_tested if all_tested > 0 else 0.1) ))

seq: 1 train: 0 test: 1  label: Log_Facies prob: 0 total: 44.6 tested: 44.6 tested percent: 1.0000 correct: 37.6 correct percent 0.8430
seq: 2 train: 0 test: 2  label: Log_Facies prob: 0 total: 102.5 tested: 102.5 tested percent: 1.0000 correct: 72.5 correct percent 0.7073
seq: 3 train: 0 test: 3  label: Log_Facies prob: 0 total: 109.4 tested: 109.4 tested percent: 1.0000 correct: 86.9 correct percent 0.7943
seq: 4 train: 0 test: 4  label: Log_Facies prob: 0 total: 95.3 tested: 95.3 tested percent: 1.0000 correct: 62.5 correct percent 0.6558
seq: 5 train: 1 test: 0  label: Log_Facies prob: 0 total: 117.7 tested: 117.7 tested percent: 1.0000 correct: 70.4 correct percent 0.5981
seq: 6 train: 1 test: 2  label: Log_Facies prob: 0 total: 102.5 tested: 102.5 tested percent: 1.0000 correct: 33.1 correct percent 0.3229
seq: 7 train: 1 test: 3  label: Log_Facies prob: 0 total: 109.4 tested: 109.4 tested percent: 1.0000 correct: 64.6 correct percent 0.5905
seq: 8 train: 1 test: 4  label: Log_Fa

seq: 118 train: 4 test: 1  label: MixLabel prob: 0.5 total: 44.6 tested: 44.6 tested percent: 1.0000 correct: 13.1 correct percent 0.2937
seq: 119 train: 4 test: 2  label: MixLabel prob: 0.5 total: 102.5 tested: 61.6 tested percent: 0.6010 correct: 35.8 correct percent 0.5812
seq: 120 train: 4 test: 3  label: MixLabel prob: 0.5 total: 109.4 tested: 76.0 tested percent: 0.6947 correct: 52.1 correct percent 0.6855
seq: 121 train: 0 test: 1  label: Log_Facies prob: 0.6 total: 44.6 tested: 31.9 tested percent: 0.7152 correct: 26.8 correct percent 0.8401
seq: 122 train: 0 test: 2  label: Log_Facies prob: 0.6 total: 102.5 tested: 77.4 tested percent: 0.7551 correct: 60.6 correct percent 0.7829
seq: 123 train: 0 test: 3  label: Log_Facies prob: 0.6 total: 109.4 tested: 93.1 tested percent: 0.8510 correct: 79.0 correct percent 0.8485
seq: 124 train: 0 test: 4  label: Log_Facies prob: 0.6 total: 95.3 tested: 83.1 tested percent: 0.8720 correct: 58.7 correct percent 0.7064
seq: 125 train: 1 test

seq: 233 train: 3 test: 0  label: MixLabel prob: 0.7 total: 117.7 tested: 75.4 tested percent: 0.6406 correct: 70.8 correct percent 0.9390
seq: 234 train: 3 test: 1  label: MixLabel prob: 0.7 total: 44.6 tested: 24.1 tested percent: 0.5404 correct: 10.1 correct percent 0.4191
seq: 235 train: 3 test: 2  label: MixLabel prob: 0.7 total: 102.5 tested: 52.7 tested percent: 0.5141 correct: 41.5 correct percent 0.7875
seq: 236 train: 3 test: 4  label: MixLabel prob: 0.7 total: 95.3 tested: 65.2 tested percent: 0.6842 correct: 45.3 correct percent 0.6948
seq: 237 train: 4 test: 0  label: MixLabel prob: 0.7 total: 117.7 tested: 32.4 tested percent: 0.2753 correct: 25.6 correct percent 0.7901
seq: 238 train: 4 test: 1  label: MixLabel prob: 0.7 total: 44.6 tested: 43.0 tested percent: 0.9641 correct: 12.8 correct percent 0.2977
seq: 239 train: 4 test: 2  label: MixLabel prob: 0.7 total: 102.5 tested: 24.0 tested percent: 0.2341 correct: 18.7 correct percent 0.7792
seq: 240 train: 4 test: 3  lab

seq: 350 train: 2 test: 1  label: MixLabel prob: 0.9 total: 44.6 tested: 2.0 tested percent: 0.0448 correct: 0.0 correct percent 0.0000
seq: 351 train: 2 test: 3  label: MixLabel prob: 0.9 total: 109.4 tested: 9.0 tested percent: 0.0823 correct: 7.0 correct percent 0.7778
seq: 352 train: 2 test: 4  label: MixLabel prob: 0.9 total: 95.3 tested: 7.9 tested percent: 0.0829 correct: 6.6 correct percent 0.8354
seq: 353 train: 3 test: 0  label: MixLabel prob: 0.9 total: 117.7 tested: 26.7 tested percent: 0.2268 correct: 26.7 correct percent 1.0000
seq: 354 train: 3 test: 1  label: MixLabel prob: 0.9 total: 44.6 tested: 4.1 tested percent: 0.0919 correct: 2.6 correct percent 0.6341
seq: 355 train: 3 test: 2  label: MixLabel prob: 0.9 total: 102.5 tested: 15.5 tested percent: 0.1512 correct: 12.7 correct percent 0.8194
seq: 356 train: 3 test: 4  label: MixLabel prob: 0.9 total: 95.3 tested: 36.3 tested percent: 0.3809 correct: 31.6 correct percent 0.8705
seq: 357 train: 4 test: 0  label: MixLa

seq: 410 train: 2 test: 1  label: MixLabel prob: 0.95 total: 44.6 tested: 1.0 tested percent: 0.0224 correct: 0.0 correct percent 0.0000
seq: 411 train: 2 test: 3  label: MixLabel prob: 0.95 total: 109.4 tested: 4.4 tested percent: 0.0402 correct: 3.1 correct percent 0.7045
seq: 412 train: 2 test: 4  label: MixLabel prob: 0.95 total: 95.3 tested: 4.8 tested percent: 0.0504 correct: 3.9 correct percent 0.8125
seq: 413 train: 3 test: 0  label: MixLabel prob: 0.95 total: 117.7 tested: 7.0 tested percent: 0.0595 correct: 7.0 correct percent 1.0000
seq: 414 train: 3 test: 1  label: MixLabel prob: 0.95 total: 44.6 tested: 0.2 tested percent: 0.0045 correct: 0.2 correct percent 1.0000
seq: 415 train: 3 test: 2  label: MixLabel prob: 0.95 total: 102.5 tested: 4.8 tested percent: 0.0468 correct: 3.8 correct percent 0.7917
seq: 416 train: 3 test: 4  label: MixLabel prob: 0.95 total: 95.3 tested: 15.6 tested percent: 0.1637 correct: 14.4 correct percent 0.9231
seq: 417 train: 4 test: 0  label: Mi

In [27]:
import itertools

In [30]:
list(itertools.permutations([1, 2, 3], 2))

[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]