In [1]:
import openpyxl
from openpyxl import load_workbook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the whole workbook in one call. With sheet_name=None the result is a
# dict keyed by sheet name (one DataFrame per sheet); header=0 takes the
# column names from the first row of each sheet.
df = pd.read_excel(
    "dataset.xlsx",
    header=0,
    sheet_name=None,
)

# Section 4.1 (Table 4.1.1 and Table 4.1.4)

# Table 4.1.1

In [3]:
# Tag every worksheet with an integer `wellid` (its position in the workbook
# dict) and keep the tagged frames addressable by sheet name in `all_df`.
all_df = {}
for well_no, (sheet_name, sheet_frame) in enumerate(df.items()):
    # In-place column assignment: the frame stored in `df` gains `wellid` too,
    # which is why concatenating `df.values()` below already includes it.
    sheet_frame['wellid'] = well_no
    all_df[sheet_name] = sheet_frame
# Stack all sheets row-wise; each sheet keeps its own row index.
all_data = pd.concat(list(df.values()))
# To restrict the analysis to a single sheet instead: all_data = all_df['1X-02']

In [4]:
# Keep only the log curves, the two label columns and the well id, then throw
# away every row that has a missing value in any of those columns.
all_data = all_data.loc[
    :, ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB", "Log_Facies", "Fluid", "wellid"]
].dropna()
# `data` is just a second name for the same frame at this point.
data = all_data

In [5]:
# Drop rows whose DEPTH is missing (explicit .notna() instead of the
# `x == x` NaN trick). The .copy() makes `data` an independent frame, so the
# column assignments in the following cells do not write to a slice of
# `all_data` (which otherwise triggers pandas' SettingWithCopyWarning).
data = data[data['DEPTH'].notna()].copy()

In [6]:
# Cast the facies label to integers with one vectorized conversion instead of
# calling int() on every element through a Python lambda.
data['Log_Facies'] = data['Log_Facies'].astype('int64')

In [7]:
# Every (Fluid, Log_Facies) pair in a fixed order; a pair's position in this
# list is used as its combined class id.
comb = [(fluid, facies) for fluid in (1, 2) for facies in (1, 2, 3)]


def label_facies(row):
    """Return the combined class id for one record.

    `row` must support lookup by the keys 'Fluid' and 'Log_Facies'. The result
    is the index of the (Fluid, Log_Facies) pair inside `comb`; a pair that is
    not listed there raises ValueError (from list.index).
    """
    pair = (row['Fluid'], row['Log_Facies'])
    return comb.index(pair)
# Apply label_facies to each row (axis='columns' is the row-wise direction)
# and store the returned index as the combined Fluid/Log_Facies label.
data['MixLabel'] = data.apply(label_facies, axis='columns')

In [8]:
# Sanity check: (rows, columns) of the cleaned frame after MixLabel was added.
data.shape

(4695, 10)

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

In [10]:
# Fit a StandardScaler on the six log curves of all wells and standardise them.
# OX is a second reference to the full scaled matrix; the ablation cell below
# slices its columns out of OX.
ss = StandardScaler()
X = ss.fit_transform(data.loc[:, ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB"]])
OX = X

In [11]:
# Target arrays, one per label column, in the order listed in Y_Label.
Y_Label = ['Log_Facies', 'Fluid', 'MixLabel']
Y1, Y2, Y3 = (data[label_name].values for label_name in Y_Label)

In [12]:
# Number of stratified random 90/10 train/test splits per target.
num_of_shuffler = 10
# Fixed seed: without random_state every kernel run draws different splits,
# so the accuracies printed below could never be reproduced.
split_seed = 0

# One split generator per target; stratification follows that target's classes.
# These are generators: they can be consumed only once (done in the next cell).
shuffler1 = StratifiedShuffleSplit(
    n_splits=num_of_shuffler, test_size=0.1, random_state=split_seed
).split(X, Y1)
shuffler2 = StratifiedShuffleSplit(
    n_splits=num_of_shuffler, test_size=0.1, random_state=split_seed
).split(X, Y2)
shuffler3 = StratifiedShuffleSplit(
    n_splits=num_of_shuffler, test_size=0.1, random_state=split_seed
).split(X, Y3)

In [13]:
# Materialise the split generators into lists. Each generator is exhausted by
# this, so re-running the cell without re-creating the shufflers yields [].
s1 = list(shuffler1)
s2 = list(shuffler2)
s3 = list(shuffler3)

In [14]:
# Store every split as a (train_idx, validation_idx) tuple, one list per target.
indices1 = list(map(tuple, s1))
indices2 = list(map(tuple, s2))
indices3 = list(map(tuple, s3))

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [16]:
# Classifier *class* (not an instance): the evaluation cells call CLF() to
# build a fresh model for every fit.
CLF = LogisticRegression


In [17]:
# Leave-one-feature-out ablation over all wells (Table 4.1.1).
#
# For every feature (plus a final "nothing" run that keeps all features) and
# for every target, a fresh CLF is fitted on each stratified split and the
# test-set hits are averaged over the splits.
ALL_Y = [Y1, Y2, Y3]
all_indices = [indices1, indices2, indices3]
# Entry k names the column removed in run k; "nothing" marks the baseline run.
ablation_names = ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB", "nothing"]

for k, col in enumerate(ablation_names):
    # Drop column k. For the last k (one past the final column) both slices
    # together cover every column, so the full matrix is used.
    X = np.concatenate([OX[:, :k], OX[:, k+1:]], axis=1)
    for j, indices in enumerate(all_indices):
        Y = ALL_Y[j]
        all_total, all_correct = 0, 0
        for train_idx, test_idx in indices:
            model = CLF()
            model.fit(X[train_idx], Y[train_idx])
            pred = model.predict(X[test_idx])

            all_total += pred.shape[0]
            all_correct += int(np.count_nonzero(pred == Y[test_idx]))

        # Average over the number of splits actually evaluated instead of a
        # hard-coded 10, so the reported means stay right if the split count changes.
        n_splits = len(indices)
        all_total /= n_splits
        all_correct /= n_splits
        print("without: {} label: {} total: {} correct: {} correct percent {:.3f}".format(col, Y_Label[j], all_total, all_correct, all_correct/all_total ))

without: DEPTH label: Log_Facies total: 470.0 correct: 380.7 correct percent 0.810
without: DEPTH label: Fluid total: 470.0 correct: 443.8 correct percent 0.944
without: DEPTH label: MixLabel total: 470.0 correct: 362.2 correct percent 0.771
without: DT label: Log_Facies total: 470.0 correct: 380.6 correct percent 0.810
without: DT label: Fluid total: 470.0 correct: 445.0 correct percent 0.947
without: DT label: MixLabel total: 470.0 correct: 366.0 correct percent 0.779
without: GR label: Log_Facies total: 470.0 correct: 379.0 correct percent 0.806
without: GR label: Fluid total: 470.0 correct: 442.7 correct percent 0.942
without: GR label: MixLabel total: 470.0 correct: 358.9 correct percent 0.764
without: LLD label: Log_Facies total: 470.0 correct: 380.6 correct percent 0.810
without: LLD label: Fluid total: 470.0 correct: 386.7 correct percent 0.823
without: LLD label: MixLabel total: 470.0 correct: 310.8 correct percent 0.661
without: NPHI label: Log_Facies total: 470.0 correct: 35

# Table 4.1.4

In [18]:
# Training well for the cross-well experiment (original note: 1 means 1F-11).
well_index = 1
tmp_data = data.loc[data['wellid'] == well_index]
# Re-fit the scaler on this well only; OX and X both refer to the scaled matrix.
ss = StandardScaler()
OX = X = ss.fit_transform(tmp_data.loc[:, ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB"]])

In [19]:
# Held-out well: the rows with wellid 2.
# (Alternative kept from the original: test on every well except the training
#  one via data[data['wellid'] != well_index].)
test_data = data.loc[data['wellid'] == 2]
X_T = test_data.loc[:, ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB"]]
# Reuse the scaler fitted on the training well; it is not re-fitted here.
XT = ss.transform(X_T)

In [20]:
# Training-well targets, in the order given by Y_Label.
Y_Label = ['Log_Facies', 'Fluid', 'MixLabel']
ALL_Y_TRAIN = [tmp_data[label_name].values for label_name in Y_Label]
Y1, Y2, Y3 = ALL_Y_TRAIN

In [21]:
# Test-well targets, in the same order as the training targets.
ALL_Y_TEST = [
    test_data[label_name].values
    for label_name in ('Log_Facies', 'Fluid', 'MixLabel')
]
Y1_test, Y2_test, Y3_test = ALL_Y_TEST

In [22]:
# Leave-one-feature-out ablation across wells (Table 4.1.4): fit on the
# training well, score on the held-out well, once per dropped feature and
# once with all features ("nothing").
count = []
shape = 1
ALL_Y = [Y1, Y2, Y3]
seq = 0

dropped_names = ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB", "nothing"]

for k in range(7):
    # Same column removed from the train and the test matrix; for the last k
    # the two slices cover every column, so nothing is removed.
    X = np.hstack((OX[:, :k], OX[:, k + 1:]))
    X_T = np.hstack((XT[:, :k], XT[:, k + 1:]))
    col = dropped_names[k]

    for j, (Y_train, Y_test) in enumerate(zip(ALL_Y_TRAIN, ALL_Y_TEST)):
        X_train, X_test = X, X_T

        model = CLF()
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)

        # Single train/test pair per run, so the totals are plain counts.
        all_total = pred.shape[0]
        all_correct = int(np.count_nonzero(pred == Y_test))

        seq += 1
        print("without: {} label: {} total: {} correct: {} correct percent {:.3f}".format(col, Y_Label[j], all_total, all_correct, all_correct/all_total ))

without: DEPTH label: Log_Facies total: 1025 correct: 560 correct percent 0.546
without: DEPTH label: Fluid total: 1025 correct: 699 correct percent 0.682
without: DEPTH label: MixLabel total: 1025 correct: 471 correct percent 0.460
without: DT label: Log_Facies total: 1025 correct: 336 correct percent 0.328
without: DT label: Fluid total: 1025 correct: 321 correct percent 0.313
without: DT label: MixLabel total: 1025 correct: 104 correct percent 0.101
without: GR label: Log_Facies total: 1025 correct: 64 correct percent 0.062
without: GR label: Fluid total: 1025 correct: 321 correct percent 0.313
without: GR label: MixLabel total: 1025 correct: 104 correct percent 0.101
without: LLD label: Log_Facies total: 1025 correct: 64 correct percent 0.062
without: LLD label: Fluid total: 1025 correct: 321 correct percent 0.313
without: LLD label: MixLabel total: 1025 correct: 104 correct percent 0.101
without: NPHI label: Log_Facies total: 1025 correct: 69 correct percent 0.067
without: NPHI la