In [1]:
import openpyxl
from openpyxl import load_workbook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the whole workbook in one call. With sheet_name=None, read_excel returns
# a dict keyed by sheet name holding one DataFrame per sheet, so `df` is a
# mapping rather than a single frame. The first row of each sheet is the header.
df = pd.read_excel(
    "dataset.xlsx",
    sheet_name=None,
    header=0,
)

In [4]:
# Tag every sheet with a numeric id (its position in the workbook) and keep the
# tagged frames addressable by sheet name.
all_df = {}
for well_number, (sheet_name, sheet_frame) in enumerate(df.items()):
    # In-place column assignment: the frame stored in `df` gets the column too.
    sheet_frame['wellid'] = well_number
    all_df[sheet_name] = sheet_frame

# `df` holds the very same (now tagged) frame objects, so the stacked table
# carries the `wellid` column. Row labels are kept as they were per sheet.
all_data = pd.concat(df.values())
# To experiment on a single sheet instead: all_data = all_df['1X-02']

In [5]:
# Keep the six columns later used as features, the two label columns and the
# well id, then discard every row that has a missing value in any of them.
kept_columns = ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB", "Log_Facies", "Fluid", "wellid"]
all_data = all_data[kept_columns].dropna(axis=0, how='any')
data = all_data

In [6]:
# NaN never compares equal to itself, so this mask is False exactly on the rows
# whose DEPTH is NaN; indexing with it drops those rows.
depth = data['DEPTH']
data = data[depth.eq(depth)]

In [7]:
# Cast the facies codes to integers in a single vectorised step instead of
# calling int() once per row through apply. Like int(), the cast truncates
# toward zero; the width is spelled out so the dtype is the same on every
# platform.
data['Log_Facies'] = data['Log_Facies'].astype('int64')

In [8]:
# Every (Fluid, Log_Facies) pair the combined label can encode; a pair's
# position in this list is its combined class index.
comb = [(fluid, facies) for fluid in (1, 2) for facies in (1, 2, 3)]


def label_facies(row):
    """Return the combined class index for one row.

    ``row`` only needs to support ``row['Fluid']`` and ``row['Log_Facies']``.
    The result is the position of that (Fluid, Log_Facies) pair in ``comb``.
    A pair that is not listed raises ``ValueError`` (from ``list.index``).
    """
    pair = (row['Fluid'], row['Log_Facies'])
    return comb.index(pair)
# Call label_facies once per row (axis=1); the index it returns is stored as
# the combined Fluid/Log_Facies class in the new MixLabel column.
data['MixLabel'] = data.apply(label_facies, axis=1)

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

In [10]:
# Standardise the six feature columns. `X` ends up as the scaled array returned
# by fit_transform and `ss` keeps the fitted scaler.
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split is drawn.
ss = StandardScaler()
X = ss.fit_transform(data[["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB"]])

In [11]:
# Names of the three target columns, in the order the evaluation cells report
# them, and one value array per target in that same order.
Y_Label = ['Log_Facies', 'Fluid', 'MixLabel']
Y1, Y2, Y3 = (data[column].values for column in Y_Label)

In [12]:
# Number of random stratified 90/10 splits drawn for each target.
num_of_shuffler = 10
# Fixed seed so that re-running the notebook reproduces the same splits (and
# therefore the same scores).
RANDOM_STATE = 42


def _stratified_splits(features, labels):
    """Return the (train_idx, test_idx) pairs of the stratified splits as a list.

    ``split()`` returns a single-use generator. Materialising it here means the
    cells that read ``shuffler1..3`` can be re-run without silently getting an
    empty result.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=num_of_shuffler,
        test_size=0.1,
        random_state=RANDOM_STATE,
    )
    return list(splitter.split(features, labels))


shuffler1 = _stratified_splits(X, Y1)
shuffler2 = _stratified_splits(X, Y2)
shuffler3 = _stratified_splits(X, Y3)

In [13]:
# Materialise each split iterable into a list, in the order 1, 2, 3.
s1, s2, s3 = (list(folds) for folds in (shuffler1, shuffler2, shuffler3))

In [14]:
# One list per target; each entry is a (train indices, validation indices)
# tuple for one split.
indices1, indices2, indices3 = (
    [(train_idx, validation_idx) for train_idx, validation_idx in folds]
    for folds in (s1, s2, s3)
)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [16]:
# Classifier class shared by the evaluation cells below: each run builds a
# fresh model with default settings via CLF(). Rebind this name to try another
# estimator class.
CLF = LogisticRegression


In [17]:
# Repeated stratified hold-out evaluation on the pooled rows of all wells.
# For each target, a fresh CLF() is trained on every split and one summary line
# is printed with the per-split averages of test size and correct predictions.
ALL_Y = [Y1, Y2, Y3]
all_indices = [indices1, indices2, indices3]

for j, indices in enumerate(all_indices):
    Y = ALL_Y[j]
    all_total, all_correct = 0, 0

    for train_idx, test_idx in indices:
        # `X` was standardised on all rows, held-out ones included. Scaling is
        # affine, so re-fitting a scaler on the training rows alone yields the
        # same features as scaling the raw training rows would, and it keeps
        # the held-out rows' statistics out of training. The per-well
        # evaluation further down the notebook also scales per split.
        split_scaler = StandardScaler()
        X_train = split_scaler.fit_transform(X[train_idx])
        X_test = split_scaler.transform(X[test_idx])
        Y_train, Y_test = Y[train_idx], Y[test_idx]

        model = CLF()
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)

        all_total += pred.shape[0]
        all_correct += int(np.sum(pred == Y_test))

    # Average over the splits that were actually evaluated, not a literal 10.
    n_splits = len(indices)
    all_total /= n_splits
    all_correct /= n_splits
    print("label: {} total: {} correct: {} correct percent {:.4f}".format(Y_Label[j], all_total, all_correct, all_correct/all_total ))

label: Log_Facies total: 470.0 correct: 381.1 correct percent 0.8109
label: Fluid total: 470.0 correct: 449.9 correct percent 0.9572
label: MixLabel total: 470.0 correct: 371.2 correct percent 0.7898


In [18]:
# Leave-one-well-out evaluation: every well is held out once, the scaler and a
# fresh CLF() are fitted on the remaining wells, and hits are summed over all
# held-out wells. One summary line per target is printed at the end.
#
# Everything here lives in cell-local names, so the notebook-level X, ss and
# Y1..Y3 used by the earlier split-based cells are left untouched and those
# cells stay re-runnable.
Y_Label = ['Log_Facies', 'Fluid', 'MixLabel']
lowo_feature_cols = ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB"]

lowo_total = {label: 0 for label in Y_Label}
lowo_correct = {label: 0 for label in Y_Label}

for well_index in range(len(all_df)):
    held_out = data['wellid'] == well_index
    train_df = data[~held_out]
    test_df = data[held_out]
    if train_df.empty or test_df.empty:
        # A well with no rows left after the NaN filtering has nothing to
        # score, and the scaler cannot transform an empty table.
        continue

    # The split and the scaling do not depend on the target, so do them once
    # per well and reuse them for all three targets.
    well_scaler = StandardScaler()
    train_X = well_scaler.fit_transform(train_df[lowo_feature_cols])
    test_X = well_scaler.transform(test_df[lowo_feature_cols])

    for label in Y_Label:
        model = CLF()
        model.fit(train_X, train_df[label].values)
        pred = model.predict(test_X)

        lowo_total[label] += pred.shape[0]
        lowo_correct[label] += int(np.sum(pred == test_df[label].values))

for label in Y_Label:
    all_total, all_correct = lowo_total[label], lowo_correct[label]
    print("label: {} total: {} correct: {} correct percent {:.4f}".format( label, all_total, all_correct, all_correct/all_total ))

label: Log_Facies total: 4695 correct: 3739 correct percent 0.7964
label: Fluid total: 4695 correct: 4161 correct percent 0.8863
label: MixLabel total: 4695 correct: 3324 correct percent 0.7080


In [19]:
# Per-well evaluation: each well is cleaned and labelled on its own, then
# scored with repeated stratified 90/10 splits. The scaler is fitted on the
# training rows of every split. One summary line is printed per well and
# target with the per-split averages.
#
# All intermediates use cell-local names, so the pooled `data` / `all_data`
# frames and the X, Y1..Y3, indices1..3 and ss objects of the earlier cells are
# not overwritten. StratifiedShuffleSplit, StandardScaler and label_facies come
# from the earlier cells of this notebook instead of being re-imported and
# re-defined on every loop iteration.
num_of_shuffler = 10
Y_Label = ['Log_Facies', 'Fluid', 'MixLabel']
well_feature_cols = ["DEPTH", "DT", "GR", "LLD", "NPHI", "RHOB"]
# Fixed seed so that the per-well scores are reproducible across runs.
well_split_seed = 42

for name, well_raw in all_df.items():
    # .copy() makes the cleaned table independent of the sheet frame, so the
    # column assignments below do not write into a slice of it.
    well = (
        well_raw[well_feature_cols + ["Log_Facies", "Fluid"]]
        .dropna(axis=0, how='any')
        .copy()
    )
    if well.empty:
        # No complete row in this well: nothing to split or score.
        continue

    well['Log_Facies'] = well['Log_Facies'].astype('int64')
    well['MixLabel'] = well.apply(label_facies, axis=1)

    well_X = well[well_feature_cols].values

    for label in Y_Label:
        well_y = well[label].values
        splitter = StratifiedShuffleSplit(
            n_splits=num_of_shuffler,
            test_size=0.1,
            random_state=well_split_seed,
        )

        all_total, all_correct = 0, 0
        for train_idx, test_idx in splitter.split(well_X, well_y):
            split_scaler = StandardScaler()
            X_train = split_scaler.fit_transform(well_X[train_idx])
            X_test = split_scaler.transform(well_X[test_idx])

            model = CLF()
            model.fit(X_train, well_y[train_idx])
            pred = model.predict(X_test)

            all_total += pred.shape[0]
            all_correct += int(np.sum(pred == well_y[test_idx]))

        # Average over the number of splits actually drawn, not a literal 10.
        all_total /= num_of_shuffler
        all_correct /= num_of_shuffler
        print("well: {} label: {} total: {} correct: {} correct percent {:.4f}".format(name, label, all_total, all_correct, all_correct/all_total ))

well: 1D-02 label: Log_Facies total: 118.0 correct: 104.1 correct percent 0.8822
well: 1D-02 label: Fluid total: 118.0 correct: 116.9 correct percent 0.9907
well: 1D-02 label: MixLabel total: 118.0 correct: 104.1 correct percent 0.8822
well: 1F-11 label: Log_Facies total: 45.0 correct: 40.2 correct percent 0.8933
well: 1F-11 label: Fluid total: 45.0 correct: 43.5 correct percent 0.9667
well: 1F-11 label: MixLabel total: 45.0 correct: 39.7 correct percent 0.8822
well: 1K-01 label: Log_Facies total: 103.0 correct: 79.3 correct percent 0.7699
well: 1K-01 label: Fluid total: 103.0 correct: 101.3 correct percent 0.9835
well: 1K-01 label: MixLabel total: 103.0 correct: 82.6 correct percent 0.8019
well: 1X-02 label: Log_Facies total: 110.0 correct: 94.4 correct percent 0.8582
well: 1X-02 label: Fluid total: 110.0 correct: 108.8 correct percent 0.9891
well: 1X-02 label: MixLabel total: 110.0 correct: 91.1 correct percent 0.8282
well: 98_6-8 label: Log_Facies total: 96.0 correct: 78.1 correct p