In [134]:
import os

import numpy as np
import pandas as pd

import main
las_data = pd.read_csv(main.csv_out_file, delimiter=';')

# Drop every well that has no water-saturation ('kvo') data at all.
# The mean of an all-NaN group is NaN, so a NaN per-well mean marks such a well.
kvo_mean_by_well = las_data.groupby('well_name')['kvo'].mean()
wells_without_kvo = kvo_mean_by_well.index[kvo_mean_by_well.isnull()]
# Remove the ROWS of those wells; .copy() keeps later column assignments
# on las_data from operating on a slice of the original frame.
las_data = las_data[~las_data['well_name'].isin(wells_without_kvo)].copy()

In [139]:
def calc_mean_value_by_h(h_arr, val_arr):
    """Return the mean of ``val_arr`` weighted by ``h_arr``.

    The result is ``sum(h * val) / sum(h)`` over the accepted samples.
    A sample is rejected when either its weight or its value is NaN, and
    the first valid sample that follows a rejected one is skipped as well.

    Parameters
    ----------
    h_arr : iterable of float
        Weight of each sample.
    val_arr : iterable of float
        Value of each sample; paired with ``h_arr`` element by element.

    Returns
    -------
    float
        The weighted mean, or NaN when the accumulated weight is zero
        (empty input, nothing but NaN samples, or weights that sum to 0).
    """
    hv_sum = 0.
    h_sum = 0.
    skip_next = False
    for h, val in zip(h_arr, val_arr):
        if np.isnan(h) or np.isnan(val):
            # Missing sample: ignore it and the next valid sample too.
            skip_next = True
            continue
        if skip_next:
            # First valid sample after a gap only re-arms accumulation.
            skip_next = False
            continue
        h_sum += h
        hv_sum += h * val
    if h_sum == 0:
        # No usable weight: avoid ZeroDivisionError and report "no data".
        return float('nan')
    return hv_sum / h_sum

In [140]:
# Oil saturation is the complement of water saturation.
las_data['kno'] = 1 - las_data['kvo']

# Build one row per well holding the weighted mean oil saturation returned
# by calc_mean_value_by_h. Wells are listed in value_counts() order, i.e. by
# descending number of rows.
# NOTE(review): the weights passed below are the raw 'DEPT' values; confirm
# that this column carries the intended weight for the average.
well_name_list = las_data['well_name'].value_counts().index.tolist()
kno_list = []
for well_name in well_name_list:
    # Rows that belong to the current well only.
    data_well = las_data[las_data['well_name'] == well_name]
    kno_list.append(calc_mean_value_by_h(data_well['DEPT'], data_well['kno']))

data_well_for_analize = pd.DataFrame({'well_name': well_name_list, 'kno': kno_list})
data_well_for_analize.head()

Unnamed: 0,kno,well_name
0,0.186791,4R
1,0.234957,643PL
2,0.256349,3WZ
3,0.260787,4WZ
4,0.198295,554


In [141]:
# Attach the initial water cut ('wc') of every well.
# os.path.join keeps the path valid on any OS (the previous "\\" separator
# only worked on Windows).
water_content_data = pd.read_csv(os.path.join(main.data_dir, "res.csv"), delimiter=';')
# Name of the well-identifier column as configured in the project module.
well_key = main.keys_dict[main.kid_well]
water_content_data = (
    water_content_data
    .rename(columns={'Скважина': well_key, ' Начальная обводненность ': 'wc'})
    .drop('Дата запуска', axis=1)  # the start-up date column is not used
)
# pd.merge defaults to an inner join: wells missing from either table are dropped.
data_for_analize = pd.merge(data_well_for_analize, water_content_data, on=well_key)
data_for_analize.head()

Unnamed: 0,kno,well_name,wc
0,0.186791,4R,0.52
1,0.198295,554,0.08
2,0.167707,519,0.16
3,0.087187,529,0.37
4,0.133083,536,0.26


In [142]:
# Separate the target (y) and the well labels from the feature table:
# pop() returns the column and removes it from the frame in place.
y = data_for_analize.pop('wc')
well_names = data_for_analize.pop('well_name')
# Bucket the target into integer tenths (e.g. 0.52 -> 5) to use as class labels.
y_r = [int(tenths) for tenths in 10 * y.round(1)]

In [143]:
from sklearn.metrics import accuracy_score
# cross_val_score is imported from model_selection: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Keep 30% of the wells as a hold-out set; the fixed seed makes the split repeatable.
x_train, x_hold, y_train, y_hold = train_test_split(
    data_for_analize.values, y_r, test_size=0.3, random_state=17
)



In [144]:
# Baseline: a single decision tree fitted on the training split.
tree = DecisionTreeClassifier(max_depth=20, random_state=99).fit(x_train, y_train)
# Predicted classes for the hold-out split and their accuracy.
tree_pred = tree.predict(x_hold)
accuracy_score(y_true=y_hold, y_pred=tree_pred)

0.25490196078431371

In [148]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Number of feature columns left in the table.
count_vals = data_for_analize.shape[1]

# Candidate tree depths 1..29, each evaluated with 5-fold cross-validation
# on the training split, using all available CPU cores.
tree_params = {'max_depth': range(1, 30)}
tree_grid = GridSearchCV(
    estimator=tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
tree_grid.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 29 candidates, totalling 145 fits




[Parallel(n_jobs=-1)]: Done 145 out of 145 | elapsed:    1.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=99, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': range(1, 30)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=True)

In [149]:
# Parameter setting that gave the best mean cross-validated score in the search.
tree_grid.best_params_

{'max_depth': 1}

In [150]:
# Mean cross-validated score obtained with the best parameter setting.
tree_grid.best_score_

0.24576271186440679

In [151]:
# Accuracy on the hold-out split; with refit=True the search object predicts
# with the estimator refitted on the training data using the best parameters.
accuracy_score(y_hold, tree_grid.predict(x_hold))

0.27450980392156865