In [2]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline
import sys
sys.path.insert(1, '../scripts/')
import data_munging_tools as dmt
import utils

In [3]:
# Load the dataframes
test_df = pd.read_csv('../data/cleaned-input.test.tsv', sep='\t', low_memory=False)
train_df = pd.read_csv('../data/cleaned-input.training.tsv', sep='\t', low_memory=False)

In [4]:
blacklist_patterns = ['^recent_ipt_', '^production_', 'total_num_stages', 'bakken_isopach_ft']

In [5]:
test_df.shape

(1586, 53)

In [6]:
whitelist = ['production_liquid_180']

In [7]:
train_df.columns

Index(['FileNo', 'CountyName', 'CurrentOperator', 'CurrentWellName', 'DFElev',
       'FieldName', 'Footages', 'GRElev', 'KBElev', 'LeaseName', 'LeaseNumber',
       'OriginalOperator', 'OriginalWellName', 'ProducedPools', 'QQ', 'Range',
       'Section', 'TD', 'Township', 'WellStatus', 'WellType', 'Wellbore',
       'api', 'bakken_isopach_ft', 'bh_lat', 'bh_lng', 'choke_size', 'legs',
       'max_tvd', 'mean_tvd', 'min_tvd', 'num_pools_produced',
       'production_liquid_120', 'production_liquid_150',
       'production_liquid_180', 'production_liquid_1825',
       'production_liquid_270', 'production_liquid_30',
       'production_liquid_365', 'production_liquid_60',
       'production_liquid_730', 'production_liquid_90', 'spud_date', 'std_tvd',
       'stimulated_formation', 'surface_lat', 'surface_lng',
       'total_lbs_proppant', 'total_num_stages', 'total_volume_bbls', 'tvd',
       'type_treatment', 'well_status_date'],
      dtype='object')

In [8]:
utils.get_bad_vals_summaries(train_df, train_df.columns)

Unnamed: 0,num_missing,perc_missing,num_zero,perc_zero,num_neg,perc_neg
FileNo,0.0,0.0,0.0,0.0,0.0,0.0
CountyName,0.0,0.0,,,,
CurrentOperator,0.0,0.0,,,,
CurrentWellName,0.0,0.0,,,,
DFElev,6527.0,99.97,0.0,0.0,0.0,0.0
FieldName,0.0,0.0,,,,
Footages,0.0,0.0,,,,
GRElev,1138.0,17.43,0.0,0.0,0.0,0.0
KBElev,393.0,6.02,0.0,0.0,0.0,0.0
LeaseName,0.0,0.0,,,,


In [9]:
def munge_pipe(df, blacklist_patterns=[], exceptions={}, null_cutoff=.05):
    '''
    parameters: dataframe, blacklist patterns (as list), exceptions to blacklist patterns
        (as set)
    returns: copy of munged dataframe
    '''
    print(f"df shape before removals {df.shape}")
    
    df = (df.copy()
            .pipe(dmt.drop_blacklist, blacklist_patterns=blacklist_patterns, exceptions=exceptions)
            .pipe(dmt.drop_high_cardinality, exceptions=exceptions)
            .pipe(dmt.drop_high_nulls, exceptions=exceptions, cutoff=null_cutoff)
            )
    
    print (f"df shape after removals {df.shape}")
    return df

In [10]:
munged_df = munge_pipe(train_df, blacklist_patterns=blacklist_patterns)

df shape before removals (6529, 53)
Shape before cardinality removal: {}


AttributeError: 'NoneType' object has no attribute 'format'

In [13]:
train_df = munge_pipe(train_df, blacklist_patterns=my_blacklist_patterns, exceptions=set([TARGET_1]), null_cutoff=.18)

df shape before removals (6529, 48)
Shape before blacklist removal: (6529, 48)
Blacklisted columns: ['bakken_isopach_ft', 'production_liquid_120', 'production_liquid_150', 'production_liquid_180', 'production_liquid_1825', 'production_liquid_270', 'production_liquid_30', 'production_liquid_365', 'production_liquid_60', 'production_liquid_730', 'total_num_stages']
Number of blacklisted columns: 11
Shape after blacklist removal: (6529, 37)
**************************************************
Shape before cardinality removal: (6529, 37)
Dropped CurrentWellName since it was categorical and had a high cardinality
Dropped Footages since it was categorical and had a high cardinality
Dropped LeaseName since it was categorical and had a high cardinality
Dropped LeaseNumber since it was categorical and had a high cardinality
Dropped OriginalWellName since it was categorical and had a high cardinality
Shape after cardinality removal: (6529, 32)
**************************************************
Sha

In [14]:
test_df = munge_pipe(test_df, blacklist_patterns=my_blacklist_patterns, exceptions=set([TARGET_1]), null_cutoff=.18)

df shape before removals (1586, 48)
Shape before blacklist removal: (1586, 48)
Blacklisted columns: ['bakken_isopach_ft', 'production_liquid_120', 'production_liquid_150', 'production_liquid_180', 'production_liquid_1825', 'production_liquid_270', 'production_liquid_30', 'production_liquid_365', 'production_liquid_60', 'production_liquid_730', 'total_num_stages']
Number of blacklisted columns: 11
Shape after blacklist removal: (1586, 37)
**************************************************
Shape before cardinality removal: (1586, 37)
Dropped CurrentWellName since it was categorical and had a high cardinality
Dropped DFElev since it was empty
Dropped Footages since it was categorical and had a high cardinality
Dropped LeaseName since it was categorical and had a high cardinality
Dropped LeaseNumber since it was categorical and had a high cardinality
Dropped OriginalWellName since it was categorical and had a high cardinality
Shape after cardinality removal: (1586, 31)
********************

### Split

In [15]:
print train_df.shape

(6529, 31)


In [16]:
print test_df.shape

(1586, 31)


In [17]:
test_df[TARGET_1].isnull().sum()

28

In [18]:
train_df[TARGET_1].isnull().sum()

133

In [None]:
# Build Models

etr = ExtraTreesRegressor(n_estimators=TREE_COUNT, max_depth=MAX_DEPTH, n_jobs=-1)

etr.fit(rejoined_train_df, y_train)

### Imputation

In [109]:
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, SimpleFill, MICE, MatrixFactorization, IterativeSVD

In [110]:
#instantiate imputers:
sf_median = SimpleFill(fill_method="median")
sf_mean = SimpleFill(fill_method="mean")
knn_imputer = KNN(k=5, verbose=0)
mice_imputer = MICE(verbose=0, )
mf_imputer = MatrixFactorization(verbose=0)
soft_imputer = SoftImpute(verbose=0)
svd_imputer = IterativeSVD
nonnormed_imputers_dict = {"sf_median" : sf_median, "sf_mean" : sf_mean, "knn_imputer" : knn_imputer}
imputers_dict = {"sf_median" : sf_median, "sf_mean" : sf_mean, "knn_imputer" : knn_imputer, "mice_imputer" : mice_imputer}
all_imputers_dict = {"sf_median" : sf_median, "sf_mean" : sf_mean, "knn_imputer" : knn_imputer, "mice_imputer": mice_imputer, "mf_imputer": mf_imputer}

In [168]:
def fancy_impute_pipe(train_df, test_df, target, imputer):
    """
    Parameters: training dataframe, testing dataframe, target variable name (as a string), imputer object
    Returns: filled and binarized training dataframe, filled and binarized training dataframe
    """
    test_df = test_df.copy()
    train_df = train_df.copy()

    # Drop rows with missing target values
    test_df.dropna(subset=[target], inplace=True)
    train_df.dropna(subset=[target], inplace=True)
    test_df.reset_index(inplace=True)
    train_df.reset_index(inplace=True)

    #create flags for test and train
    flag_test_train(train_df, test_df)

    ### Split into X and y
    X_train, y_train = X_y_split(train_df, target)
    X_test, y_test = X_y_split(test_df, target)

    #Merge train and test for binarization of train and test and imputation of test
    merged_df = pd.concat([X_train, X_test])

    #split into numeric and nonnumeric
    numeric_df, nonnumeric_df = split_numerical_features(merged_df, verbose=0)
    
    
    #Binarize nonnumeric features
    binarized_df = pd.get_dummies(nonnumeric_df)

    #resplit into train and test
    numerics_train_df = numeric_df[numeric_df["flag"] == 0]
    numerics_test_df = numeric_df[numeric_df["flag"] == 1]
    binarized_train_df = binarized_df[binarized_df["flag_str_train"] == 1]
    binarized_test_df = binarized_df[binarized_df["flag_str_test"] == 1]

    #perform imputations
    filled_train_df = fancy_impute(numerics_train_df, imputer)
    filled_df = fancy_impute(numeric_df, imputer)
    

    #scaling and/or imputing creates rounding error
    filled_df["flag"] = filled_df["flag"].round(0)

    #separate imputed test set from imputed train set
    filled_test_df = filled_df[filled_df["flag"] == 1]
    
    #rejoin test and train
    binarized_train_df.reset_index(inplace=True, drop=True)
    binarized_test_df.reset_index(inplace=True, drop=True) 
    filled_train_df.reset_index(inplace=True, drop=True)
    filled_test_df.reset_index(inplace=True, drop=True)
    
    
    rejoined_train_df = filled_train_df.join(binarized_train_df)
    rejoined_test_df = filled_test_df.join(binarized_test_df)
    
    print "rejoined train", short_info(rejoined_train_df), "\n"
    
    print "rejoined test", short_info(rejoined_test_df)

    return rejoined_train_df, rejoined_test_df, y_train, y_test

In [169]:
rejoined_train_df, rejoined_test_df, y_train, y_test = fancy_impute_pipe(train_df, test_df, TARGET_1, mice_imputer)

rejoined train 
**************************************************
dataframe name: []
shape: (6396, 847)
index: RangeIndex(start=0, stop=6396, step=1)
Nulls exist: False
None 

rejoined test 
**************************************************
dataframe name: []
shape: (1558, 847)
index: RangeIndex(start=0, stop=1558, step=1)
Nulls exist: False
None


In [170]:
sum(rejoined_train_df.isnull().sum())

0

In [171]:
etr.score(rejoined_test_df, y_test)

0.60731483621468763

### GBT

In [175]:
from sklearn.ensemble import GradientBoostingRegressor

In [191]:
grid.best_params_

{'learning_rate': 0.05,
 'max_depth': 5,
 'min_samples_split': 3,
 'n_estimators': 1000,
 'subsample': 0.6}

In [199]:
grid.best_params_

{'learning_rate': 0.01,
 'max_depth': 7,
 'min_samples_split': 3,
 'n_estimators': 2000,
 'subsample': 0.7}

In [200]:
gbr = GradientBoostingRegressor(learning_rate=0.01, n_estimators=2000, subsample = .7, max_depth =7, min_samples_split= 3, random_state=1984)

In [201]:
gbr.fit(rejoined_train_df, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=7, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=3, min_weight_fraction_leaf=0.0,
             n_estimators=2000, presort='auto', random_state=1984,
             subsample=0.7, verbose=0, warm_start=False)

In [202]:
gbr.score(rejoined_test_df, y_test)

0.6266726640775433

In [196]:
gbr.score(rejoined_test_df, y_test)

0.61430816909047947

In [197]:
from sklearn.model_selection import GridSearchCV

gbr2 = GradientBoostingRegressor(random_state=1984)

params = {"learning_rate": [.001, .01, .05], "n_estimators": [1000, 2000], "max_depth": [3, 5, 7], "min_samples_split": [3, 4], "subsample": [.5, .6, .7]}

grid = GridSearchCV(estimator=gbr2,param_grid=params, n_jobs=-1)


In [198]:
grid.fit(rejoined_train_df, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=1984,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [1000, 2000], 'min_samples_split': [3, 4], 'learning_rate': [0.001, 0.01, 0.05], 'max_depth': [3, 5, 7], 'subsample': [0.5, 0.6, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

## Model Evaluation

In [None]:
etr.score(X_test, y_test)

In [None]:
mft.eval_model(etr, X_test, y_test, y_train)

In [None]:
mft.eval_model(gbr, X_test, y_test, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV

### Feature Importances

In [None]:
feature_array = np.array(X_train.columns)

In [None]:
models = [etr, gbr]

In [None]:
mft.most_important_features(etr, feature_array)

In [None]:
mft.most_important_features(gbr, feature_array)

In [None]:
#whoops, index is still in there

In [12]:
import pandas as pd
import numpy as np
import re
from fancyimpute import BiScaler, SimpleFill
import model_fitting_tools as mft





def drop_high_cardinality(df, exceptions={}, id_col=""):
    '''
    Drop cardinality == 0, cardinality == 1, cardinality == n,
    or (type='categorical and cardinality > 0.2 * n)
    '''
    print ("Shape before cardinality removal: {}").format(df.shape)
    for col in df.columns:
        if col in exceptions:
            continue
        else:
            if df[col].count() == 0:
                # drop cardinality = 0 (empty columns)
                df.drop(col, inplace=True, axis=1)
                print 'Dropped {} since it was empty'.format(col)
            elif df[col].count() == 1:
                # drop cardinality = 1
                df.drop(col, inplace=True, axis=1)
                print 'Dropped {} since it was always the same'.format(col)
            elif df[col].count() == df[col].value_counts().idxmax():
                # drop cardinality == count
                df.drop(col, inplace=True, axis=1)
                print 'Dropped {} since it was always unique'.format(col)
            elif col != id_col and df[col].dtype == 'object' and len(df[col].value_counts()) > len(df) * 0.2:
                df.drop(col, inplace=True, axis=1)
                print 'Dropped {} since it was categorical and had a high cardinality'.format(col)
    print ("Shape after cardinality removal: {}").format(df.shape)

def drop_high_nulls(df, exceptions={}, cutoff=0.5):
    print ("Shape before high null removal: {}").format(df.shape)
    for col in df.columns:
        if col in exceptions:
            continue
        else:
            prop_missing = df[col].isnull().sum() / float(df[col].shape[0])
            if prop_missing > cutoff:
                df.drop(col, inplace=True, axis=1)
                print 'Dropped {} since it had a high proportion of missing values. {}'.format(col, prop_missing)
    print ("Shape before high null removal: {}").format(df.shape)

def drop_categorical_features (df):
    print "Shape before removal: {}".format(df.shape)
    columns_removed= []
    for col in df.columns:
        if df[col].dtypes == object:
            df.drop(col, inplace=True, axis=1)
            columns_removed.append(col)
    print "Categorical olumns dropped: {}".format(columns_removed)
    print "Shape after removal: {}".format(df.shape)

def drop_nonnumeric_features (df):
    df = df.copy()
    print "Shape before removal: {}".format(df.shape)
    columns_removed= []
    for col in df.columns:
        if df[col].dtypes != float and df[col].dtypes != int:
            df.drop(col, inplace=True, axis=1)
            columns_removed.append(col)
    print "Columns dropped: {}".format(columns_removed)
    print "Shape after removal: {}".format(df.shape)
    return df

def split_numerical_features(df, verbose=1):
    numeric_cols = []
    nonnumeric_cols = []
    for col in df.columns:
        if df[col].dtypes == float or df[col].dtypes == int:
            numeric_cols.append(col)
        else:
            nonnumeric_cols.append(col)
    numeric_df = df[numeric_cols]
    nonnumeric_df = df[nonnumeric_cols]
    if verbose == 1:
        print "numeric columns: {}".format(numeric_cols)
        print "non-numeric columns: {}".format(nonnumeric_cols)
    return numeric_df, nonnumeric_df

def fancy_impute(df, imputer):
    '''
    fills numerical dataframe with fancy imputer and returns completed dataframe
    '''
    if type(imputer) != SimpleFill:

        biscaler = BiScaler(verbose=0)
    
        normed = biscaler.fit_transform(df.as_matrix())

        filled_mat = imputer.complete(normed)
        filled_mat = biscaler.inverse_transform(filled_mat)

    else:
        filled_mat = imputer.complete(df)

    filled_df = pd.DataFrame(filled_mat, columns= df.columns)

    return filled_df

def extra_fancy_impute(df, simple_imputer, fancy_imputer, important_features):
    '''
    first, fill all nulls on most features with a simple imputation method, like median().
    second, fill remaining nulls on important features with fancy imputer.
    '''
    first_pass_df = df[df.columns.difference(important_features)]
    first_pass_filled = simple_imputer.complete(first_pass_df)
    second_pass = np.concatenate((first_pass_filled, df[important_features].as_matrix()), axis=1)
    print first_pass_filled.shape, df[important_features].as_matrix().shape
    print second_pass.shape
    biscaler = BiScaler(verbose=0)
    normed = biscaler.fit_transform(second_pass)
    filled_mat = fancy_imputer.complete(normed)
    filled_mat = biscaler.inverse_transform(filled_mat)
    filled_df = pd.DataFrame(filled_mat, columns= df.columns)
    return filled_df





def flag_test_train(df_train, df_test, string_flag=True):
    '''
    #create two flags for test and train, where one flag is a string, the other is a binary
    '''
    df_train["flag"] = 0
    df_test["flag"] = 1
    if string_flag == True:
        df_train["flag_str"] = "train"
        df_test["flag_str"] = "test"
        
def X_y_split(df, target):
    '''
    params: df, target variable (as string),
    returns: df_X, df_y
    '''
    df_y = df[target]
    df_X = df.drop(target, axis=1)
    return df_X, df_y
