#### This notebook cleans the test dataset and extends its feature set

In [24]:
import pandas as pd
import numpy as np
import utils
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
# Default size (width, height in inches) for every matplotlib figure in this notebook.
plt.rcParams['figure.figsize'] = [15, 10]

In [25]:
# Load the raw test dataset.
df_test = pd.read_csv('./data/illinois_basing_test.csv')

# Rename the target and timestamp columns. The raw target header ends with a
# non-breaking space ('\xa0'), so the key must match it exactly.
df_test = df_test.rename(columns={'inj_diff\xa0': 'Target', 'SampleTimeUTC': 'Date'})
df_test['Date'] = pd.to_datetime(df_test['Date'])
df_test = df_test.sort_values('Date')

# Impute: forward-fill this one column in chronological order, then zero-fill
# whatever is still missing. `.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling.
df_test['Avg_VW1_ANPs_psi'] = df_test['Avg_VW1_ANPs_psi'].ffill()
df_test = df_test.fillna(0)

# Drop feature with many null values: every column whose name contains '6945'.
# The result is assigned back to `df_test` (not to an unrelated `df`) so the
# drop actually takes effect and the cell runs on a fresh kernel.
df_test = df_test.drop(columns=[col for col in df_test.columns if '6945' in col])

## Expand feature set

In [26]:
# Feature matrix: every column of the cleaned frame except the timestamp.
x = df_test.drop(columns=['Date'])

# Depth-difference features from the project helpers, applied in the same
# order as before (temperature first, then pressure).
x = x.pipe(utils.get_temperature_diff_depth).pipe(utils.get_pressure_diff_depth)
x['Diff pressure downhole-wellhead'] = x['Avg_CCS1_DH6325Ps_psi'].sub(x['Avg_CCS1_ANPs_psi'])

# Lag features are built from the columns present at this point, i.e. before
# the abs/trend expansion below.
cols = x.columns
lagged_df = utils.create_lag_features(x, features=cols, n_lags=5)

# TO CREATE THE MORE EXTENDED FEATURE SET:
x = utils.create_abs_features(x, x.columns)
x = utils.create_trend_features(x, x.columns)

  df['Pressure diff 6416-4917 ft'] = df['Avg_VW1_Z07D6416Ps_psi'] - df['Avg_VW1_Z11D4917Ps_psi']
  df['Pressure diff 5840-5653 ft'] = df['Avg_VW1_Z08D5840Ps_psi'] - df['Avg_VW1_Z09D5653Ps_psi']
  df['Pressure diff 5840-5482 ft'] = df['Avg_VW1_Z08D5840Ps_psi'] - df['Avg_VW1_Z0910D5482Ps_psi']
  df['Pressure diff 5840-5001 ft'] = df['Avg_VW1_Z08D5840Ps_psi'] - df['Avg_VW1_Z10D5001Ps_psi']
  df['Pressure diff 5840-4917 ft'] = df['Avg_VW1_Z08D5840Ps_psi'] - df['Avg_VW1_Z11D4917Ps_psi']
  df['Pressure diff 5653-5482 ft'] = df['Avg_VW1_Z09D5653Ps_psi'] - df['Avg_VW1_Z0910D5482Ps_psi']
  df['Pressure diff 5653-5001 ft'] = df['Avg_VW1_Z09D5653Ps_psi'] - df['Avg_VW1_Z10D5001Ps_psi']
  df['Pressure diff 5653-4917 ft'] = df['Avg_VW1_Z09D5653Ps_psi'] - df['Avg_VW1_Z11D4917Ps_psi']
  df['Pressure diff 5482-5001 ft'] = df['Avg_VW1_Z0910D5482Ps_psi'] - df['Avg_VW1_Z10D5001Ps_psi']
  df['Pressure diff 5482-4917 ft'] = df['Avg_VW1_Z0910D5482Ps_psi'] - df['Avg_VW1_Z11D4917Ps_psi']
  df['Pressure diff 50

In [27]:
# Normalise the lag features so that +/- infinity is stored as NaN.
lagged_df = lagged_df.fillna(np.nan).replace([np.inf, -np.inf], np.nan)

# Place the lag columns beside the expanded features and save without the index.
df_test = pd.concat([x, lagged_df], axis='columns')
df_test.to_csv('./data/test_df_expanded_feats.csv', index=False)