In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statistics
import statsmodels.api as sm
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel


plt.rcParams['figure.figsize'] = (16, 12)
plt.rcParams['font.size'] = 14
plt.style.use("fivethirtyeight")

%matplotlib inline

In [2]:
# Load the cleaned CSV and the cleaned-and-scaled CSV, in that order.
# In both files the first column is used as the DataFrame index.
logs, logs_scaled = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('Data_Cleaned.csv', 'Scaled_Data.csv')
)

In [3]:
# Drop every row that contains a missing value, then store Cluster as strings.
logs = logs.dropna()
logs['Cluster'] = logs['Cluster'].astype(str)

# A bare expression in the middle of a cell is never displayed, so the shape
# is printed explicitly; the dtypes remain the cell's displayed result.
print('Shape after dropna:', logs.shape)
logs.dtypes

Observation_Num             int64
UWI_PROXY                  object
depth                     float64
curve1                    float64
curve2                    float64
curve3                    float64
curve4                    float64
curve5                    float64
target_log                float64
curve1_diff               float64
curve3_diff               float64
curve4_diff               float64
curve5_diff               float64
Target_diff               float64
curve1_diff_2nd           float64
curve3_diff_2nd           float64
curve4_diff_2nd           float64
curve5_diff_2nd           float64
Target_diff_2nd           float64
Roll_curve1_diff_2nd      float64
Roll_curve3_diff_2nd      float64
Roll_curve4_diff_2nd      float64
Roll_curve5_diff_2nd      float64
Roll_Target_diff_2nd      float64
Roll20_curve1_diff_2nd    float64
Roll20_curve1             float64
Ln_curve3                 float64
Cluster                    object
cluster_0                 float64
cluster_1     

In [4]:
logs.head()

Unnamed: 0,Observation_Num,UWI_PROXY,depth,curve1,curve2,curve3,curve4,curve5,target_log,curve1_diff,...,Roll_Target_diff_2nd,Roll20_curve1_diff_2nd,Roll20_curve1,Ln_curve3,Cluster,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4
21,22,uwi1,7510.5,71.005,0.002,1.855,0.352,2.616,50.0,0.451,...,133.373017,0.999059,5.79563,0.617885,4.0,0.0,0.0,0.0,0.0,1.0
22,23,uwi1,7511.0,70.049,0.005,1.844,0.355,2.578,80.958,0.956,...,753.41764,0.969979,5.733276,0.611937,4.0,0.0,0.0,0.0,0.0,1.0
23,24,uwi1,7511.5,67.642,0.024,1.844,0.358,2.566,93.694,2.407,...,646.029264,0.885825,5.606104,0.611937,4.0,0.0,0.0,0.0,0.0,1.0
24,25,uwi1,7512.0,66.297,0.0,1.843,0.361,2.574,85.435,1.345,...,504.704977,0.824823,5.272441,0.611395,4.0,0.0,0.0,0.0,0.0,1.0
25,26,uwi1,7512.5,66.074,0.001,1.856,0.364,2.582,69.069,0.223,...,469.930625,0.816285,5.031,0.618424,4.0,0.0,0.0,0.0,0.0,1.0


In [21]:
# Assign the independent variables (X) and the dependent variable (y).
# Every column listed here is removed from the predictors: row/well
# identifiers, depth, the target and the Target_* columns, the
# cluster_0..cluster_4 indicator columns, and raw curve3 (Ln_curve3 is kept).
feature_col = logs.columns.drop([
    'Observation_Num',
    'UWI_PROXY',
    'depth',
    'target_log',
    'Roll_Target_diff_2nd',
    'Target_diff',
    'Target_diff_2nd',
    'cluster_0',
    'cluster_1',
    'cluster_2',
    'cluster_3',
    'cluster_4',
    'curve3',
])
X = logs[feature_col]
y = logs['target_log']
# list(X)  # uncomment to inspect the selected feature names

['curve1',
 'curve2',
 'curve4',
 'curve5',
 'curve1_diff',
 'curve3_diff',
 'curve4_diff',
 'curve5_diff',
 'curve1_diff_2nd',
 'curve3_diff_2nd',
 'curve4_diff_2nd',
 'curve5_diff_2nd',
 'Roll_curve1_diff_2nd',
 'Roll_curve3_diff_2nd',
 'Roll_curve4_diff_2nd',
 'Roll_curve5_diff_2nd',
 'Roll20_curve1_diff_2nd',
 'Roll20_curve1',
 'Ln_curve3',
 'Cluster']

In [23]:
# Hold out a test set using train_test_split's default proportions;
# the fixed random_state keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=123)

In [24]:
# Fit a 200-tree random forest on the training split.
# NOTE(review): the recorded output for this cell shows max_features=20, so
# it was last run with a different setting than the one written here.
rfreg = RandomForestRegressor(
    n_estimators=200,
    max_features=5,
    oob_score=True,
    random_state=1,
)
rfreg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=20, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=None, oob_score=True, random_state=1,
           verbose=0, warm_start=False)

In [25]:
# Compute feature importances, most important first.
a = (
    pd.DataFrame({'features': feature_col, 'importance': rfreg.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
# The assignment above displays nothing on its own; end the cell with an
# expression so the ten most important features are actually rendered.
a.head(10)

Unnamed: 0,features,importance
18,Ln_curve3,0.602473
19,Cluster,0.241359
0,curve1,0.040408
2,curve4,0.032086
3,curve5,0.020958
17,Roll20_curve1,0.008143
16,Roll20_curve1_diff_2nd,0.006511
13,Roll_curve3_diff_2nd,0.006004
1,curve2,0.005854
15,Roll_curve5_diff_2nd,0.005231


In [None]:
# Find the average cross-validated RMSE of the forest configured above.
# cross_val_score has no max_features argument (that is an estimator
# hyper-parameter, already set on rfreg), so only CV settings are passed.
# The full data set with 10 folds is used, matching the RMSE check done for
# the hand-selected features, instead of cross-validating on the small
# hold-out split.
scores = cross_val_score(rfreg, X, y, cv=10, scoring='neg_mean_squared_error')
# Scores are negated MSE values: flip the sign, take the root per fold, average.
print(np.mean(np.sqrt(-scores)))

In [9]:
# Re-assign the independent variables to a hand-picked subset chosen from
# the importance ranking, then redo the train/test split.
feature_col = [
    'curve1',
    'Ln_curve3',
    'curve4',
    'curve5',
    'Roll20_curve1',
    'Cluster',
    'Roll20_curve1_diff_2nd',
]
X = logs[feature_col]
y = logs['target_log']
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=123)

In [10]:
# Fit the regression forest on the hand-selected features only.
rfreg = RandomForestRegressor(
    n_estimators=200,
    max_features=5,
    oob_score=True,
    random_state=1,
)
rfreg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=None, oob_score=True, random_state=1,
           verbose=0, warm_start=False)

In [11]:
# Rank the hand-selected features by their importance in the fitted forest.
(
    pd.DataFrame({'features': feature_col, 'importance': rfreg.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,features,importance
1,Ln_curve3,0.490492
5,Cluster,0.189334
2,curve4,0.157818
0,curve1,0.098897
3,curve5,0.029115
4,Roll20_curve1,0.017207
6,Roll20_curve1_diff_2nd,0.017138


In [13]:
# 10-fold cross-validated RMSE for the forest restricted to the
# hand-selected features. The scorer returns negated MSE per fold, so the
# sign is flipped before taking the root; the per-fold RMSEs are averaged.
scores = cross_val_score(
    rfreg,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)
print('RMSE:', np.sqrt(-scores).mean())

RMSE: 5.1152709872983415


In [None]:
# Search over n_estimators (50, 100, ..., 500) for the currently selected features.
estimator_range = list(range(50, 501, 50))
RMSE_scores = []

# One five-fold cross-validation per candidate forest size (Warning: Slow!).
for n in estimator_range:
    rfreg = RandomForestRegressor(n_estimators=n, random_state=1)
    MSE_scores = cross_val_score(
        rfreg,
        X,
        y,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    # Negated MSE per fold -> RMSE per fold -> mean RMSE for this forest size.
    RMSE_scores.append(np.sqrt(-MSE_scores).mean())

# Mean RMSE (y-axis) against forest size (x-axis).
plt.plot(estimator_range, RMSE_scores);
plt.xlabel('n_estimators');
plt.ylabel('RMSE (lower is better)');

In [None]:
# Optimize max_features (the number of features considered at each split).
#
# n_estimators is taken from the preceding n_estimators search: the entry of
# estimator_range with the lowest cross-validated RMSE. It has to be read
# BEFORE RMSE_scores is reset below. The assertion catches the case where
# this cell is re-run on its own and RMSE_scores no longer holds the
# n_estimators results.
assert len(RMSE_scores) == len(estimator_range), \
    'Run the n_estimators search cell immediately before this one.'
best_n_estimators = estimator_range[int(np.argmin(RMSE_scores))]

feature_range = list(range(1, 7))
RMSE_scores = []

# Use 10-fold cross-validation with each value of max_features (Warning: Super slow!).
for f in feature_range:
    rfreg = RandomForestRegressor(n_estimators=best_n_estimators, max_features=f, random_state=1)
    MSE_scores = cross_val_score(rfreg, X, y, cv=10, scoring='neg_mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

In [None]:
# Create the final forest with the optimal n_estimators and max_features for
# the selected features.
# The max_features search leaves `rfreg` configured with the n_estimators it
# used, and leaves RMSE_scores holding one cross-validated RMSE per entry of
# feature_range; both are read here instead of being typed in by hand.
best_n_estimators = rfreg.n_estimators
best_max_features = feature_range[int(np.argmin(RMSE_scores))]

rfreg = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_features=best_max_features,
    oob_score=True,
    random_state=1,
)
rfreg.fit(X_train, y_train)

# Compute feature importances, most important first.
pd.DataFrame({'features':feature_col, 'importance':rfreg.feature_importances_}).sort_values(by='importance',ascending=False)


In [None]:
#Decide if I want to use mean or median to proceed
#print(SelectFromModel(rfreg, threshold='mean', prefit=True).transform(X_train).shape)
#print(SelectFromModel(rfreg, threshold='median', prefit=True).transform(X_train).shape)

In [None]:
#Create new data frame with only those variables that exceed the significance threshold
#X_important =  SelectFromModel(rfreg, threshold='mean', prefit=True).transform(X_test)