In [1]:
import numpy as np
import pandas as pd
import os
import rasterio
from GeoDS import hypercube
from GeoDS.prospectivity import hyperparameterstuning
from GeoDS import utilities
from GeoDS.supervised import mapclass
from GeoDS.prospectivity import reporting 
from GeoDS.prospectivity import featureimportance as fe
from GeoDS import eda
from GeoDS import datawrangle
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from joblib import dump, load
import glob
from dask import dataframe as dd

import optuna
from optuna import pruners
from imblearn.pipeline import Pipeline

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt

#import tensorflow as tf
#import tensorflow_data_validation as tfdv
#from tensorflow_metadata.proto.v0 import schema_pb2

#print('TFDV Version: {}'.format(tfdv.__version__))
#print('Tensorflow Version: {}'.format(tf.__version__))

plt.rcParams["figure.facecolor"] = 'white'
plt.rcParams["axes.facecolor"] = 'white'
plt.rcParams["savefig.facecolor"] = 'white'

In [2]:
crs = 'epsg:26918'
AOI = 'Inputs/AOI/shape/AOI_geol.shp'
xRes = 5
yRes = 5
pixel_size = 5

# Random seed
random_state = 42

In [3]:
trial_name = 'Baseline_Model'

reporting_folder = os.path.join(trial_name, 'reporting/')
output_folder = os.path.join(trial_name, 'outputs/')
predictions_folder = os.path.join(trial_name, 'predictions/')
    
if not os.path.exists(reporting_folder):
    os.makedirs(reporting_folder)
        
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

if not os.path.exists(predictions_folder):
    os.makedirs(predictions_folder)   

In [4]:
df_cube = dd.read_csv(os.path.join(output_folder, 'cube-*.csv'))
df_cube.head()

Unnamed: 0,x,y,STructural_interpretation_Ludo_buffers_v2_dissolve_for_ML-shp,Critical_DTM_Prelim_Mosaic_modified,All_geology,STR_F_m,New M4 all_20210503,Metamorphism,Critical_Magres_Prelim_Mosaic,Critical_FVD_Prelim_Mosaic,Critical_SVD_Prelim_Mosaic,Pegmatites_All_MERN_Derived from outcrops_20210430,DEM,CRE_trench_Li100ppm_10mbuffer_positive,CRE_rock_Li100ppm_10mbuffer_positive,CRE_DDH_Li100ppm_25mdepth_10mbuffer_positive,CRE_trench_Li5ppm_10mbuffer_negative,CRE_rock_Li5ppm_10mbuffer_negative
0,384727.5,5760787.5,,,,,,,,,,,,,,,,
1,384732.5,5760787.5,,,,,,,,,,,,,,,,
2,384737.5,5760787.5,,,,,,,,,,,,,,,,
3,384742.5,5760787.5,,,,,,,,,,,,,,,,
4,384747.5,5760787.5,,,,,,,,,,,,,,,,


In [5]:
df_cube = df_cube.compute()
df_cube.head()

Unnamed: 0,x,y,STructural_interpretation_Ludo_buffers_v2_dissolve_for_ML-shp,Critical_DTM_Prelim_Mosaic_modified,All_geology,STR_F_m,New M4 all_20210503,Metamorphism,Critical_Magres_Prelim_Mosaic,Critical_FVD_Prelim_Mosaic,Critical_SVD_Prelim_Mosaic,Pegmatites_All_MERN_Derived from outcrops_20210430,DEM,CRE_trench_Li100ppm_10mbuffer_positive,CRE_rock_Li100ppm_10mbuffer_positive,CRE_DDH_Li100ppm_25mdepth_10mbuffer_positive,CRE_trench_Li5ppm_10mbuffer_negative,CRE_rock_Li5ppm_10mbuffer_negative
0,384727.5,5760787.5,,,,,,,,,,,,,,,,
1,384732.5,5760787.5,,,,,,,,,,,,,,,,
2,384737.5,5760787.5,,,,,,,,,,,,,,,,
3,384742.5,5760787.5,,,,,,,,,,,,,,,,
4,384747.5,5760787.5,,,,,,,,,,,,,,,,


In [7]:
np.unique(df_cube['CRE_trench_Li100ppm_10mbuffer_positive'], return_counts=True)

(array([ 0.,  1., nan]), array([       28,        35, 358840435]))

In [8]:
np.unique(df_cube['CRE_rock_Li100ppm_10mbuffer_positive'], return_counts=True)

(array([ 0.,  1., nan]), array([ 58622065,      1257, 300217176]))

In [9]:
np.unique(df_cube['CRE_DDH_Li100ppm_25mdepth_10mbuffer_positive'], return_counts=True)

(array([ 0.,  1., nan]), array([    21247,       153, 358819098]))

In [10]:
np.unique(df_cube['CRE_trench_Li5ppm_10mbuffer_negative'], return_counts=True)

(array([ 0.,  1., nan]), array([        9,        21, 358840468]))

In [11]:
np.unique(df_cube['CRE_rock_Li5ppm_10mbuffer_negative'], return_counts=True)

(array([ 0.,  1., nan]), array([ 58621128,      2194, 300217176]))

# Feature Engineering

In [24]:
df_cube['CRE_trench_Li100ppm_10mbuffer_positive_'] = df_cube['CRE_trench_Li100ppm_10mbuffer_positive'].apply(lambda x: x if x in [1.] else np.nan)
np.unique(df_cube['CRE_trench_Li100ppm_10mbuffer_positive_'], return_counts=True)

(array([ 1., nan]), array([       35, 358840463]))

In [25]:
df_cube['CRE_rock_Li100ppm_10mbuffer_positive_'] = df_cube['CRE_rock_Li100ppm_10mbuffer_positive'].apply(lambda x: x if x in [1.] else np.nan)
np.unique(df_cube['CRE_rock_Li100ppm_10mbuffer_positive_'], return_counts=True)

(array([ 1., nan]), array([     1257, 358839241]))

In [26]:
df_cube['CRE_DDH_Li100ppm_25mdepth_10mbuffer_positive_'] = df_cube['CRE_DDH_Li100ppm_25mdepth_10mbuffer_positive'].apply(lambda x: x if x in [1.] else np.nan)
np.unique(df_cube['CRE_DDH_Li100ppm_25mdepth_10mbuffer_positive_'], return_counts=True)

(array([ 1., nan]), array([      153, 358840345]))

In [27]:
df_cube['CRE_trench_Li5ppm_10mbuffer_negative_'] = df_cube['CRE_trench_Li5ppm_10mbuffer_negative'].apply(lambda x: x if x in [0.] else np.nan)
np.unique(df_cube['CRE_trench_Li5ppm_10mbuffer_negative_'], return_counts=True)

(array([ 0., nan]), array([        9, 358840489]))

In [49]:
df_cube['CRE_rock_Li5ppm_10mbuffer_negative_'] = df_cube['CRE_rock_Li5ppm_10mbuffer_negative'].apply(lambda x: x if x in [0.] else np.nan)
np.unique(df_cube['CRE_rock_Li5ppm_10mbuffer_negative_'], return_counts=True)

(array([ 0., nan]), array([ 58621128, 300219370]))

In [51]:
target = 'CRE_rock_Li5ppm_10mbuffer_negative_'
pos = 35 + 1257 + 153
neg = 9
tot_num = pos - neg

indices = df_cube.loc[df_cube[target] == 0].sample(n=tot_num, random_state=42)[target].index

In [58]:
indices = indices.to_list()

In [73]:
df_cube[target].apply(lambda x: x if df_cube[target].index.any() in indices else np.nan)

KeyboardInterrupt: 

In [75]:
df_cube[target].index.any() in indices

False

In [77]:
df_cube[target].index.to_list()

KeyboardInterrupt: 

In [71]:
df_cube[target].index

Int64Index([      0,       1,       2,       3,       4,       5,       6,
                  7,       8,       9,
            ...
            1759835, 1759836, 1759837, 1759838, 1759839, 1759840, 1759841,
            1759842, 1759843, 1759844],
           dtype='int64', length=358840498)

In [62]:
df_cube[target] = df_cube.loc[df_cube[target] == 0].sample(n=tot_num, random_state=42)[target]

KeyboardInterrupt: 

In [None]:
np.unique(df_cube['CRE_rock_Li5ppm_10mbuffer_negative_'], return_counts=True)

In [None]:
# use a mask
mask = [True if x else False for x in df_cube.index if x not in blacklist]
df.loc[mask]

In [59]:
df_cube.loc[~indices, target] = np.nan
np.unique(df_cube['CRE_rock_Li5ppm_10mbuffer_negative_'], return_counts=True)

TypeError: bad operand type for unary ~: 'list'

In [47]:
np.unique(df_cube['CRE_rock_Li5ppm_10mbuffer_negative_'], return_counts=True)

(array([ 0., nan]), array([ 58905004, 299935494]))

KeyboardInterrupt: 

In [None]:
df_cube['combined_pos'] = df_cube['CRE_rock_Li100ppm_10mbuffer_positive_'].combine_first(df_cube['CRE_DDH_Li100ppm_25mdepth_10mbuffer_positive_'])
np.unique(df_cube['combined_pos'], return_counts=True)