<a href="https://colab.research.google.com/github/maxmatical/Machine-Learning/blob/master/Yamana_Case_v3_Max.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Set up environment and download course-v3
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai
!pip install fastprogress
!pip install pathlib

!curl https://course-v3.fast.ai/setup/colab | bash

In [0]:
from fastai import *
from fastai.tabular import *

import pandas as pd
import matplotlib as plt
import numpy as np

In [0]:
path = '/content/data/'
os.makedirs(path, exist_ok=True)
fname = 'Base de Datos Geometalurgia Goldspot'
#path

In [0]:
df = pd.read_csv('/content/data/Base de Datos Geometalurgia Goldspot.csv', low_memory=False)

In [0]:
df.head(5)

Unnamed: 0,compid,dhid,midx,midy,midz,topx,topy,topz,botx,boty,...,Te,Th,Ti,Tl,U,V,W,Y,Zn,Zr
0,CAN,OF07482,449967.09,7301211.35,1819.01,449967.61,7301211.4,1819.01,449966.56,7301211.31,...,-99,-99,-99,-99,-99,-99.0,-99,-99,-99,-99.0
1,CAN,OF07491,449950.7,7301374.94,1836.81,449949.32,7301375.18,1836.81,449952.08,7301374.7,...,-99,-99,-99,-99,-99,-99.0,-99,-99,-99,-99.0
2,CAN,OF07505,449950.06,7301418.56,1836.86,449949.61,7301418.52,1836.86,449950.5,7301418.6,...,-99,-99,-99,-99,-99,-99.0,-99,-99,-99,-99.0
3,CAN,OF08219,449896.24,7301571.17,1677.28,449895.2,7301571.08,1677.28,449897.29,7301571.26,...,-99,-99,-99,-99,-99,-99.0,-99,-99,-99,-99.0
4,CAN,OF08222,449906.02,7301433.32,1601.0,449905.37,7301433.32,1601.0,449906.67,7301433.32,...,-99,-99,24,-99,-99,-99.0,3,-99,1000,-99.0


### Preprocessing
1. remove any instances of ">", "<", and "," from the data
2. Impute (or remove) any instances of -99 for 'agrec' or 'nacnc'
    - if imputing: use median value 
3. Change any instance of -99 in features to be np.nan (to use FillMissing)
4. use FillMissing to change NaN to -99, and add feature for var_na (bool)
5. Perform analysis on features to see if any features (other than top/mid/bot) are correlated, remove those variables from the modelling process
    - https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
6. Split features into 3 dataframes: 
    - Just geo data
    - Just geochemical data
    - both
7. Model using features, see what kind of results we get with subset of the features
8. Perform any feature importance analysis as needed

In [0]:
# stripping all str columns of > and < symbols
for column in df.columns:
    if df[column].dtype == 'O' and column != 'compid' and column != 'dhid':
        df[column] = df[column].map(lambda x: x.lstrip('>,<'))
        df[column] = df[column].str.replace(',','.')
        df[column] = df[column].astype(float)



In [0]:
df['nacnc'] = df['nacnc'].replace(-99,df['nacnc'].median(), inplace = False)

df['agrec'] = df['agrec'].replace(-99,df['agrec'].median(), inplace = False)

In [0]:
df = df.replace(-99, np.nan, inplace=False)


In [0]:
cat_names = ['geocod', 'bound']
cont_names = ['midx', 'midy', 'midz', 'topx', 'topy', 'topz', 'botx', 'boty', 'botz',
             'length', 'from', 'to','auppm', 'agppm', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Ca',
             'Cd', 'Co', 'Cr', 'Cu', 'Fe', 'Hg', 'K', 'La', 'Li', 'Mg', 'Mn', 'Mo', 'Na',
             'Nb', 'Ni', 'P', 'Pb', 'Pd', 'Re', 'S', 'Sb', 'Se', 'Si', 'Sc', 'Sn', 'Sr',
             'Ta', 'Te', 'Th', 'Ti', 'Tl', 'U', 'V', 'W', 'Y', 'Zn', 'Zr']

In [0]:
fillmissing = FillMissing(cat_names = cat_names, cont_names = cont_names, fill_strategy='CONSTANT', fill_val = -99.0)


In [0]:
fillmissing(df)

In [0]:
df.head(10)

Unnamed: 0,compid,dhid,midx,midy,midz,topx,topy,topz,botx,boty,...,Te_na,Th_na,Ti_na,Tl_na,U_na,V_na,W_na,Y_na,Zn_na,Zr_na
0,CAN,OF07482,449967.09,7301211.35,1819.01,449967.61,7301211.4,1819.01,449966.56,7301211.31,...,True,True,True,True,True,True,True,True,True,True
1,CAN,OF07491,449950.7,7301374.94,1836.81,449949.32,7301375.18,1836.81,449952.08,7301374.7,...,True,True,True,True,True,True,True,True,True,True
2,CAN,OF07505,449950.06,7301418.56,1836.86,449949.61,7301418.52,1836.86,449950.5,7301418.6,...,True,True,True,True,True,True,True,True,True,True
3,CAN,OF08219,449896.24,7301571.17,1677.28,449895.2,7301571.08,1677.28,449897.29,7301571.26,...,True,True,True,True,True,True,True,True,True,True
4,CAN,OF08222,449906.02,7301433.32,1601.0,449905.37,7301433.32,1601.0,449906.67,7301433.32,...,True,True,False,True,True,True,False,True,False,True
5,CAN,OF08223,449940.31,7301543.05,1773.45,449940.01,7301543.08,1773.45,449940.61,7301543.03,...,True,True,True,True,True,True,True,True,True,True
6,CAN,OF08264,449922.79,7301652.09,1854.98,449921.08,7301651.73,1854.98,449924.5,7301652.46,...,True,True,False,True,True,True,False,True,False,True
7,CAN,OF08306,449933.71,7301556.51,1773.12,449933.33,7301556.19,1773.12,449934.1,7301556.83,...,True,True,False,True,True,True,False,True,False,True
8,CAN,OF08389,449872.89,7301556.36,1600.87,449871.73,7301556.05,1600.87,449874.05,7301556.67,...,True,True,False,True,True,True,False,True,False,True
9,CAN,OF09245,449873.89,7301509.13,1547.19,449873.25,7301508.99,1547.19,449874.52,7301509.26,...,True,True,True,True,True,True,True,True,True,True


In [0]:
cat_names = ['geocod', 'bound',
            'Al_na', 'As_na', 'Au_na', 'B_na', 'Ba_na', 'Be_na', 'Bi_na', 'Ca_na',
             'Cd_na', 'Co_na', 'Cr_na', 'Cu_na', 'Fe_na', 'Hg_na', 'K_na', 'La_na', 'Li_na', 'Mg_na',
             'Mn_na', 'Mo_na', 'Na_na',
             'Nb_na', 'Ni_na', 'P_na', 'Pb_na', 'Pd_na', 'Re_na', 'S_na', 'Sb_na', 'Se_na',
             'Si_na', 'Sc_na', 'Sn_na', 'Sr_na',
             'Ta_na', 'Te_na', 'Th_na', 'Ti_na', 'Tl_na', 'U_na', 'V_na', 'W_na', 'Y_na', 'Zn_na', 'Zr_na']

cont_names= ['midx', 'midy', 'midz', 'topx', 'topy', 'topz', 'botx', 'boty', 'botz',
             'length', 'from', 'to','auppm', 'agppm', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Ca',
             'Cd', 'Co', 'Cr', 'Cu', 'Fe', 'Hg', 'K', 'La', 'Li', 'Mg', 'Mn', 'Mo', 'Na',
             'Nb', 'Ni', 'P', 'Pb', 'Pd', 'Re', 'S', 'Sb', 'Se', 'Si', 'Sc', 'Sn', 'Sr',
             'Ta', 'Te', 'Th', 'Ti', 'Tl', 'U', 'V', 'W', 'Y', 'Zn', 'Zr']

Finding correlated features

In [0]:
df_orig_features = df[['geocod', 'bound', 'midx', 'midy', 'midz', 'topx', 'topy', 'topz', 'botx', 'boty', 'botz',
             'length', 'from', 'to','auppm', 'agppm', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Ca',
             'Cd', 'Co', 'Cr', 'Cu', 'Fe', 'Hg', 'K', 'La', 'Li', 'Mg', 'Mn', 'Mo', 'Na',
             'Nb', 'Ni', 'P', 'Pb', 'Pd', 'Re', 'S', 'Sb', 'Se', 'Si', 'Sc', 'Sn', 'Sr',
             'Ta', 'Te', 'Th', 'Ti', 'Tl', 'U', 'V', 'W', 'Y', 'Zn', 'Zr']]

In [0]:
corr_matrix = df_orig_features.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))



In [0]:
print(upper)

In [0]:
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

print(to_drop)


['bound', 'topx', 'topy', 'topz', 'botx', 'boty', 'botz', 'Au', 'K', 'Pd', 'Re', 'Sn']


['bound', 'topx', 'topy', 'topz', 'botx', 'boty', 'botz', 'Au', 'K', 'Pd', 'Re', 'Sn'] are all highly correlated with other features in the dataset


### Questions
- should we drop highly correlated chemical information?(Au, K, etc.)
- what sets of features does auppm and agppm fit in?
- can we include [vsed, tfil, dens] as features (i.e. can they be easily obtained and used to predict our target variables?)

In [0]:
df_geo = df[['geocod','midx', 'midy', 'midz', 'length', 'from', 'to']]

df_chem = df[['auppm', 'agppm', 
             'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Ca',
             'Cd', 'Co', 'Cr', 'Cu', 'Fe', 'Hg', 'K', 'La', 'Li', 'Mg',
             'Mn', 'Mo', 'Na',
             'Nb', 'Ni', 'P', 'Pb', 'Pd', 'Re', 'S', 'Sb', 'Se',
             'Si', 'Sc', 'Sn', 'Sr',
             'Ta', 'Te', 'Th', 'Ti', 'Tl', 'U', 'V', 'W', 'Y', 'Zn', 'Zr', 
             'Al_na', 'As_na', 'Au_na', 'B_na', 'Ba_na', 'Be_na', 'Bi_na', 'Ca_na',
             'Cd_na', 'Co_na', 'Cr_na', 'Cu_na', 'Fe_na', 'Hg_na', 'K_na', 'La_na', 'Li_na', 'Mg_na',
             'Mn_na', 'Mo_na', 'Na_na',
             'Nb_na', 'Ni_na', 'P_na', 'Pb_na', 'Pd_na', 'Re_na', 'S_na', 'Sb_na', 'Se_na',
             'Si_na', 'Sc_na', 'Sn_na', 'Sr_na',
             'Ta_na', 'Te_na', 'Th_na', 'Ti_na', 'Tl_na', 'U_na', 'V_na', 'W_na', 'Y_na', 'Zn_na', 'Zr_na']]

df_all = df[['geocod','midx', 'midy', 'midz', 'length', 'from', 'to',
             'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Ca',
             'Cd', 'Co', 'Cr', 'Cu', 'Fe', 'Hg', 'K', 'La', 'Li', 'Mg',
             'Mn', 'Mo', 'Na',
             'Nb', 'Ni', 'P', 'Pb', 'Pd', 'Re', 'S', 'Sb', 'Se',
             'Si', 'Sc', 'Sn', 'Sr',
             'Ta', 'Te', 'Th', 'Ti', 'Tl', 'U', 'V', 'W', 'Y', 'Zn', 'Zr', 
             'Al_na', 'As_na', 'Au_na', 'B_na', 'Ba_na', 'Be_na', 'Bi_na', 'Ca_na',
             'Cd_na', 'Co_na', 'Cr_na', 'Cu_na', 'Fe_na', 'Hg_na', 'K_na', 'La_na', 'Li_na', 'Mg_na',
             'Mn_na', 'Mo_na', 'Na_na',
             'Nb_na', 'Ni_na', 'P_na', 'Pb_na', 'Pd_na', 'Re_na', 'S_na', 'Sb_na', 'Se_na',
             'Si_na', 'Sc_na', 'Sn_na', 'Sr_na',
             'Ta_na', 'Te_na', 'Th_na', 'Ti_na', 'Tl_na', 'U_na', 'V_na', 'W_na', 'Y_na', 'Zn_na', 'Zr_na']]

y_au = df['aurec']
y_ag = df['agrec']
y_nac = df['nacnc']
y_codt = df['codt']

In [0]:
# defining function to evaluate exp_rmspe
def inv_y(a): return np.exp(a)

def exp_rmspe(y_pred, targ):
    targ = inv_y(targ)
    pct_var = (targ - inv_y(y_pred))/targ
    return math.sqrt((pct_var**2).mean())

## Predicting codt class

### just geo info

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_geo, y_codt, test_size = 0.2)

In [0]:
m = RandomForestClassifier(n_estimators=100, max_features=0.99, min_samples_leaf=2,
                          n_jobs=-1, oob_score=True)

m.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.99, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [0]:
m.score(x_test, y_test)

0.9705882352941176

In [0]:
from sklearn.metrics import confusion_matrix

confusion_matrix(m.predict(x_test), y_test)

array([[ 97,   0,   1,   1],
       [  0,   1,   0,   0],
       [  2,   0,  33,   0],
       [  1,   3,   0, 133]])

In [0]:
from sklearn.metrics import classification_report

print(classification_report(m.predict(x_test), y_test))

              precision    recall  f1-score   support

           1       0.97      0.98      0.97        99
           2       0.25      1.00      0.40         1
           3       0.97      0.94      0.96        35
           4       0.99      0.97      0.98       137

   micro avg       0.97      0.97      0.97       272
   macro avg       0.80      0.97      0.83       272
weighted avg       0.98      0.97      0.97       272



### just chem info

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_chem, y_codt, test_size = 0.2)

In [0]:
m = RandomForestClassifier(n_estimators=100, max_features=0.99, min_samples_leaf=2,
                          n_jobs=-1, oob_score=True)

m.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.99, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [0]:
m.score(x_test, y_test)

0.6727941176470589

In [0]:
from sklearn.metrics import confusion_matrix

confusion_matrix(m.predict(x_test), y_test)

array([[73,  1, 14, 37],
       [ 0,  0,  0,  1],
       [ 4,  0, 15,  1],
       [21,  2,  8, 95]])

In [0]:
from sklearn.metrics import classification_report

print(classification_report(m.predict(x_test), y_test))

              precision    recall  f1-score   support

           1       0.74      0.58      0.65       125
           2       0.00      0.00      0.00         1
           3       0.41      0.75      0.53        20
           4       0.71      0.75      0.73       126

   micro avg       0.67      0.67      0.67       272
   macro avg       0.46      0.52      0.48       272
weighted avg       0.70      0.67      0.68       272



### All info

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_all, y_codt, test_size = 0.2)

In [0]:
m = RandomForestClassifier(n_estimators=100, max_features=0.99, min_samples_leaf=2,
                          n_jobs=-1, oob_score=True)

m.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.99, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [0]:
m.score(x_test, y_test)

0.9742647058823529

In [0]:
from sklearn.metrics import confusion_matrix

confusion_matrix(m.predict(x_test), y_test)

array([[109,   0,   2,   1],
       [  0,   0,   0,   0],
       [  1,   0,  30,   0],
       [  1,   2,   0, 126]])

In [0]:
from sklearn.metrics import classification_report

print(classification_report(m.predict(x_test), y_test))

              precision    recall  f1-score   support

           1       0.98      0.97      0.98       112
           2       0.00      0.00      0.00         0
           3       0.94      0.97      0.95        31
           4       0.99      0.98      0.98       129

   micro avg       0.97      0.97      0.97       272
   macro avg       0.73      0.73      0.73       272
weighted avg       0.98      0.97      0.98       272



  'recall', 'true', average, warn_for)


# New Section