In [13]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
{Calcination Analysis - Notebook 1
Predictive modeling of reactivity based on geochemical and geomechanical data.}

{INTERNAL USE ONLY}
"""

# module metadata (placed directly below the module docstring, ahead of the
# import statements)
__author__ = '{Malte Schade}'
__copyright__ = 'Copyright {2022}, {Calcination Analysis - Notebook 1}'
__version__ = '{1}.{0}.{0}'
__maintainer__ = '{Malte Schade}'
__email__ = '{contact@malteschade.com}'
__status__ = '{PROTOTYPE}'

# built-in modules
import collections.abc
import os

# Compatibility shim: re-expose the abstract base classes under their old
# `collections.<Name>` aliases (removed from the standard library in
# Python 3.10). It is applied BEFORE the third-party imports so that a package
# which looks these names up while being imported also finds them.
# NOTE(review): presumably required by a dependency of pycaret - confirm which.
collections.Iterable = collections.abc.Iterable
collections.Mapping = collections.abc.Mapping
collections.MutableSet = collections.abc.MutableSet
collections.MutableMapping = collections.abc.MutableMapping

# third-party modules
import numpy as np
import pandas as pd
import seaborn as sns
from pycaret.regression import setup, evaluate_model, compare_models, tune_model, dashboard
from sklearn.feature_selection import f_regression

# parameters
# location of the cleaned dataset, relative to the parent of the working directory
META_PATH = 'data/cleaned.csv'

# settings
pd.options.display.max_columns = None  # never truncate columns when displaying frames
sns.set(rc={'figure.figsize': (16, 9)})


In [14]:
# reading of raw datasets
# Resolve the raw-data directory once, relative to the parent of the current
# working directory, so the notebook does not depend on one user's home folder.
conso_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'Conso')

df_calc = pd.read_csv(os.path.join(conso_dir, 'calcinationdata.csv'),
                      encoding="latin_1", index_col=False)
df_chem = pd.read_csv(os.path.join(conso_dir, 'chemdata.csv'),
                      encoding="latin_1")
# NOTE(review): this file used to be read from the absolute path
# '/home/lhoist/code/calcination/data/Conso/Densitydata.csv'. Reading it from
# the shared directory assumes the notebook is run from a direct subdirectory
# of that project root - confirm.
df_dense = pd.read_csv(os.path.join(conso_dir, 'Densitydata.csv'),
                       encoding='latin1')

# definition of regular expression patterns
# Raw strings: sequences such as `\d`, `\+` and `\*` are regex syntax and must
# not be interpreted (or warned about) as Python string escapes.
rep_nan = r'^$'   # a cell that is empty after cleaning
rep_none = r"'|¢XC|°C|N.A|-|¢X|#REF!|c|\+|SAL|\d+/\d+|ND|/|t|\*|\d+°\:"  # fragments to strip
rep_dot = r','    # decimal comma


def replace(x: pd.Series) -> pd.Series:
    '''
    Function to replace patterns in data with correct formatting,
    clean data and do type casting.

    Only columns of dtype ``object`` are processed. Text cells are cleaned
    (decimal comma -> dot, unwanted fragments removed, doubled dots collapsed,
    empty strings -> NaN) and cast to float. Cells of such a column that are
    not text (for example values pandas already parsed as numbers) are
    converted numerically instead of being lost, because the ``.str`` methods
    would turn every non-string cell into NaN.

    Parameters
    ----------
    x: pd.Series
        Dataframe column, as passed by ``DataFrame.transform``.

    Returns
    ----------
    x/y: pd.Series
        Either the original column (non-object dtype) or the cleaned column
        as float.

    Raises
    ----------
    ValueError
        If a text cell still cannot be parsed as a float after cleaning.
    '''
    if x.dtype != object:
        return x

    # positions holding real text; only those go through the regex cleaning
    is_text = x.map(lambda v: isinstance(v, str)).to_numpy(dtype=bool)

    y = pd.Series(np.nan, index=x.index, dtype=float, name=x.name)

    if (~is_text).any():
        # keep already-numeric cells; anything unparseable (None, NaN, ...) -> NaN
        y[~is_text] = pd.to_numeric(
            x[~is_text], errors='coerce').to_numpy(dtype=float)

    if is_text.any():
        text = x[is_text]
        text = text.str.replace(rep_dot, '.', regex=True)
        text = text.str.replace(rep_none, '', regex=True)
        text = text.str.replace('..', '.', regex=False)
        text = text.replace(rep_nan, np.nan, regex=True)
        y[is_text] = text.astype(float).to_numpy()

    return y


# apply transformation function
# columns of the calcination table that are cleaned and cast to float
calc_num = ['temp', 't70', 'sbi5', 'sbi4', 'sbi2', 'sht5', 'sht4', 'sht2',
            'ati5', 'ati4', 'ati2', 'co2', 't60', 't2', 't6', 'wu2', 'wu5', 'wu10', 'pf550', 'pf950']
df_calc[calc_num] = df_calc[calc_num].transform(replace)

# drop unused columns
df_calc = df_calc.drop(
    columns=['mykey', 'mylen', 'checksum', 'testid', 'isindb'])

# filter data by ignore tag, then drop more unused columns
# (chained instead of an in-place drop on the filtered slice, which can
# trigger pandas' SettingWithCopyWarning)
df_chem = df_chem[df_chem['ignore'] == 'F'].drop(
    columns=['site_na', 'stamp', 'tempkey', 'serie', 'cal_des',
             'dpasse', 'fpasse', 'ignore', 'e_cd', 'da_id', 'e_cat'])

# define shared index
df_calc = df_calc.set_index('nech')
df_chem = df_chem.set_index('nech')

# join dataframes together (only samples present in both tables are kept)
df = df_calc.join(df_chem, how='inner')

# merge dataframes with density data
df = pd.merge(df.reset_index(), df_dense.rename(
    columns={'N_lims': 'n_lims'}), on='e_cd', how='left')

# definable filters
# columns moved to the front of the frame
f_keys = ['e_cd', 'site_na', 'produit', 'temp', 't60', 't70']
# columns removed from the merged frame
exc_list = ['fname', 'e_type', 'n_lims_x', 'n_lims_y', 'Nature',
            't2', 't6', 'wu2', 'wu5', 'wu10',
            'cao', 'co2', 'pf', 'pf550', 'pf700', 'pf950', 'pf1000',
            'lime', 'cao_m', 'sio2hum',
            'fe2o3li', 'na2oaa', 'k2oaa', 'sroaa', 'p2o5icp', 'pbxrf', 'fe2o3aa', 'mnoaa']

# drop columns based on exclusion list
df = df.drop(columns=exc_list)

# sort first columns
df = df[f_keys+[c for c in df if c not in f_keys]]
# every 0 in the frame is treated as a missing value
df = df.replace(0, np.nan)

# save df to csv
df.to_csv(os.path.join(os.path.dirname(os.getcwd()), META_PATH))

# drop rows based on dependent variable subset
# (list form of `subset` works across pandas versions)
df = df.dropna(axis=0, subset=['t60'])

# filter by product type
df = df[df['produit'] == 'CALC']
df


Unnamed: 0,e_cd,site_na,produit,temp,t60,t70,nech,sbi5,sbi4,sbi2,sht5,sht4,sht2,ati5,ati4,ati2,mgo,sio2,fe2o3,al2o3,s,mno,p2o5,sro,na2o,k2o,bao,tio2,va,zn,corg,Density,Porosity,WA24
16,AGF 5,Araguaia,CALC,980.0,0.20,,AGF005,2.76,0.73,0.42,5.04,2.75,0.92,5.14,3.47,2.82,0.15,0.20,0.03,0.03,0.002,0.0003,0.0246,0.4375,0.0021,0.0028,35.0000,0.0024,0.45,1.0,0.05,,,
17,AGF 5,Araguaia,CALC,1060.0,0.48,,AGF005,2.72,0.51,0.29,3.60,2.18,0.76,4.28,3.00,2.47,0.15,0.20,0.03,0.03,0.002,0.0003,0.0246,0.4375,0.0021,0.0028,35.0000,0.0024,0.45,1.0,0.05,,,
18,AGF 5,Araguaia,CALC,1140.0,2.03,,AGF005,3.26,0.80,0.38,3.02,1.69,0.67,4.48,2.90,2.40,0.15,0.20,0.03,0.03,0.002,0.0003,0.0246,0.4375,0.0021,0.0028,35.0000,0.0024,0.45,1.0,0.05,,,
19,AGF 5,Araguaia,CALC,1200.0,5.45,,AGF005,4.00,0.58,0.31,3.31,1.63,0.51,3.72,2.49,2.00,0.15,0.20,0.03,0.03,0.002,0.0003,0.0246,0.4375,0.0021,0.0028,35.0000,0.0024,0.45,1.0,0.05,,,
20,AGF 6,Araguaia,CALC,980.0,0.23,,AGF006,2.46,0.75,0.42,4.43,2.45,0.78,5.26,3.96,3.29,0.22,0.38,0.03,0.07,0.006,0.0004,0.0336,0.4264,0.0021,0.0104,34.0000,0.0031,0.74,1.2,0.08,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3323,WRK 7,Warak,CALC,900.0,8.42,,WRK007,1.20,0.84,0.70,16.15,12.37,8.73,37.26,32.16,30.87,0.41,0.14,0.02,0.02,0.028,0.0013,0.0028,0.1050,0.1390,0.0103,0.0003,0.0016,12.00,1.8,0.09,2.51,7.2,
3324,WRK 7,Warak,CALC,980.0,9.50,,WRK007,1.17,0.62,0.49,15.10,11.30,6.93,39.30,33.90,30.64,0.41,0.14,0.02,0.02,0.028,0.0013,0.0028,0.1050,0.1390,0.0103,0.0003,0.0016,12.00,1.8,0.09,2.51,7.2,
3325,WRK 7,Warak,CALC,1060.0,12.70,,WRK007,1.15,0.68,0.54,15.63,11.69,6.72,45.91,39.94,34.30,0.41,0.14,0.02,0.02,0.028,0.0013,0.0028,0.1050,0.1390,0.0103,0.0003,0.0016,12.00,1.8,0.09,2.51,7.2,
3326,WRK 7,Warak,CALC,1140.0,9.30,,WRK007,0.84,0.47,0.34,15.63,11.69,6.72,48.44,43.09,37.27,0.41,0.14,0.02,0.02,0.028,0.0013,0.0028,0.1050,0.1390,0.0103,0.0003,0.0016,12.00,1.8,0.09,2.51,7.2,


In [8]:
# show number of samples per temperature
temp_counts = df['temp'].value_counts()
temp_counts


1140.0    688
1060.0    624
980.0     591
1200.0    560
900.0      69
1100.0     23
1170.0      8
950.0       5
Name: temp, dtype: int64

In [9]:
# set up machine learning model environment
# Only the rows with temp == MODEL_TEMP are modelled. The session id is pinned
# so the train/test split and all downstream model results are reproducible;
# 567 is the id shown in the recorded setup output.
MODEL_TEMP = 1140
SESSION_ID = 567
s = setup(df[df['temp'] == MODEL_TEMP], target='t60',
          ignore_features=['e_cd', 'produit', 't70', 'nech', 'temp'],
          train_size=0.8, session_id=SESSION_ID)


Unnamed: 0,Description,Value
0,Session id,567
1,Target,t60
2,Target type,Regression
3,Data shape,"(688, 29)"
4,Train data shape,"(550, 29)"
5,Test data shape,"(138, 29)"
6,Ignore features,5
7,Numeric features,27
8,Categorical features,1
9,Rows with missing values,100.0%


In [10]:
# compare regression models
# `best` holds the object returned by compare_models and is passed to
# evaluate_model in the next cell.
# NOTE(review): presumably the top-ranked model of the comparison table - confirm.
best = compare_models(turbo=True)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,3.4101,34.5042,5.6165,0.4434,0.5154,0.7387,0.074
catboost,CatBoost Regressor,3.4466,34.5569,5.6329,0.44,0.5257,0.7489,0.941
rf,Random Forest Regressor,3.6157,36.9488,5.8445,0.3928,0.5621,0.8906,0.12
gbr,Gradient Boosting Regressor,3.7408,38.6837,5.9884,0.3669,0.5599,0.8615,0.062
lightgbm,Light Gradient Boosting Machine,3.7389,38.9956,6.0012,0.3641,0.5736,0.8087,0.274
xgboost,Extreme Gradient Boosting,3.6619,39.0322,6.0182,0.3633,0.5532,0.7679,0.3
knn,K Neighbors Regressor,4.0643,44.51,6.4458,0.2596,0.6099,0.9085,0.016
br,Bayesian Ridge,4.452,49.4019,6.8021,0.2088,0.6657,1.1516,0.014
dt,Decision Tree Regressor,4.2036,48.6791,6.7577,0.1942,0.6303,0.8258,0.015
huber,Huber Regressor,4.2727,51.7845,6.9752,0.178,0.6285,0.8952,0.02


In [11]:
# show statistics for model with best R2
# (rendered as an interactive widget with a 'Plot Type' selector)
evaluate_model(best)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…