In [18]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
{Geochemical Clustering - Notebook 3
Geochemical cluster analysis with the affinity propagation
algorithm. Composition of the final cluster dataset.}

Workflow of this notebook:
1. Read the input CSV and drop rows with missing location or chemical values.
2. Scale the chemical columns with a RobustScaler.
3. Fit a new AffinityPropagation model or load a persisted one.
4. Attach cluster labels, cluster sizes and cluster centers to every row,
   remove clusters that are too small and write the result to a CSV file.

{INTERNAL USE ONLY}
"""

# notebook metadata
# NOTE(review): the literal braces in the docstring and in the strings below
# look like unfilled template placeholders - confirm they are intentional
__author__ = '{Malte Schade}'
__copyright__ = 'Copyright {2022}, {Geochemical Clustering - Notebook 3}'
__version__ = '{1}.{1}.{0}'
__maintainer__ = '{Malte Schade}'
__email__ = '{contact@malteschade.com}'
__status__ = '{FINISHED}'

# other modules
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import AffinityPropagation
from joblib import dump, load

# constants
# -- files --
IN_PATH = 'all_prep.csv'          # CSV that is read into the working dataframe
OUT_PATH = 'out_5.csv'            # CSV the clustered result is written to
MODEL_PATH = 'model_all.joblib'   # file the clustering model is dumped to / loaded from

# -- behaviour --
# True: fit a new model and persist it; False: load the persisted model instead
TRAIN_NEW = False
# clusters with fewer points than this are removed from the result
MIN_CLUSTER_SIZE = 5

# -- column groups --
# chemical columns that are scaled and clustered
CHEM_COLS = [
    'mgo', 'sio2', 'fe2o3',
    'al2o3', 's', 'mno',
]
# location columns that are carried through to the result
LOC_COLS = [
    'key', 'cat', 'alias', 'for_type',
    'dpasse', 'fpasse',
    'x1', 'y1', 'z1',
]

# settings
# show every column when a dataframe is displayed
pd.options.display.max_columns = None


In [19]:
# load the csv, move its first column from the index back into the frame and
# keep only rows that have a value in every location and chemical column
df = (
    pd.read_csv(IN_PATH, index_col=0)
    .reset_index()
    .dropna(axis=0, subset=LOC_COLS + CHEM_COLS)
)

# independent copies of the location part and the chemical part
df_loc = df[LOC_COLS].copy()
df_chem = df[CHEM_COLS].copy()


In [20]:
# display location data (the LOC_COLS columns of the NaN-filtered dataframe);
# as the last expression of the cell it is rendered as a table
df_loc


Unnamed: 0,key,cat,alias,for_type,dpasse,fpasse,x1,y1,z1
0,TQC4825,LN,2019_B11_R4_D11,BLHL,0.00,9.10,43263.374000,60619.664000,-0.451000
1,BIZ136,LN,BIZ136,CORE,91.00,93.00,43338.738139,60572.138242,-53.224460
2,TQC4385,LN,2018_B109_R3_C4,BLHL,0.00,6.99,43249.905000,60473.692000,-1.501000
3,BIZ008,LN,BIZ008,CORE,128.75,130.83,43073.911147,61089.145321,-55.897404
4,BIZ146,LN,BIZ146,CORE,57.50,59.50,42927.192991,60483.098154,-41.066075
...,...,...,...,...,...,...,...,...,...
15825,TQC4973,LN,2019_B24_R3_C2,BLHL,0.00,9.87,43068.654000,60668.521000,-0.063000
15826,TQC5204,LN,2019_B46_R3_C1,BLHL,0.00,9.60,43263.491000,60654.701000,-0.201000
15827,TQC0790,LN,2016_B40_Pin_2,BLHL,0.00,3.74,43046.738300,60493.957500,6.874900
15828,BIZ135,LN,BIZ135,CORE,18.60,21.00,43095.474043,60708.153652,-15.654195


In [21]:
# display chemical data (the CHEM_COLS columns of the NaN-filtered dataframe);
# as the last expression of the cell it is rendered as a table
df_chem


Unnamed: 0,mgo,sio2,fe2o3,al2o3,s,mno
0,0.68,0.16,0.026,0.07,0.005,0.0049
1,0.40,0.18,0.031,0.07,0.005,0.0098
2,0.40,0.08,0.019,0.04,0.001,0.0033
3,0.29,0.07,0.090,0.03,0.002,0.0078
4,0.63,0.13,0.051,0.07,0.017,0.0083
...,...,...,...,...,...,...
15825,0.33,0.12,0.027,0.07,0.000,0.0027
15826,0.86,0.24,0.040,0.11,0.007,0.0052
15827,0.33,0.10,0.040,0.04,0.003,0.0031
15828,0.58,0.16,0.045,0.06,0.002,0.0035


In [22]:
# scale chemical data
# Only the raw CHEM_COLS are passed to the scaler: this cell adds a
# 'chem_cluster' column to df_chem further down, so selecting the columns
# explicitly keeps the cell re-runnable without scaling the label column.
scaler = RobustScaler()
chem_scaled = scaler.fit_transform(df_chem[CHEM_COLS])

# train new clustering model or load an existing one
if TRAIN_NEW:
    chem_model = AffinityPropagation(
        random_state=0, damping=0.5, convergence_iter=20, max_iter=200, verbose=True)
    chem_model.fit(chem_scaled)
    dump(chem_model, MODEL_PATH)

else:
    # SECURITY: joblib.load unpickles the file and can therefore execute
    # arbitrary code - only load model files that come from a trusted source.
    chem_model = load(MODEL_PATH)

# a persisted model carries the labels of the data it was fitted on; fail with
# a clear message if it does not belong to the currently loaded dataset
if len(chem_model.labels_) != len(df_chem):
    raise ValueError(
        f'model in {MODEL_PATH!r} holds {len(chem_model.labels_)} labels but the '
        f'dataset has {len(df_chem)} rows; set TRAIN_NEW = True to refit')

# label the datapoints with the calculated cluster number
df_chem['chem_cluster'] = chem_model.labels_

# restore original value scale (before scaling); the row position of each
# center is exposed as the 'chem_cluster' column
df_chem_centers = pd.DataFrame(scaler.inverse_transform(
    chem_model.cluster_centers_), columns=CHEM_COLS).reset_index().rename(columns={'index': 'chem_cluster'})

# calculate information about cluster centers
# cluster_centers_indices_ are row positions in the fitted array; df_chem keeps
# the index labels of the NaN-filtered dataframe, so positions are translated
# to labels to make 'index_center' refer to the same index as the data rows
df_chem_centers = pd.concat([pd.Series(
    df_chem.index[chem_model.cluster_centers_indices_], name='index_center'), df_chem_centers], axis=1)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [23]:
# put location and chemical columns side by side (rows are matched on the index)
df_results = pd.concat([df_loc, df_chem], axis=1)

# number of points in the cluster each row belongs to
cluster_sizes = df_results.groupby('chem_cluster')['chem_cluster']
df_results['chem_n'] = cluster_sizes.transform('count')

# attach the cluster center information to every datapoint; columns that
# exist in both frames get the suffix '_center' on the center side
df_results = df_results.join(
    df_chem_centers,
    on='chem_cluster',
    how='left',
    rsuffix='_center',
)
df_results = df_results.drop(columns=['chem_cluster_center'])

# remove clusters with too few points
df_results = df_results.loc[df_results['chem_n'] >= MIN_CLUSTER_SIZE].copy()

# order the rows by cluster index
df_results = df_results.sort_values(by=['chem_cluster'])

df_results


Unnamed: 0,key,cat,alias,for_type,dpasse,fpasse,x1,y1,z1,mgo,sio2,fe2o3,al2o3,s,mno,chem_cluster,chem_n,index_center,mgo_center,sio2_center,fe2o3_center,al2o3_center,s_center,mno_center
15641,TQC5592,LN,2019_B85_R3_C5,BLHL,0.00,10.26,43024.320000,60587.736000,-9.874000,0.35,0.12,0.047,0.06,0.006,0.0024,1,69,45,0.44,0.17,0.050,0.07,0.006,0.0030
10462,TQC0868,LN,2016_B45_P2_22,BLHL,0.00,15.30,42996.598000,60600.712000,12.650000,0.42,0.20,0.048,0.09,0.006,0.0030,1,69,45,0.44,0.17,0.050,0.07,0.006,0.0030
12496,TQC0415,LN,2016_B6_R10_5,BLHL,0.00,10.14,43061.135000,60440.552000,10.069000,0.56,0.15,0.050,0.05,0.008,0.0029,1,69,45,0.44,0.17,0.050,0.07,0.006,0.0030
9933,BIZ071,LN,BIZ071,DERC,27.00,29.00,43044.000000,60598.000000,-22.490000,0.39,0.25,0.044,0.07,0.006,0.0027,1,69,45,0.44,0.17,0.050,0.07,0.006,0.0030
7345,BIZ154,LN,BIZ154,CORE,36.00,38.00,42855.505687,60588.746097,-15.507142,0.51,0.17,0.040,0.08,0.005,0.0032,1,69,45,0.44,0.17,0.050,0.07,0.006,0.0030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5329,BIZ008,LN,BIZ008,CORE,104.96,106.74,43091.144268,61082.872978,-40.509068,0.58,0.05,0.020,0.01,0.004,0.0031,916,128,15766,0.65,0.10,0.023,0.04,0.002,0.0022
5427,TQC1793,LN,2017_B46_R4_26,BLHL,0.00,11.56,43017.552000,60640.370300,10.778800,0.64,0.08,0.029,0.04,0.003,0.0030,916,128,15766,0.65,0.10,0.023,0.04,0.002,0.0022
14131,TQC2793,LN,2017_B118_R1_A11,BLHL,0.00,8.02,43101.350200,60519.995200,-0.989700,0.61,0.08,0.022,0.04,0.000,0.0020,916,128,15766,0.65,0.10,0.023,0.04,0.002,0.0022
9114,BIZ119A,LN,BIZ119A,CORE,80.00,81.80,43659.584433,61683.891013,-14.657518,0.56,0.04,0.017,0.02,0.003,0.0021,916,128,15766,0.65,0.10,0.023,0.04,0.002,0.0022


In [24]:
# write the final cluster dataset (including its index) to the output csv
df_results.to_csv(path_or_buf=OUT_PATH)
