In [3]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import os
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
#import matplotlib
# matplotlib.use('Agg')
import lightgbm as lgb
from sklearn import metrics
from numpy.ma import MaskedArray
import sklearn.utils.fixes
sklearn.utils.fixes.MaskedArray = MaskedArray
from sklearn.model_selection import train_test_split
#from sklearn.metrics import plot_confusion_matrix
import shap

In [4]:
# **************** Loading the data
# For the example I am usign a table in csv format, but if your data are larger you can use other formats compatible with pandas, like .pickle
print("Start loading data file ")

cols_names=['area', 'perimeter', 'neighbours', 'max neighbour distance',
       'min neighbour distance', 'max vertices distance',
       'min vertices distance', 'max vertices-point distance',
       'min vertices-point distance', 'distance to center', 'activity',
       'particle type']
density=0.008
fa=100
input_file=f"phia{density}/particles-features-{density}-Fa{fa}.txt"
data = pd.read_csv(input_file, delimiter=' ',names=cols_names)

#print(data)


# *** Split the available data in training and test
# the model will be trained only using the training data
# so we can evaluate the performance on a different (test) set, that is new for the ML model
df_train, df_test = train_test_split(data, random_state=50, test_size=0.3)

print('------------ BEGIN TRAIN DATAFRAME COLUMNS ------------------')
print(df_train.columns)
print('------------- END TRAIN DATAFRAME COLUMNS -------------------')


Start loading data file 
------------ BEGIN TRAIN DATAFRAME COLUMNS ------------------
Index(['area', 'perimeter', 'neighbours', 'max neighbour distance',
       'min neighbour distance', 'max vertices distance',
       'min vertices distance', 'max vertices-point distance',
       'min vertices-point distance', 'distance to center', 'activity',
       'particle type'],
      dtype='object')
------------- END TRAIN DATAFRAME COLUMNS -------------------


In [5]:
print("create x/y dataframes (train set)")
# In this particulat proble we want to identify if a specific particle 'is_active' or not
# so we have to do a binary classification of the column 'is_active'
df_train_y = df_train['activity'].copy().astype('int')
df_train.drop(columns='activity', inplace=True)
df_train_x = df_train

print("create x/y dataframes (test set)")
df_test_y = df_test['activity'].copy().astype('int')
df_test.drop(columns='activity', inplace=True)
df_test_x = df_test

create x/y dataframes (train set)
create x/y dataframes (test set)


In [7]:
print('\n\n*** Using a gradient boosting algorithm')
# This is a more advanced algorithm that is explained here:https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
model = lgb.LGBMClassifier(learning_rate=0.01,num_leaves=100,max_depth=15, early_stopping_rounds=10, num_iterations=3000, random_state=42)

model.fit(df_train_x,df_train_y,eval_set=[(df_train_x,df_train_y),(df_test_x,df_test_y)],
            eval_metric='binary')
print('\n\n***Training accuracy {:.4f}'.format(model.score(df_train_x,df_train_y)))
print('***Testing accuracy {:.4f}\n'.format(model.score(df_test_x,df_test_y)))

print(metrics.classification_report(df_test_y,model.predict(df_test_x)))



*** Using a gradient boosting algorithm




[LightGBM] [Info] Number of positive: 11304, number of negative: 1388696
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006001 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2304
[LightGBM] [Info] Number of data points in the train set: 1400000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008074 -> initscore=-4.810964
[LightGBM] [Info] Start training from score -4.810964
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[835]	training's binary_logloss: 0.0170565	valid_1's binary_logloss: 0.022454


***Training accuracy 0.9943
***Testing accuracy 0.9939

              precision    recall  f1-score   support

           0       0.99      1.00      1.00    595304
           1       0.74      0.34      0.46      4696

    accuracy                           0.9

In [13]:
pred_test_gb=model.predict(df_test_x)
f1_score(df_test_y,pred_test_gb)

0.46277842907385697

In [6]:
#Returns accuracy,auc,f1_score
def score(y,pred):
    return accuracy_score(y,pred),roc_auc_score(y,pred),f1_score(y,pred)
# score(df_test_y,pred_test_gb)

In [None]:
pred_test=model.predict(df_test_x)
print(f'The accuracy on the test set is {accuracy_score(df_test_y,pred_test_gb)} and the AUC is {roc_auc_score(df_test_y,pred_test_gb)}')

The accuracy on the test set is 0.908095 and the AUC is 0.5705903378676136


# We apply a Decision Tree Method

In [17]:
from sklearn.tree import DecisionTreeClassifier
dTreemodel=DecisionTreeClassifier(criterion='entropy',random_state=0)
dTreemodel.fit(df_train_x,df_train_y)

In [18]:
pred_test_dtree=dTreemodel.predict(df_test_x)

In [19]:
score(df_test_y,pred_test_dtree)

(0.9888316666666667, 0.673667907775452, 0.3313042610517912)

We could prune the tree for reducing its complexity but this won't increase dramatically its accuracy, which is the lowest of all of our methods

In [None]:
import sklearn.model_selection as skm
ccp_path=dTreemodel.cost_complexity_pruning_path(df_train_x,df_train_y)
kfold=skm.KFold(10,random_state=1,shuffle=True)

In [None]:
grid=skm.GridSearchCV(dTreemodel,{'ccp_alpha':ccp_path.ccp_alphas},refit=True,cv=kfold,scoring='accuracy',n_jobs=-2)
grid.fit(df_train_x,df_train_y)
grid.best_score_ 

# We apply now a Random Forest Method

In [10]:
from sklearn.ensemble import RandomForestClassifier
rforestmodel=RandomForestClassifier(n_jobs=-1)
rforestmodel.fit(df_train_x,df_train_y)
pred_test_rforest=rforestmodel.predict(df_test_x)

In [15]:
score(df_test_y,pred_test_rforest)

(0.9937866666666667, 0.6444748450398761, 0.4218362282878412)

# We apply now a SVM (too much execution time)

In [None]:
from sklearn.svm import SVC
SVCmodel=SVC(kernel='rbf',random_state=42)
SVCmodel.fit(df_train_x[['min neighbour distance','min vertices-point distance']],df_train_y)
pred_SVC=SVCmodel.predict(df_test_x['min_neighbour distance','min vertices-point distance'])

# We apply now a ...

# For evaluating if the voronoi parameters are really useful

In [39]:
import seaborn as sns
for col in data.columns:
    sns.kdeplot(data[data['activity'] == 0][col], label="Pasivas", fill=True)
    sns.kdeplot(data[data['activity'] == 1][col], label="Activas", fill=True)
    plt.title(col)
    plt.legend()
    plt.savefig(f"{col}_distribution.png")  # Guarda la imagen
    plt.close()  # Cierra la figura para liberar memoria

  sns.kdeplot(data[data['activity'] == 0][col], label="Pasivas", fill=True)
  sns.kdeplot(data[data['activity'] == 1][col], label="Activas", fill=True)
  plt.legend()
