In [1]:
import matplotlib
matplotlib.use("Agg")  # Must be the FIRST line

import matplotlib.pyplot as plt


plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
})

# Confirm backend
print("Backend in use:", matplotlib.get_backend())  # must print 'agg'

# import modules
import numpy as np
import seaborn as sns; sns.set()

# import the sklearn modules                                                                                                               
import sklearn
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, RocCurveDisplay, make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
from sklearn.model_selection import cross_val_score


import pickle
import pandas as pd

import os

Backend in use: agg


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Read the data

dfnoise = pd.read_csv('/home/lorenzo.mobilia/PhD-Thesis/pastro-o3-mbta-triggers-paper/RF-codes/data/O3b/noise_no_catalogue.csv', sep = ',')
dfinj = pd.read_csv('/home/lorenzo.mobilia/PhD-Thesis/pastro-o3-mbta-triggers-paper/RF-codes/data/O3b/injections.csv', sep = ',')
dfEvents_confident = pd.read_csv('/home/lorenzo.mobilia/PhD-Thesis/pastro-o3-mbta-triggers-paper/RF-codes/data/O3b-events/GWTCO3b.csv', delimiter = ',')

In [3]:
# Path specifics  
TYPE = "DOUBLE" # Type of coincidence
Ifo = 'HL' # Interferometers
F = 'F_ER' # Features
M = 'M_Best' # Type of model


#Define the features
features_inj = [#'index', 
                'L_snr', 'H_snr','V_snr',#F0
                'amplitude', #F9
                'L_autochi^2_PQ',	'H_autochi^2_PQ', #F0
                'm1', 'm2', #F1
                'mc',
                's1z', 's2z', #F2
                't_dur', #F3
                'nEvents', #F4
                'L_ERw', 'H_ERw', #F5
                'L_phase',	'H_phase', #F6
                'L_effDist', 'H_effDist', #F7
                'Lend_time', 'Hend_time',
                'iFAR',
                'label',
                'gps_time',
            ]

savefig_path = f'/home/lorenzo.mobilia/PhD-Thesis/pastro-o3-mbta-triggers-paper/RF-codes/O3b-results-figs/{Ifo}_{F}_{M}/'
savedata_path = f'/home/lorenzo.mobilia/PhD-Thesis/pastro-o3-mbta-triggers-paper/RF-codes/O3b-results-data/{Ifo}_{F}_{M}/'

if(os.path.isdir(savefig_path) == False):
    os.makedirs(savefig_path)
if(os.path.isdir(savedata_path) == False):
    os.makedirs(savedata_path)

print('Plots are stored in ',savefig_path )
print('Data are stored in ', savedata_path)

Plots are stored in  /home/lorenzo.mobilia/PhD-Thesis/pastro-o3-mbta-triggers-paper/RF-codes/O3b-results-figs/HL_F_ER_M_Best/
Data are stored in  /home/lorenzo.mobilia/PhD-Thesis/pastro-o3-mbta-triggers-paper/RF-codes/O3b-results-data/HL_F_ER_M_Best/


In [5]:
# Create balanced dataset
dfDataset_filtered1 = dfnoise.sample(n=76389, replace=False, random_state=12) # Dataset for Test - Training
#dfDataset_filtered2 = dfDataset_filtered[~dfDataset_filtered.index.isin(dfDataset_filtered1.index)]
dfDataset_filtered2 = dfnoise.drop(dfnoise.index) # Dataset for Validation
#dfDataset_filtered = dfDataset_filtered

# Merge and create the final dataset
dataset = pd.concat([dfDataset_filtered1, dfinj], ignore_index=True)
print(dataset['label'].value_counts())

label
Noise    76389
Inj      76389
Name: count, dtype: int64


In [6]:
# Study the ranking statistics distribution
noise_hist = dataset[dataset['label'] == 'Noise']['amplitude']
inj_hist = dataset[dataset['label'] == 'Inj']['amplitude']

# Define common bin edges (example: 50 bins over combined range)
min_val = min(noise_hist.min(), inj_hist.min())
max_val = max(noise_hist.max(), inj_hist.max())
bins = np.linspace(min_val, max_val, 100)

# Create a new figure explicitly
plt.figure()

# Plot
sns.histplot(noise_hist, bins=bins, alpha=0.7, label='Noise', stat = 'count', element = 'step')
sns.histplot(inj_hist, bins=bins, alpha=0.5, label='Injections', stat = 'count', element = 'step')

plt.legend()
plt.yscale('log')
plt.xlabel('Ranking statistic', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Ranking statistic Noise - Injections Distributions', fontsize=15)

plt.savefig(savefig_path + 'amplitude_distribution.png', dpi=300, bbox_inches='tight')
plt.close()  # Close the figure to avoid accumulation

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


In [7]:
# List of features to plot
exclude_columns = ['label', 'gps_time', 'gps_time_int', 'index', 't_index', 'iFAR ']  # add any others to exclude
features = [col for col in dataset.columns if col not in exclude_columns and dataset[col].dtype in ['float64', 'int64']]

dataset['m_tot'] = dataset['m1'] + dataset['m2']
dataset['s_eff'] = (dataset['m1']*dataset['s1z'] + dataset['m2']*dataset['s2z']) / dataset['m_tot']
dataset['q'] = dataset['m2']/dataset['m1']

In [8]:
# Here we work only with Training - Test dataset

dfDataset = dataset[features_inj]
dfDataset['m_tot'] = dataset['m_tot']
dfDataset['s_eff'] = dataset['s_eff']
dfDataset['q'] = dataset['q']

dfDataset['L_ER'] = np.sqrt(1 - dfDataset['L_ERw']) + 0.3
dfDataset['H_ER'] = np.sqrt(1 - dfDataset['H_ERw']) + 0.3

dfDataset['dphi'] = dfDataset['H_phase'] - dfDataset['L_phase']
dfDataset['dt'] = dfDataset['Hend_time'] - dfDataset['Lend_time']
dfDataset['dD'] = dfDataset['H_effDist'] - dfDataset['L_effDist']
#dfDataset['mc_rotated'] = dfDataset['mc'] / dfDataset['m_tot']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfDataset['m_tot'] = dataset['m_tot']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfDataset['s_eff'] = dataset['s_eff']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfDataset['q'] = dataset['q']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

In [9]:
# Create the X matrix for features and y for the target       
X = dfDataset.drop(columns = ['label','gps_time','V_snr','m_tot', 's_eff',
                              'iFAR', 'amplitude', 'q', 'mc','L_ERw', 'H_ERw',
                              'L_phase', 'H_phase', 'L_effDist', 'H_effDist', 'Lend_time', 'Hend_time'], axis = 1)

#X.index.name = 'INDEX'
print('Training dataset:')
print(X.columns)
print('Labels array:')
y = pd.DataFrame(dfDataset['label'])
print(y.value_counts())

# split the test                                                                                                                           
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.3,
                                                    random_state=23
                                                    )

Training dataset:
Index(['L_snr', 'H_snr', 'L_autochi^2_PQ', 'H_autochi^2_PQ', 'm1', 'm2', 's1z',
       's2z', 't_dur', 'nEvents', 'L_ER', 'H_ER', 'dphi', 'dt', 'dD'],
      dtype='object')
Labels array:
label
Inj      76389
Noise    76389
Name: count, dtype: int64


In [11]:
# Uncomment for grid search

"""
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# Define the Random Forest model
clf = RandomForestClassifier(random_state=42)

y_train_array = y_train.values.ravel()

# define the grid
param_grid = {
    'n_estimators': [15, 50, 100],
    'criterion': ['entropy', 'gini'],
    'max_depth': [10, 15, 20],
    'min_samples_leaf': [1, 5, 10],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt'],
}

# Define metrics function
precision = make_scorer(precision_score)
recall = make_scorer(recall_score)
f1 = make_scorer(f1_score, pos_label='Inj')

# Perform the grid search using F1 score
grid_search_f1 = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=f1, n_jobs=-1)
grid_search_f1.fit(X_train, y_train_array)

# Get the best parameters and scores
print("Best parameters (F1):", grid_search_f1.best_params_)
print("Best F1 score:", grid_search_f1.best_score_)
"""

'\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import make_scorer, precision_score, recall_score, f1_score\n\n# Define the Random Forest model\nclf = RandomForestClassifier(random_state=42)\n\ny_train_array = y_train.values.ravel()\n\n# define the grid\nparam_grid = {\n    \'n_estimators\': [15, 50, 100],\n    \'criterion\': [\'entropy\', \'gini\'],\n    \'max_depth\': [10, 15, 20],\n    \'min_samples_leaf\': [1, 5, 10],\n    \'min_samples_split\': [2, 5, 10],\n    \'max_features\': [\'sqrt\'],\n}\n\n# Define metrics function\nprecision = make_scorer(precision_score)\nrecall = make_scorer(recall_score)\nf1 = make_scorer(f1_score, pos_label=\'Inj\')\n\n# Perform the grid search using F1 score\ngrid_search_f1 = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=f1, n_jobs=-1)\ngrid_search_f1.fit(X_train, y_train_array)\n\n# Get the best parameters and scores\nprint("Best parameters (F1):", grid_search_f1.best_params_)\nprint("Best F1 score:", grid_searc

In [10]:
# Define and fit the classifier

clf = RandomForestClassifier(n_estimators = 100,
     #bootstrap=False, # added
     bootstrap=True,
     random_state=52,
     n_jobs =  -1,
     criterion =  'entropy',
     max_features =  'sqrt',
     max_depth =  12,
     #min_impurity_decrease=0.00005, # added
     min_samples_split= 5,
     min_samples_leaf = 1,
     #max_leaf_nodes =  8,
     #ccp_alpha=0.00005,
     #class weight
     #class_weight= {'Noise': 1, 'Inj': 2}
     )

clf.fit(X_train, y_train)
print('Probability classes: ',clf.classes_)                                                                    
y_pred_prob = clf.predict_proba(X_test)

#define the dataframe for the probabilities
dfprobs = pd.DataFrame(y_pred_prob, columns = ['ps', 'pTerr'])
dfprobs.index = X_test.index
print(dfprobs)

#create the dataframe to better manage the data
dfX_train = pd.DataFrame(X_train)
dfy_train = pd.DataFrame(y_train)
dfX_test = pd.DataFrame(X_test)
dfy_test = pd.DataFrame(y_test)

# Create the resulting dataset for test
dfy_pred_prob = pd.DataFrame(y_pred_prob)
test_dataset = pd.merge(dfX_test, dfy_test, left_index=True, right_index=True, how='inner')
#print(len(test_dataset))
#print(test_dataset['label'].value_counts())

# Add final informations
df_final_test = pd.merge(test_dataset, dfprobs, left_index=True, right_index=True, how='inner')
dfRFTriggers = df_final_test
dfRFTriggers['amplitude'] = dataset['amplitude']
dfRFTriggers['iFAR'] = dataset['iFAR']
dfRFTriggers['gps'] = dataset['gps_time']

# Separate noise and injection 
dfRFTriggers_inj = dfRFTriggers[dfRFTriggers['label'] == 'Inj']
dfRFTriggers_noise = dfRFTriggers[dfRFTriggers['label'] == 'Noise']

  clf.fit(X_train, y_train)


Probability classes:  ['Inj' 'Noise']
              ps     pTerr
118522  0.999383  0.000617
85137   0.989543  0.010457
147977  0.999997  0.000003
65697   0.005688  0.994312
3201    0.024163  0.975837
...          ...       ...
6621    0.103349  0.896651
99122   0.944514  0.055486
100555  1.000000  0.000000
7221    0.010434  0.989566
23075   0.009590  0.990410

[45834 rows x 2 columns]


In [11]:
# Plot the ps distribution
# Get both data arrays
ps_noise = dfRFTriggers_noise['ps']
ps_inj = dfRFTriggers_inj['ps']

# Define common bins across both datasets
min_val = min(ps_noise.min(), ps_inj.min())
max_val = max(ps_noise.max(), ps_inj.max())
bins = np.linspace(min_val, max_val, 100)

plt.figure()

# Plot using consistent bins
sns.histplot(ps_noise, bins=bins, alpha=0.5, label='Noise', stat='count', element='step')
sns.histplot(ps_inj, bins=bins, label='Injections', stat='count', element='step')

# Formatting
plt.yscale('log')
plt.legend()
plt.xlabel(r'$p_s$', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title(r'$p_s$ Injections vs Noise Distribution', fontsize=15)
plt.savefig(savefig_path + 'ps_distribution.png', dpi=300, bbox_inches='tight')
plt.close()

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


In [12]:
# Now we apply the logit transformation - ensure no inf values

# Ensure no inf values
dfRFTriggers_inj['ps_lim'] = np.log(dfRFTriggers_inj['ps'] / (1 - dfRFTriggers_inj['ps']))
dfRFTriggers_noise['ps_lim'] = np.log(dfRFTriggers_noise['ps'] / (1 - dfRFTriggers_noise['ps']))

# Replace inf and -inf with finite max/min values
m1 = dfRFTriggers_inj.loc[np.isfinite(dfRFTriggers_inj['ps_lim']), 'ps_lim'].max()
dfRFTriggers_inj['ps_lim'].replace([np.inf], m1, inplace=True)

m2 = dfRFTriggers_noise.loc[np.isfinite(dfRFTriggers_noise['ps_lim']), 'ps_lim'].max()
min2 = dfRFTriggers_noise.loc[np.isfinite(dfRFTriggers_noise['ps_lim']), 'ps_lim'].min()
dfRFTriggers_noise['ps_lim'].replace([np.inf], m2, inplace=True)
dfRFTriggers_noise['ps_lim'].replace([-np.inf], min2, inplace=True)

# Define common bins over both datasets
combined = pd.concat([dfRFTriggers_inj['ps_lim'], dfRFTriggers_noise['ps_lim']])
bins = np.linspace(combined.min(), combined.max(), 100)

plt.figure()

# Plot histograms
sns.histplot(dfRFTriggers_noise['ps_lim'], bins=bins, label='Noise', stat='count', element='step', alpha=0.5)
sns.histplot(dfRFTriggers_inj['ps_lim'], bins=bins, label='Injections', stat='count', element='step')

# Formatting
plt.yscale('log')
plt.xlabel(r'$\tilde{p}_s$')
plt.ylabel('Occurrencies')
plt.legend()
plt.title(r'Distribution of $\tilde{p}_s$')
plt.tight_layout()
plt.savefig(savefig_path + 'ps_lim_distribution.png', dpi=300, bbox_inches='tight')
plt.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfRFTriggers_inj['ps_lim'] = np.log(dfRFTriggers_inj['ps'] / (1 - dfRFTriggers_inj['ps']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfRFTriggers_noise['ps_lim'] = np.log(dfRFTriggers_noise['ps'] / (1 - dfRFTriggers_noise['ps']))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or 

In [13]:
# Now compute the KDE for the pAstro

from sklearn.neighbors import KernelDensity

values_inj = dfRFTriggers_inj['ps_lim'].values.reshape(-1,1) # create trhe right format for the data

# Choose the binning
binning_noise = 0.6
binning_inj =0.8

kde_inj = KernelDensity(bandwidth=binning_inj, kernel='gaussian') # Define the kde with banwidth and kernel
kde_inj.fit(values_inj) # fit with the values

x = np.linspace(min(dfRFTriggers_inj['ps_lim']), max(dfRFTriggers_inj['ps_lim']), 25).reshape(-1,1) # take some random values from the data

# Estimate the density for the range of values
log_density_inj = kde_inj.score_samples(x) # Comput the log likelihood oe each sample under the model
density_inj = np.exp(log_density_inj)

values_noise = dfRFTriggers_noise['ps_lim'].values.reshape(-1,1) # create trhe right format for the data

kde_noise = KernelDensity(bandwidth=binning_noise, kernel='gaussian') # Define the kde with banwidth and kernel
kde_noise.fit(values_noise) # fit with the values


y = np.linspace(min(dfRFTriggers_noise['ps_lim']), max(dfRFTriggers_noise['ps_lim']), 25).reshape(-1,1) # take some random values from the data

# Estimate the density for the range of values
log_density_noise = kde_noise.score_samples(y) # Comput the log likelihood oe each sample under the model
density_noise = np.exp(log_density_noise)

In [14]:
# Plt the KDE

# Flatten KDE input arrays
x = x.flatten()
y = y.flatten()

# Combine both ps_lim arrays to define shared bin edges
ps_lim_inj = dfRFTriggers_inj['ps_lim']
ps_lim_noise = dfRFTriggers_noise['ps_lim']
combined = np.concatenate([ps_lim_inj, ps_lim_noise])
bins = np.linspace(combined.min(), combined.max(), 25)

# Create figure and twin axes
fig = plt.figure(figsize=(15, 10))
ax1 = fig.add_subplot(111)
ax2 = ax1.twinx()

# Plot histograms with shared bins and no auto-legend
sns.histplot(ps_lim_noise, bins=bins, alpha=0.3, ax=ax1, stat="density", color='b', label=None)
sns.histplot(ps_lim_inj, bins=bins, alpha=0.3, ax=ax1, stat="density", color='r', label=None)

# Plot KDEs (lines) with no auto-legend
sns.lineplot(x=x, y=density_inj, color='r', label=None)
sns.lineplot(x=y, y=density_noise, color='b', label=None)

# Manually define legend handles
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

custom_legend = [
    Line2D([0], [0], color='r', lw=2, label='Injections PDF'),
    Line2D([0], [0], color='b', lw=2, label='Noise PDF'),
    Patch(facecolor='r', alpha=0.3, label='Injections (hist)'),
    Patch(facecolor='b', alpha=0.3, label='Noise (hist)'),
]
# Ticks dimension
ax1.tick_params(axis='both', labelsize=25)  # ticks for ax1
ax2.tick_params(axis='both', labelsize=25)  # ticks for ax2

# Axis labels and title
ax1.set_xlabel(r'$\tilde{p}_s$',  fontsize=30)
ax1.set_ylabel('Histogram Density',  fontsize=30)
ax2.set_ylabel('PDF Density',  fontsize=30)
plt.title('PDF via KDE and Histogram',  fontsize=40)

# External single legend
ax1.legend(handles=custom_legend, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=20)

# Save and display
plt.tight_layout()
plt.savefig(savefig_path + 'pdf_final.png', dpi=300, bbox_inches='tight')
plt.close()

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


In [15]:
# Now compute the pAstro - Injections

values = np.array(dfRFTriggers_inj['ps_lim']) # read the ps_lim
ranks = values.reshape(-1, 1) # reshape it
#ranks_list = ranks.flatten().tolist() # flatten to a list

log_score_event = kde_inj.score_samples(ranks) # assign the score
score_event =  np.exp(log_score_event) # p(signal|ps_lim)

log_score_noise = kde_noise.score_samples(ranks)
score_noise = np.exp(log_score_noise) # p(noise|ps_lim)

# priors
Lambda1 = 36
Lambda0 = len(dfnoise)
print('Prior for Noise (RATE 6 months)', len(dfnoise))

# Now compute the pAstro from ps_lim

pAstro_rw_signals =[]
for rank, score_s, score_n in zip(ranks, score_event, score_noise):
   S = Lambda1*score_s
   N = Lambda0*score_n
   value = S/(S + N)
   pAstro_rw_signals.append(value.tolist())

dfRFTriggers_inj['pAstro_rw'] = pAstro_rw_signals

Prior for Noise (RATE 6 months) 129106


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfRFTriggers_inj['pAstro_rw'] = pAstro_rw_signals


In [16]:
# Now compute the pAstro - Noise

values = np.array(dfRFTriggers_noise['ps_lim'])
ranks = values.reshape(-1, 1)

log_score_event = kde_inj.score_samples(ranks)
score_event =  np.exp(log_score_event)

log_score_noise = kde_noise.score_samples(ranks)
score_noise = np.exp(log_score_noise)

pAstro_rw_noise =[]
for rank, score_s, score_n in zip(ranks, score_event, score_noise):
   S = Lambda1*score_s
   N = Lambda0*score_n
   value = S/(S + N)
   pAstro_rw_noise.append(value.tolist())

dfscore_event_noise_triggers = pd.DataFrame(score_event)
dfscore_noise_noise_triggers = pd.DataFrame(score_noise)

dfRFTriggers_noise['pAstro_rw'] = pAstro_rw_noise

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfRFTriggers_noise['pAstro_rw'] = pAstro_rw_noise


In [17]:
# Plot the result

plt.figure()
plt.scatter(dfRFTriggers_inj['amplitude'], dfRFTriggers_inj['pAstro_rw'], c = 'darkorange', s = 1, label = 'Injections')
plt.scatter(dfRFTriggers_noise['amplitude'], dfRFTriggers_noise['pAstro_rw'], c = 'b', s = 3, label = 'Noise')
plt.xlabel('Ranking statistic')
plt.ylabel(r'$p_\mathrm{astro}^{ps}$', fontsize = 15)
plt.title(r'$p_\mathrm{astro}^{ps}$ - ranking statistics', fontsize = 15)
plt.legend()
plt.savefig(savefig_path + 'pAstro_ps_amplitude_injections.png', dpi=300, bbox_inches='tight')
plt.legend()
plt.close()

In [18]:
# Define the the RoC function 
def RoC(df1, df2, column, var, label, save = True, thr=0.005, color = 'b'):
    if column not in df1.columns:
        raise ValueError(f"Column '{column}' not found in the DataFrame.")
# Drop missing values and sort unique values of the column                                                                                                       
    data1 = df1[column].dropna()
    data2 = df2[column].dropna()
    #thresholds = np.sort(data.unique())                                                                                                                             
    thresholds = np.arange(min(data1),max(data1), thr)

    # Calculate the cumulative distribution                                                                                                                          
    count_above_threshold1 = [np.sum(data1 > threshold) / len(data1) for threshold in thresholds]
    count_above_threshold2 =  [np.sum(data2 > threshold) / len(data2) for threshold in thresholds]

    # Create a DataFrame for the cumulative distribution                                                                                                             
    cumulative_df = pd.DataFrame({
        'threshold': thresholds,
        'count_above_threshold1': count_above_threshold1,
        'count_above_threshold2': count_above_threshold2
    })

    if save:
        cumulative_df.to_csv(path_data + f'/{var}_RoC_{label}.csv', index=False)

    # Plot the cumulative distribution                                                                                                                               
    plt.plot(cumulative_df['count_above_threshold1'], cumulative_df['count_above_threshold2'], label=label, c = color, linewidth=3.0)
    plt.yscale('log')
    plt.xscale('log')

    return cumulative_df

In [19]:
# Compute the Roc function  
plt.figure()
RoC(dfRFTriggers_noise, dfRFTriggers_inj, 'ps', 'Roc', r'$p_s$', thr = 0.0001, color = 'r', save = False)
RoC(dfRFTriggers_noise,dfRFTriggers_inj, 'amplitude', 'Roc', 'Ranking statistic', thr = 0.005, save = False, color= 'steelblue')
plt.legend()
plt.grid(True, which='both')
plt.ylabel('Nd (density)', fontsize=15)
plt.xlabel(r'$\alpha$', fontsize=15)
plt.savefig(savefig_path + 'RoC_O3a.png', dpi=300, bbox_inches='tight')
plt.close()
print(savefig_path)

/home/lorenzo.mobilia/PhD-Thesis/pastro-o3-mbta-triggers-paper/RF-codes/O3b-results-figs/HL_F_ER_M_Best/


In [20]:
# Now study the pAstro for events in the catalogue

events = dfEvents_confident#dfDataset_events_complete
events['label'] = 'event'

# fill in the added features (if you want to use them)
events['L_ER'] = np.sqrt(1 - events['L_ERw']) + 0.3
events['H_ER'] = np.sqrt(1 - events['H_ERw']) + 0.3
events['dphi'] = events['H_phase'] - events['L_phase']
events['dt'] = events['Hend_time'] - events['Lend_time']
events['dD'] = events['H_effDist'] - events['L_effDist']
events['m_tot'] = events['m1'] + events['m2']
events['s_eff'] = (events['m1']*events['s1z'] + events['m2']*events['s2z']) / events['m_tot']

y_evt = events['label'] 

In [21]:
X_event_test = events.drop(columns = ['Unnamed: 0','label','gps_time', 'mc', 'm_tot', 's_eff','V_snr',
                               'iFAR', 'amplitude',
                              'L_phase', 'H_phase', 'L_effDist', 'H_effDist', 'Lend_time', 'Hend_time','L_ERw', 'H_ERw',
                               'gps_time', 'commonName', 'GPS', 'pAstroMbta', 'GPS_int', 'gps_time_int'], axis = 1)


In [22]:
# Now apply the classifier
y_pred_prob_events = clf.predict_proba(X_event_test)
dfy_pred_prob_events = pd.DataFrame(y_pred_prob_events, columns = ['ps', 'pTerr'])

# Compactify the result
dfX_test_events = pd.DataFrame(X_event_test)
dfy_test_events = pd.DataFrame(y_evt)

test_dataset_events = pd.merge(dfX_test_events, dfy_test_events, left_index=True, right_index=True, how='inner')
dfRFTriggers_events= pd.merge(test_dataset_events, dfy_pred_prob_events, left_index=True, right_index=True, how='inner')

# fill in the dataframe
dfRFTriggers_events.index = dfEvents_confident.index #dfDataset_events_complete.index
dfRFTriggers_events['p_astro'] = dfEvents_confident['pAstroMbta']
dfRFTriggers_events['amplitude'] = dfEvents_confident['amplitude']
dfRFTriggers_events['iFAR'] = dfEvents_confident['iFAR']
dfRFTriggers_events['commonName'] = dfEvents_confident['commonName']

In [23]:
# Now compute the ps_lim for the events
dfRFTriggers_events['ps_lim'] = np.log(dfRFTriggers_events['ps'] / (1 - dfRFTriggers_events['ps']))

m1 = dfRFTriggers_events.loc[dfRFTriggers_events['ps_lim'] != np.inf, 'ps_lim'].max()
dfRFTriggers_events['ps_lim'] = dfRFTriggers_events['ps_lim'].replace([np.inf],m1)

# Compute pAstro for real Triggers

values = np.array(dfRFTriggers_events['ps_lim'])
ranks = values.reshape(-1, 1)
ranks_list = ranks.flatten().tolist()

log_score_event = kde_inj.score_samples(ranks)
score_event =  np.exp(log_score_event)


log_score_noise = kde_noise.score_samples(ranks)
score_noise = np.exp(log_score_noise)


Lambda1 = 36
#Lambda0 = len(noise)
print(len(score_event))
print(len(score_noise))
print(len(ranks_list))


pAstro_rw_signals =[]
for rank, score_s, score_n in zip(ranks, score_event, score_noise):
   S = Lambda1*score_s
   N = Lambda0*score_n
   value = S/(S + N)
   pAstro_rw_signals.append(value.tolist())


dfRFTriggers_events['pAstro_rw'] = pAstro_rw_signals
print(dfRFTriggers_events[['pAstro_rw', 'ps_lim']])

16
16
16
    pAstro_rw    ps_lim
0    0.999997  8.238434
1    0.999997  8.238434
2    0.126283  4.786646
3    0.999997  8.238434
4    0.870005  6.584791
5    0.999997  8.238434
6    0.999997  8.238434
7    0.016826  2.846170
8    0.707564  6.338005
9    0.127727  4.818663
10   0.999997  8.238434
11   0.999997  8.238434
12   0.000234 -0.149033
13   0.999997  8.238434
14   0.999597  7.613505
15   0.001238  1.111607


In [25]:
# Plot the final result

plt.figure()
plt.scatter(dfRFTriggers_events['amplitude'], dfRFTriggers_events['pAstro_rw'], label = r'Events - $p_\mathrm{astro}^{ps}$', s = 55)
plt.scatter(dfRFTriggers_events['amplitude'], dfRFTriggers_events['p_astro'], label = r'Events - $p_\mathrm{astro}$', color = 'r', marker='x', s = 55)
plt.legend()
plt.title(r'$p_\mathrm{astro}^{ps}$ - $p_\mathrm{astro}$ - ranking statistics O3b Events')
plt.xlabel('ranking statistics')
plt.ylabel(r'$p_\mathrm{astro}$')
plt.ylim([-0.1,1.1])
# Plot a horizontal line at y = 25
plt.axhline(y=0.5, color='g', linestyle='-.')
plt.savefig(savefig_path + 'pAstro_ps_amplitude_O3bEvents_hihglight.png', dpi=300, bbox_inches='tight')
plt.close()