In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import argparse
# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
# NOTE(review): the IsolationForest fitted later is given its own random_state,
# so this global seed does not appear to drive the model - confirm it is still needed.
np.random.seed(111)

### File input

In [5]:
# input_path: CSV holding the beads x features table
# features = {'barcode','x','y'}
# The first CSV column becomes the row index (presumably the barcode - confirm).
input_path = 'coords.csv'
coords = pd.read_csv(filepath_or_buffer=input_path, index_col=0)

# Plain (n_beads, 2) array of x/y positions, used as the model's feature matrix.
xy_columns = ['x', 'y']
all_coords = np.array(coords.loc[:, xy_columns])

### Isolation forest protocol

In [None]:
# Isolation forest hyperparameters (handed to IsolationForest when the model is built)
# see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html for details
max_samples = 50_000             # number of samples drawn to train each tree
contamination = 4e-4             # expected proportion of outliers; sets the decision threshold
bootstrap = True                 # draw each tree's samples with replacement
n_jobs = -1                      # -1 = use all available processors
rng = np.random.RandomState(42)  # seeded generator passed to the estimator as random_state

In [None]:
# fit isolation forest model and predict outliers
#
# Give the estimator a private RandomState that starts from a copy of rng's
# current state. Passing rng itself lets fit() advance the shared generator,
# so re-running this cell without re-running the parameter cell would silently
# build a different forest and flag different beads. Copying the state keeps
# the first-run result unchanged and makes every re-run repeat it.
if isinstance(rng, np.random.RandomState):
    fit_rng = np.random.RandomState()
    fit_rng.set_state(rng.get_state())
else:
    # int seed or None: there is no shared generator state to protect
    fit_rng = rng

clf = IsolationForest(
    max_samples=max_samples,
    contamination=contamination,
    bootstrap=bootstrap,
    n_jobs=n_jobs,
    random_state=fit_rng,
)
clf.fit(all_coords)

# predict() labels every bead: -1 for outliers, 1 for inliers
y_pred = clf.predict(all_coords)

### Visualize anomaly predictions

In [None]:
# store outlier classification per bead (-1 labels outliers, 1 labels inliers)
coords['isolation_pred'] = y_pred

# split beads by predicted label
outliers = coords[coords['isolation_pred'] == -1]
inliers = coords[coords['isolation_pred'] == 1]

# visualize outliers/inliers across 2-D space
# Explicit Axes interface, with labels, a title and a legend so the figure can
# be read on its own (blue = inliers, red = outliers).
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(inliers['x'], inliers['y'], s=3, c='b',
           label=f'inliers (n={len(inliers)})')
# outliers are drawn second so they sit on top of the inlier cloud
ax.scatter(outliers['x'], outliers['y'], s=3, c='r',
           label=f'outliers (n={len(outliers)})')

# Fixed viewing window: beads with coordinates outside 0-6000 are not drawn.
# NOTE(review): widen these limits if the input coordinates can exceed them.
ax.set_xlim(0, 6000)
ax.set_ylim(0, 6000)

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Isolation forest outlier predictions')
# fixed legend location ('best' placement is slow for very large scatters);
# markerscale enlarges the s=3 dots so the legend colours are legible
ax.legend(loc='upper right', markerscale=4)
plt.show()

### File output

In [None]:
# output_path: destination CSV for the bead table, which now carries the
# 'isolation_pred' column alongside the original fields
output_path = 'outlier_predictions.csv'
# index=True (the pandas default) writes the row index as the first column
coords.to_csv(output_path, index=True)