In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from scipy.stats import percentileofscore

# Sample DataFrame (replace this with your actual data)
data = {
    'maturity_value': [1000, 2000, 1500, 1200, 1800],
    'category': ['S.A', 'I.T', 'D.A', 'T.H', 'J.K'],
    'currency': ['USD', 'GBP', 'USD', 'JPY', 'GBP'],
    'amount': [500, 800, 600, 700, 900]
}
data = pd.DataFrame(data)

# Encode categorical variables
label_encoders = {}
categorical_vars = ['category', 'currency']
for var in categorical_vars:
    le = LabelEncoder()
    data[var] = le.fit_transform(data[var])
    label_encoders[var] = le

# Parameters for nearest neighbor features
k_neighbors = 3  # Adjust the number of neighbors as needed

# List of variables for which to calculate features
variables_to_process = ['maturity_value', 'category', 'currency', 'amount']

df = data

# Loop through each variable and calculate features
for var in variables_to_process:
    
    # Calculate quantile-based features
    df[f'{var}_quantile_25'] = np.percentile(df[var], 25)
    df[f'{var}_quantile_75'] = np.percentile(df[var], 75)
    
    # Calculate weighted distance features
    df[f'{var}_weighted_distance'] = np.sum(np.sqrt((df[var] - df[var].mean())**2)) / df.shape[0]
    
    # Calculate anomaly score features (using example calculations)
    df[f'{var}_anomaly_score'] = (df[var] - df[var].mean()) / df[var].std()
    df[f'{var}_anomaly_score_rank'] = df[f'{var}_anomaly_score'].rank(ascending=False)
    

# Print the resulting DataFrame with added features
print(df)


   maturity_value  category  currency  amount  maturity_value_quantile_25  \
0            1000         3         2     500                      1200.0   
1            2000         1         0     800                      1200.0   
2            1500         0         2     600                      1200.0   
3            1200         4         1     700                      1200.0   
4            1800         2         0     900                      1200.0   

   maturity_value_quantile_75  maturity_value_weighted_distance  \
0                      1800.0                             320.0   
1                      1800.0                             320.0   
2                      1800.0                             320.0   
3                      1800.0                             320.0   
4                      1800.0                             320.0   

   maturity_value_anomaly_score  maturity_value_anomaly_score_rank  \
0                     -1.212678                                5

In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Assuming df is your dataset
data = df.values
variable_names = df.columns  # Get the names of the variables

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # You can adjust the explained variance threshold
data_reduced = pca.fit_transform(data_scaled)

# Get the selected features (principal components)
selected_features = data_reduced

# Get the names of the variables corresponding to the selected principal components
selected_variable_names = variable_names[:len(selected_features)]

# Print the selected variable names
print(selected_variable_names)


Index(['maturity_value', 'category', 'currency', 'amount',
       'maturity_value_quantile_25'],
      dtype='object')


In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from joblib import Parallel, delayed
import numpy as np

# Assuming X is your data and you want to find optimal contamination for Isolation Forest

# Initialize a list to store silhouette scores
silhouette_scores = []

# Contamination values to test
contamination_values = np.arange(0.01, 0.5, 0.01)

def calculate_silhouette_score(contamination, X):
    # Create and fit the Isolation Forest model
    model = IsolationForest(contamination=contamination, n_estimators=100, max_samples=0.5)
    model.fit(X)
    
    # Predict anomalies
    predictions = model.predict(X)
    
    # Calculate the silhouette score
    return silhouette_score(X, predictions)

# Use Parallel to calculate silhouette scores in parallel
num_jobs = -1  # Number of CPU cores to use, -1 uses all available cores
silhouette_scores = Parallel(n_jobs=num_jobs)(delayed(calculate_silhouette_score)(contamination, X) for contamination in contamination_values)

# Find the optimal contamination value that maximizes the silhouette score
optimal_contamination = contamination_values[np.argmax(silhouette_scores)]

print("Optimal Contamination:", optimal_contamination)


In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Assuming df is your dataset
data = df.values

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Define the architecture of the Autoencoder
input_dim = data_scaled.shape[1]
encoding_dim = 32  # You can adjust this based on your needs

input_layer = tf.keras.layers.Input(shape=(input_dim,))
encoder_layer = tf.keras.layers.Dense(encoding_dim, activation='relu')(input_layer)
decoder_layer = tf.keras.layers.Dense(input_dim, activation='sigmoid')(encoder_layer)

autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=decoder_layer)

# Compile the Autoencoder model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the Autoencoder using parallel processing
autoencoder.fit(data_scaled, data_scaled, epochs=5, batch_size=64, shuffle=True, use_multiprocessing=True, workers=-1)

# Use the trained Autoencoder for anomaly detection
encoded_data = autoencoder.predict(data_scaled)
mse = np.mean(np.power(data_scaled - encoded_data, 2), axis=1)

# Calculate the optimal threshold using quantiles
optimal_quantile = 95  # You can adjust this based on your needs
threshold = np.percentile(mse, optimal_quantile)

# Identify anomalies
anomalies = mse > threshold

# Add an "anomaly" column to the original DataFrame
df['anomaly'] = anomalies

# Print the DataFrame with the added "anomaly" column
print(df)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
   maturity_value  category  currency  amount  maturity_value_quantile_25  \
0            1000         3         2     500                      1200.0   
1            2000         1         0     800                      1200.0   
2            1500         0         2     600                      1200.0   
3            1200         4         1     700                      1200.0   
4            1800         2         0     900                      1200.0   

   maturity_value_quantile_75  maturity_value_weighted_distance  \
0                      1800.0                             320.0   
1                      1800.0                             320.0   
2                      1800.0                             320.0   
3                      1800.0                             320.0   
4                      1800.0                             320.0   

   maturity_value_anomaly_score  maturity_value_anomaly_score_rank  \
0             

In [12]:
df['anomaly'].value_counts()

False    4
True     1
Name: anomaly, dtype: int64