## Install and Import Necessary Packages

In [4]:
# pip install pandas 
# pip install 
# pip install numpy 
# pip install scikit-learn 
# pip install scipy 
# pip install torch

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

## Loading of data

Read the cleaned dataset: final-dataset.parquet

In [12]:
file_path = 'final_dataset.parquet'

if not os.path.exists(file_path):
	display(f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory.")
	exit() 
else:
	try:
		df = pd.read_parquet(file_path)
		display(f"Successfully loaded '{file_path}'. Shape: {df.shape}")
		print(f"Number of rows (lines): {df.shape[0]}")
		display("First 5 rows of the dataset:")
		display(df.head())
	except Exception as e:
		display(f"Error loading Parquet file: {e}")
		exit() 

"Successfully loaded 'final_dataset.parquet'. Shape: (26622, 36)"

Number of rows (lines): 26622


'First 5 rows of the dataset:'

Unnamed: 0,user_id,name_review_user,time,rating,text,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,price,state,category_main,cat_American restaurant,cat_Coffee shop,cat_Department store,cat_Fast food restaurant,cat_Grocery store,cat_Hotel,cat_Mexican restaurant,cat_Other,cat_Pizza restaurant,cat_Restaurant,cat_Shopping mall,price_encoded,year,month,weekday,hour,text_len,log_num_reviews,rating_deviation,sentiment_polarity,sentiment_subjectivity,excessive_exclaim
0,107813727678662505612,Felicia Wall,1575939675467,5,Coffee is great and so is the service,0x56c67c7da439c267:0x72f12aacd98f7f2e,60.561418,-151.285291,[Cafe],4.6,72,,Open ⋅ Closes 3PM,Cafe,False,False,False,False,False,False,False,True,False,False,False,0,2019,12,1,1,37,4.290459,0.4,0.8,0.75,False
1,103473424444871906765,Garry Hanley,1602702499311,5,Easy in and out experience. Reasonable pricing for the convenience. We use them for most all our DMV needs.,0x56c899de8ee5d2a5:0xb15f3c48a1dc94f0,61.137792,-149.836917,"[Department of motor vehicles, Auto tag agency, State government office]",4.2,114,,Closed ⋅ Opens 9AM,Department of motor vehicles,False,False,False,False,False,False,False,True,False,False,False,0,2020,10,2,19,109,4.744932,0.8,0.377778,0.644444,False
2,104393976961042472712,Nathan Harrison,1559438084515,4,"Great lunch, service could be better as there wasn't many people there.",0x56c894263c293e31:0x39aa7cb841273597,61.228926,-149.741,"[Restaurant, Brewpub, Pizza restaurant, Sports bar]",3.9,648,$$,,Restaurant,False,False,False,False,False,False,False,False,False,True,False,2,2019,6,6,1,71,6.475433,0.1,0.6,0.583333,False
3,110829207026204377440,Terry Oakes,1578766259352,3,Small portions and do not give you what the menu lists.,0x51325b0542590ed1:0xaba794ec633521ef,64.821774,-147.713617,[Restaurant],4.2,125,,Closed ⋅ Opens 11AM Sun,Restaurant,False,False,False,False,False,False,False,False,False,True,False,0,2020,1,5,18,55,4.836282,-1.2,-0.25,0.4,False
4,108158723955787868825,Lee Petersen,1501725448031,4,"Nice gym, what I expect from a Planet Fitness. Bathrooms and locker rooms are clean. Good variety of machines. Sometimes a bit crowded, but not to...",0x51325acaa661fa4d:0xa0cf5a8e5d34cbde,64.83565,-147.74205,"[Gym, Fitness center, Personal trainer, Spa and health club, Tanning salon]",4.1,208,,Open 24 hours,Gym,False,False,False,False,False,False,False,True,False,False,False,0,2017,8,3,1,152,5.342334,-0.1,0.241667,0.741667,False


## Data reprocessing

Prepare the data for the autoencoder by defining a list of relevant features. Then, we use `MinMaxScaler` to normalise the data,and then convert it to PyTorch tensors and DataLoaders for efficient batch processing during training.

In [13]:
# Feature Selection 
features = [
	'rating',
	'text_len',
	'rating_deviation',
	'sentiment_polarity',
	'sentiment_subjectivity',
	'excessive_exclaim',
	'avg_rating',
	#'num_of_reviews',
	'log_num_reviews',
	# 'latitude',
	# 'longitude',
	'price_encoded',
	'year',
	'month',
	'weekday',
	'hour',
	'cat_American restaurant',
	'cat_Coffee shop',
	'cat_Department store',
	'cat_Fast food restaurant',
	'cat_Grocery store',
	'cat_Hotel',
	'cat_Mexican restaurant',
	'cat_Other',
	'cat_Pizza restaurant',
	'cat_Restaurant',
	'cat_Shopping mall'
]

# Check if all required features exist in the DataFrame
missing_features = [f for f in features if f not in df.columns]
if missing_features:
	display(f"Error: Missing required features in the dataset: {missing_features}")
	display("Please ensure your 'final_dataset.parquet' contains these columns.")
	features = [f for f in features if f in df.columns] 
	if not features:
		display("No valid features remaining. Exiting.")
		exit()
	else:
		display(f"Proceeding with available features: {features}")

data = df[features].copy()


# Data Preprocessing (Scaling)
scaler = MinMaxScaler()
scaled_data_np = scaler.fit_transform(data) 

# Convert to PyTorch tensors
scaled_data_tensor = torch.tensor(scaled_data_np, dtype=torch.float32)

# Split data into training and validation sets
X_train_tensor, X_val_tensor = train_test_split(scaled_data_tensor, test_size=0.2, random_state=42)

# Create DataLoader for batching
batch_size = 32
train_dataset = TensorDataset(X_train_tensor, X_train_tensor) 
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, X_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


## Autoencoder Model Definition (PyTorch)

Define the Autoencoder model using PyTorch's `nn.Module`.

The autoencoder will perform a reconstruction task, mapping an input vector x to an output vector x'. The core principle of anomaly detection with autoencoders is that the model learns to minimise the reconstruction loss (e.g., Mean Squared Error) for the normal data on which it is trained.

Dimensionality of the latent space chosen is 10. This means the model's encoder component is configured to compress the input data—which has 24 features—into a dense, 10-dimensional vector.

In [14]:
# Autoencoder Model Definition (PyTorch)
class Autoencoder(nn.Module):
	def __init__(self, input_dim, encoding_dim):
		super(Autoencoder, self).__init__()
		self.encoder = nn.Sequential(
			nn.Linear(input_dim, encoding_dim * 2),
			nn.LeakyReLU(0.01),  
			nn.Linear(encoding_dim * 2, encoding_dim),
			nn.LeakyReLU(0.01)   
		)
		self.decoder = nn.Sequential(
			nn.Linear(encoding_dim, encoding_dim * 2),
			nn.LeakyReLU(0.01),  
			nn.Linear(encoding_dim * 2, input_dim),
			nn.Sigmoid()
		)
		
	def forward(self, x):
		encoded = self.encoder(x)
		decoded = self.decoder(encoded)
		return decoded

input_dim = X_train_tensor.shape[1]
# Dimensionality of the latent space
encoding_dim = 10 

model = Autoencoder(input_dim, encoding_dim)
display("\nAutoencoder Model Summary (PyTorch):")
display(model) 

# Define Loss function and Optimiser
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

'\nAutoencoder Model Summary (PyTorch):'

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=24, out_features=20, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=20, out_features=10, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
  )
  (decoder): Sequential(
    (0): Linear(in_features=10, out_features=20, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=20, out_features=24, bias=True)
    (3): Sigmoid()
  )
)

## Training the Autoencoder


Train the model by iterating over the dataset for a number of epochs. In each epoch, perform a forward pass to get the reconstructed output, calculate the mean squared error (MSE) as our loss, and then backpropagate the loss to update the model weights.

A validation set is used to monitor the model's performance on unseen data and ensure it's not overfitting.


In [15]:
# Training the Autoencoder
epochs = 50
display(f"\nTraining Autoencoder for {epochs} epochs...")

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
display(f"Using device: {device}")

for epoch in range(epochs):
	model.train() 
	running_loss = 0.0
	for data_batch, _ in train_loader:
		data_batch = data_batch.to(device)

		optimizer.zero_grad() 
		outputs = model(data_batch) 
		loss = criterion(outputs, data_batch) 
		loss.backward() 
		optimizer.step() 

		running_loss += loss.item() * data_batch.size(0)

	train_loss = running_loss / len(train_dataset)

	# Validation phase
	model.eval() 
	val_loss = 0.0
	with torch.no_grad(): 
		for data_batch_val, _ in val_loader:
			data_batch_val = data_batch_val.to(device)
			outputs_val = model(data_batch_val)
			loss_val = criterion(outputs_val, data_batch_val)
			val_loss += loss_val.item() * data_batch_val.size(0)
	val_loss = val_loss / len(val_dataset)

	display(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

display("\nTraining complete.")

'\nTraining Autoencoder for 50 epochs...'

'Using device: cpu'

'Epoch 1/50, Train Loss: 0.0572, Val Loss: 0.0351'

'Epoch 2/50, Train Loss: 0.0266, Val Loss: 0.0186'

'Epoch 3/50, Train Loss: 0.0162, Val Loss: 0.0136'

'Epoch 4/50, Train Loss: 0.0123, Val Loss: 0.0109'

'Epoch 5/50, Train Loss: 0.0104, Val Loss: 0.0094'

'Epoch 6/50, Train Loss: 0.0092, Val Loss: 0.0082'

'Epoch 7/50, Train Loss: 0.0078, Val Loss: 0.0070'

'Epoch 8/50, Train Loss: 0.0068, Val Loss: 0.0062'

'Epoch 9/50, Train Loss: 0.0062, Val Loss: 0.0057'

'Epoch 10/50, Train Loss: 0.0057, Val Loss: 0.0055'

'Epoch 11/50, Train Loss: 0.0054, Val Loss: 0.0051'

'Epoch 12/50, Train Loss: 0.0050, Val Loss: 0.0047'

'Epoch 13/50, Train Loss: 0.0046, Val Loss: 0.0044'

'Epoch 14/50, Train Loss: 0.0044, Val Loss: 0.0042'

'Epoch 15/50, Train Loss: 0.0042, Val Loss: 0.0040'

'Epoch 16/50, Train Loss: 0.0041, Val Loss: 0.0038'

'Epoch 17/50, Train Loss: 0.0040, Val Loss: 0.0038'

'Epoch 18/50, Train Loss: 0.0039, Val Loss: 0.0036'

'Epoch 19/50, Train Loss: 0.0038, Val Loss: 0.0036'

'Epoch 20/50, Train Loss: 0.0037, Val Loss: 0.0036'

'Epoch 21/50, Train Loss: 0.0037, Val Loss: 0.0035'

'Epoch 22/50, Train Loss: 0.0036, Val Loss: 0.0035'

'Epoch 23/50, Train Loss: 0.0036, Val Loss: 0.0034'

'Epoch 24/50, Train Loss: 0.0035, Val Loss: 0.0033'

'Epoch 25/50, Train Loss: 0.0034, Val Loss: 0.0033'

'Epoch 26/50, Train Loss: 0.0034, Val Loss: 0.0032'

'Epoch 27/50, Train Loss: 0.0033, Val Loss: 0.0032'

'Epoch 28/50, Train Loss: 0.0033, Val Loss: 0.0031'

'Epoch 29/50, Train Loss: 0.0032, Val Loss: 0.0030'

'Epoch 30/50, Train Loss: 0.0031, Val Loss: 0.0030'

'Epoch 31/50, Train Loss: 0.0031, Val Loss: 0.0029'

'Epoch 32/50, Train Loss: 0.0030, Val Loss: 0.0029'

'Epoch 33/50, Train Loss: 0.0030, Val Loss: 0.0029'

'Epoch 34/50, Train Loss: 0.0029, Val Loss: 0.0028'

'Epoch 35/50, Train Loss: 0.0029, Val Loss: 0.0028'

'Epoch 36/50, Train Loss: 0.0029, Val Loss: 0.0028'

'Epoch 37/50, Train Loss: 0.0028, Val Loss: 0.0028'

'Epoch 38/50, Train Loss: 0.0028, Val Loss: 0.0027'

'Epoch 39/50, Train Loss: 0.0028, Val Loss: 0.0027'

'Epoch 40/50, Train Loss: 0.0027, Val Loss: 0.0026'

'Epoch 41/50, Train Loss: 0.0027, Val Loss: 0.0026'

'Epoch 42/50, Train Loss: 0.0027, Val Loss: 0.0026'

'Epoch 43/50, Train Loss: 0.0026, Val Loss: 0.0026'

'Epoch 44/50, Train Loss: 0.0026, Val Loss: 0.0025'

'Epoch 45/50, Train Loss: 0.0026, Val Loss: 0.0025'

'Epoch 46/50, Train Loss: 0.0026, Val Loss: 0.0025'

'Epoch 47/50, Train Loss: 0.0025, Val Loss: 0.0025'

'Epoch 48/50, Train Loss: 0.0025, Val Loss: 0.0025'

'Epoch 49/50, Train Loss: 0.0025, Val Loss: 0.0025'

'Epoch 50/50, Train Loss: 0.0025, Val Loss: 0.0024'

'\nTraining complete.'

## 6. Anomaly Detection using Reconstruction Error

fter training, we use the autoencoder to reconstruct the entire dataset. Data points that are difficult for the model to reconstruct will have a high reconstruction error (high MSE). We then use a Z-score to standardize these errors, allowing us to identify statistical outliers as potential anomalies. A threshold (a Z-score of 2.0) is used to flag these anomalies. 

Data points with a Z-score greater than 2.0 are flagged as potential anomalies.

A binary "is_outlier" column is also added to the dataset (-1 if anomaly, 1 if non-anomalyl).

Examples of anomalies are shown.

In [None]:
# Calculate Reconstruction Error (PyTorch) 
model.eval() 
all_reconstructions = []
with torch.no_grad():
	for i in range(0, len(scaled_data_tensor), batch_size):
		batch = scaled_data_tensor[i:i + batch_size].to(device)
		reconstructed_batch = model(batch)
		all_reconstructions.append(reconstructed_batch.cpu().numpy())

reconstructions_np = np.vstack(all_reconstructions)

# Calculate Mean Squared Error (MSE) between original and reconstructed data
reconstruction_errors = np.mean(np.square(scaled_data_np - reconstructions_np), axis=1)

# Z-scoring the Reconstruction Error
z_scored_reconstruction_errors = zscore(reconstruction_errors)

# Add the z-scored errors to the original dataframe
df['ae_reconstruction_error_zscore'] = z_scored_reconstruction_errors

display("\nReconstruction errors calculated and z-scored.")
display("First 10 reviews with their z-scored autoencoder reconstruction error:")
display(df[['ae_reconstruction_error_zscore'] + features].head(10))

# Identify potential anomalies
threshold = 2.0

# Add binary "is_outlier" column (1 if anomaly, 0 if normal)
df['is_outlier'] = np.where(
    df['ae_reconstruction_error_zscore'] > threshold,
    -1,   # outlier
    1     # normal
)
print(df['is_outlier'].value_counts())

anomalies = df[df['ae_reconstruction_error_zscore'] > threshold]
display(f"\nNumber of potential anomalies (z-score > {threshold}): {len(anomalies)}")

# Display more anomalies and all of their original features (For viewing purposes)
if not anomalies.empty:
	print("\nDisplaying the top 20 anomalies with all their features (sorted by anomaly score):")

	pd.set_option('display.max_rows', 50)       
	pd.set_option('display.max_columns', None)   
	pd.set_option('display.width', 1000)          
	pd.set_option('display.max_colwidth', 150)    

	# Sort by the anomaly score to see the most anomalous first and show the top 20
	print(anomalies.sort_values(by='ae_reconstruction_error_zscore', ascending=False).head(20))

else:
	print("\nNo anomalies detected above the specified threshold.")

'\nReconstruction errors calculated and z-scored.'

'First 10 reviews with their z-scored autoencoder reconstruction error:'

Unnamed: 0,ae_reconstruction_error_zscore,rating,text_len,rating_deviation,sentiment_polarity,sentiment_subjectivity,excessive_exclaim,avg_rating,log_num_reviews,price_encoded,year,month,weekday,hour,cat_American restaurant,cat_Coffee shop,cat_Department store,cat_Fast food restaurant,cat_Grocery store,cat_Hotel,cat_Mexican restaurant,cat_Other,cat_Pizza restaurant,cat_Restaurant,cat_Shopping mall
0,-0.42753,5,37,0.4,0.8,0.75,False,4.6,4.290459,0,2019,12,1,1,False,False,False,False,False,False,False,True,False,False,False
1,-0.34777,5,109,0.8,0.377778,0.644444,False,4.2,4.744932,0,2020,10,2,19,False,False,False,False,False,False,False,True,False,False,False
2,-0.271739,4,71,0.1,0.6,0.583333,False,3.9,6.475433,2,2019,6,6,1,False,False,False,False,False,False,False,False,False,True,False
3,0.110246,3,55,-1.2,-0.25,0.4,False,4.2,4.836282,0,2020,1,5,18,False,False,False,False,False,False,False,False,False,True,False
4,-0.498539,4,152,-0.1,0.241667,0.741667,False,4.1,5.342334,0,2017,8,3,1,False,False,False,False,False,False,False,True,False,False,False
5,0.242406,5,123,0.8,0.25,0.8125,False,4.2,5.796058,2,2019,7,2,3,False,False,False,False,False,False,True,False,False,False,False
6,1.246798,1,64,-2.7,-1.0,0.9,False,3.7,6.577861,1,2019,6,4,6,False,False,False,True,False,False,False,False,False,False,False
7,0.34197,5,27,0.3,1.0,0.3,False,4.7,6.023448,0,2019,1,0,4,False,False,False,False,False,False,False,True,False,False,False
8,0.381516,1,137,-3.5,0.3,0.3,False,4.5,4.934474,0,2019,5,6,20,False,False,False,False,False,False,False,True,False,False,False
9,0.163762,5,432,0.3,0.1375,0.617361,False,4.7,6.306275,2,2017,3,0,4,False,False,False,False,False,False,False,True,False,False,False


'\nNumber of potential anomalies (z-score > 2.0): 498'


Displaying the top 20 anomalies with all their features (sorted by anomaly score):
                     user_id   name_review_user           time  rating                                                                                                                                                   text                                gmap_id   latitude   longitude                                                                                                                                               category  avg_rating  num_of_reviews price                    state         category_main  cat_American restaurant  cat_Coffee shop  cat_Department store  cat_Fast food restaurant  cat_Grocery store  cat_Hotel  cat_Mexican restaurant  cat_Other  cat_Pizza restaurant  cat_Restaurant  cat_Shopping mall  price_encoded  year  month  weekday  hour  text_len  log_num_reviews  rating_deviation  sentiment_polarity  sentiment_subjectivity  excessive_exclaim  ae_reconstruction_error_zscore
3759 

## Investigating the Latent Space

To better understand what the autoencoder has learned, we can inspect the latent space. 

Extract the encoded representation for each data point and create a new dataframe.

Then, calculate the correlation between these latent dimensions and the original features. This helped us interpret which original features are most strongly represented in the compressed dimensions of the latent space.

In [17]:
# Investigating the Latent Space
print("\n--- Investigating the Latent Space ---")

model.eval()
with torch.no_grad():
	encoded_data_tensor = model.encoder(scaled_data_tensor.to(device))
	encoded_data_np = encoded_data_tensor.cpu().numpy()

# Create a DataFrame for the encoded dimensions
encoded_df = pd.DataFrame(encoded_data_np, columns=[f'latent_dim_{i+1}' for i in range(encoding_dim)])

# Combine with the original (unscaled) features to make interpretation easier
combined_df = pd.concat([df[features].reset_index(drop=True), encoded_df], axis=1)

# Calculate the correlation matrix
correlation_matrix = combined_df.corr()

# Get the correlation of each latent dimension with the original features
latent_correlations = correlation_matrix[encoded_df.columns].loc[features]

print("\nCorrelation of Latent Dimensions with Original Features:")
print(latent_correlations)
print("\nStandard Deviation of each Latent Dimension:")
print(encoded_df.std())

df.to_parquet('dataset_with_anomaly_scores.parquet')
df.to_csv('dataset_with_anomaly_scores.csv', index=False)


--- Investigating the Latent Space ---

Correlation of Latent Dimensions with Original Features:
                          latent_dim_1  latent_dim_2  latent_dim_3  latent_dim_4  latent_dim_5  latent_dim_6  latent_dim_7  latent_dim_8  latent_dim_9  latent_dim_10
rating                       -0.374856      0.003880     -0.051809      0.264355      0.372527      0.231474      0.120132      0.085328      0.101295       0.145924
text_len                      0.065590     -0.014201     -0.053463     -0.062461     -0.083969     -0.093611     -0.073897      0.012267      0.022774      -0.000820
rating_deviation             -0.342686      0.010688      0.039263      0.235341      0.350311      0.281148      0.157484      0.130731      0.115944       0.028271
sentiment_polarity           -0.178580      0.080542     -0.032236      0.260987      0.360033      0.281887      0.190974     -0.290620     -0.322829       0.116020
sentiment_subjectivity        0.059121      0.137678      0.037369      