In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Load Data
file_path = 'final_dataset.parquet'

# Abort by raising instead of calling exit(): an exception stops the cell at
# the point of failure, so no later statement runs against an undefined `df`.
if not os.path.exists(file_path):
	raise FileNotFoundError(
		f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory."
	)

try:
	df = pd.read_parquet(file_path)
except Exception as e:
	# Re-raise with a readable message; the original error is chained as the cause.
	raise RuntimeError(f"Error loading Parquet file: {e}") from e

# Plain-text messages go through print(): display() on a str shows its repr
# (surrounding quotes, escape sequences rendered literally).
print(f"Successfully loaded '{file_path}'. Shape: {df.shape}")
print(f"Number of rows (lines): {df.shape[0]}")
print("First 5 rows of the dataset:")
# The DataFrame itself keeps display() so it renders as a rich table.
display(df.head())

# Feature Selection
# Columns fed to the autoencoder. Commented-out entries are deliberately excluded.
features = [
	'rating',
	'text_len',
	'rating_deviation',
	'sentiment_polarity',
	'sentiment_subjectivity',
	'excessive_exclaim',
	'avg_rating',
	#'num_of_reviews',
	'log_num_reviews',
	# 'latitude',
	# 'longitude',
	'price_encoded',
	'year',
	'month',
	'weekday',
	'hour',
	'cat_American restaurant',
	'cat_Coffee shop',
	'cat_Department store',
	'cat_Fast food restaurant',
	'cat_Grocery store',
	'cat_Hotel',
	'cat_Mexican restaurant',
	'cat_Other',
	'cat_Pizza restaurant',
	'cat_Restaurant',
	'cat_Shopping mall'
]

# Check if all required features exist in the DataFrame
missing_features = [f for f in features if f not in df.columns]
if missing_features:
	# print() rather than display(): display() on a str shows its quoted repr.
	print(f"Error: Missing required features in the dataset: {missing_features}")
	print("Please ensure your 'final_dataset.parquet' contains these columns.")
	# Best-effort fallback: continue with whichever requested columns are present.
	features = [f for f in features if f in df.columns]
	if not features:
		# Raise instead of exit() so execution stops here with a clear error.
		raise ValueError("No valid features remaining.")
	print(f"Proceeding with available features: {features}")

# Work on a copy so later preprocessing never touches the loaded frame.
data = df[features].copy()


# Reproducibility: seed PyTorch's global RNG before any model or DataLoader is
# created. By default that RNG drives weight initialisation and the shuffling
# of the training DataLoader, so without a seed every run gives different
# losses and anomaly scores even though the train/validation split is fixed.
SEED = 42
torch.manual_seed(SEED)

# Data Preprocessing (Scaling)
# MinMaxScaler (default range) maps each feature to [0, 1], which is the range
# the decoder's final Sigmoid can reproduce.
# NOTE(review): the scaler is fit on all rows before the split, so validation
# rows contribute to the per-feature min/max. Confirm this is intended.
scaler = MinMaxScaler()
scaled_data_np = scaler.fit_transform(data)

# Convert to PyTorch tensors
scaled_data_tensor = torch.tensor(scaled_data_np, dtype=torch.float32)

# Split data into training and validation sets
X_train_tensor, X_val_tensor = train_test_split(
	scaled_data_tensor, test_size=0.2, random_state=SEED
)

# Create DataLoader for batching
batch_size = 32
# An autoencoder's target is its own input, hence the (X, X) pairs.
train_dataset = TensorDataset(X_train_tensor, X_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, X_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Autoencoder Model Definition (PyTorch)
class Autoencoder(nn.Module):
	"""Fully connected autoencoder with one hidden layer on each side of the bottleneck.

	Layer widths run
	input_dim -> 2 * encoding_dim -> encoding_dim -> 2 * encoding_dim -> input_dim.
	Every Linear layer is followed by LeakyReLU(0.01), except the last one,
	which is followed by a Sigmoid.

	Args:
		input_dim: Number of features in each input vector.
		encoding_dim: Width of the latent (bottleneck) layer.
	"""

	def __init__(self, input_dim, encoding_dim):
		super().__init__()
		hidden_dim = encoding_dim * 2
		negative_slope = 0.01

		# Encoder layers are built before decoder layers so parameters are
		# created in the order encoder -> decoder.
		compress_layers = [
			nn.Linear(input_dim, hidden_dim),
			nn.LeakyReLU(negative_slope),
			nn.Linear(hidden_dim, encoding_dim),
			nn.LeakyReLU(negative_slope),
		]
		expand_layers = [
			nn.Linear(encoding_dim, hidden_dim),
			nn.LeakyReLU(negative_slope),
			nn.Linear(hidden_dim, input_dim),
			nn.Sigmoid(),
		]

		self.encoder = nn.Sequential(*compress_layers)
		self.decoder = nn.Sequential(*expand_layers)

	def forward(self, x):
		"""Encode `x` to the latent space and return its reconstruction."""
		return self.decoder(self.encoder(x))

# Number of input features, taken from the training tensor's second dimension.
input_dim = X_train_tensor.shape[1]
# Dimensionality of the latent space
encoding_dim = 10

model = Autoencoder(input_dim, encoding_dim)
# print() for the heading: display() on a str shows its repr, which wraps the
# text in quotes and renders the leading "\n" literally instead of as a blank line.
print("\nAutoencoder Model Summary (PyTorch):")
display(model)

# Define Loss function and Optimiser
# Mean squared error between the reconstruction and the input.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the Autoencoder
epochs = 50
# Status lines use print(): display() on a str shows its quoted repr with
# escape sequences such as "\n" rendered literally.
print(f"\nTraining Autoencoder for {epochs} epochs...")

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

for epoch in range(epochs):
	# Training phase
	model.train()
	running_loss = 0.0
	for data_batch, _ in train_loader:
		data_batch = data_batch.to(device)

		optimizer.zero_grad()
		outputs = model(data_batch)
		# The input is also the target: the model learns to reconstruct it.
		loss = criterion(outputs, data_batch)
		loss.backward()
		optimizer.step()

		# The loss is a per-batch mean; weighting by batch size keeps the epoch
		# figure a per-sample average even when the last batch is smaller.
		running_loss += loss.item() * data_batch.size(0)

	train_loss = running_loss / len(train_dataset)

	# Validation phase
	model.eval()
	val_loss = 0.0
	with torch.no_grad():
		for data_batch_val, _ in val_loader:
			data_batch_val = data_batch_val.to(device)
			outputs_val = model(data_batch_val)
			loss_val = criterion(outputs_val, data_batch_val)
			val_loss += loss_val.item() * data_batch_val.size(0)
	val_loss = val_loss / len(val_dataset)

	print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

print("\nTraining complete.")

# Calculate Reconstruction Error (PyTorch)
# Every row (training and validation alike) is passed through the trained
# model in batches, with gradients disabled.
model.eval()
all_reconstructions = []
with torch.no_grad():
	for i in range(0, len(scaled_data_tensor), batch_size):
		batch = scaled_data_tensor[i:i + batch_size].to(device)
		reconstructed_batch = model(batch)
		all_reconstructions.append(reconstructed_batch.cpu().numpy())

# Batches were processed in row order, so stacking them restores row alignment.
reconstructions_np = np.vstack(all_reconstructions)

# Calculate Mean Squared Error (MSE) between original and reconstructed data
# axis=1 averages over features, giving one error value per row.
reconstruction_errors = np.mean(np.square(scaled_data_np - reconstructions_np), axis=1)

# Z-scoring the Reconstruction Error
z_scored_reconstruction_errors = zscore(reconstruction_errors)

# Add the z-scored errors to the original dataframe
df['ae_reconstruction_error_zscore'] = z_scored_reconstruction_errors

# Text messages use print(): display() on a str shows its quoted repr with the
# "\n" rendered literally. The DataFrame keeps display() for table rendering.
print("\nReconstruction errors calculated and z-scored.")
print("First 10 reviews with their z-scored autoencoder reconstruction error:")
display(df[['ae_reconstruction_error_zscore'] + features].head(10))

# Identify potential anomalies
# Rows whose z-scored reconstruction error exceeds the threshold are flagged.
threshold = 2.0
anomalies = df[df['ae_reconstruction_error_zscore'] > threshold]
# print() rather than display(): display() on a str shows its quoted repr with
# the leading "\n" rendered literally.
print(f"\nNumber of potential anomalies (z-score > {threshold}): {len(anomalies)}")

# Display more anomalies and all of their original features (For viewing purposes)
if not anomalies.empty:
	print("\nDisplaying the top 20 anomalies with all their features (sorted by anomaly score):")

	# These options are set globally, so they also affect later printed frames.
	pd.set_option('display.max_rows', 50)
	pd.set_option('display.max_columns', None)
	pd.set_option('display.width', 1000)
	pd.set_option('display.max_colwidth', 150)

	# Sort by the anomaly score to see the most anomalous first and show the top 20
	print(anomalies.sort_values(by='ae_reconstruction_error_zscore', ascending=False).head(20))

else:
	print("\nNo anomalies detected above the specified threshold.")

# Investigating the Latent Space
print("\n--- Investigating the Latent Space ---")

# Pass the whole scaled dataset through the encoder half only, with gradients disabled.
model.eval()
with torch.no_grad():
	encoded_data_tensor = model.encoder(scaled_data_tensor.to(device))
encoded_data_np = encoded_data_tensor.cpu().numpy()

# One named column per latent dimension (1-based).
latent_column_names = [f'latent_dim_{k}' for k in range(1, encoding_dim + 1)]
encoded_df = pd.DataFrame(encoded_data_np, columns=latent_column_names)

# Put the unscaled features beside the latent codes; the index is reset so the
# two frames line up by position.
original_features_df = df[features].reset_index(drop=True)
combined_df = pd.concat([original_features_df, encoded_df], axis=1)

# Full correlation matrix, then the feature-by-latent-dimension sub-block.
correlation_matrix = combined_df.corr()
latent_correlations = correlation_matrix.loc[features, latent_column_names]

print("\nCorrelation of Latent Dimensions with Original Features:")
print(latent_correlations)
print("\nStandard Deviation of each Latent Dimension:")
print(encoded_df.std())

"Successfully loaded 'final_dataset.parquet'. Shape: (26622, 36)"

Number of rows (lines): 26622


'First 5 rows of the dataset:'

Unnamed: 0,user_id,name_review_user,time,rating,text,gmap_id,latitude,longitude,category,avg_rating,...,year,month,weekday,hour,text_len,log_num_reviews,rating_deviation,sentiment_polarity,sentiment_subjectivity,excessive_exclaim
0,107813727678662505612,Felicia Wall,1575939675467,5,Coffee is great and so is the service,0x56c67c7da439c267:0x72f12aacd98f7f2e,60.561418,-151.285291,[Cafe],4.6,...,2019,12,1,1,37,4.290459,0.4,0.8,0.75,False
1,103473424444871906765,Garry Hanley,1602702499311,5,Easy in and out experience. Reasonable pricin...,0x56c899de8ee5d2a5:0xb15f3c48a1dc94f0,61.137792,-149.836917,"[Department of motor vehicles, Auto tag agency...",4.2,...,2020,10,2,19,109,4.744932,0.8,0.377778,0.644444,False
2,104393976961042472712,Nathan Harrison,1559438084515,4,"Great lunch, service could be better as there ...",0x56c894263c293e31:0x39aa7cb841273597,61.228926,-149.741,"[Restaurant, Brewpub, Pizza restaurant, Sports...",3.9,...,2019,6,6,1,71,6.475433,0.1,0.6,0.583333,False
3,110829207026204377440,Terry Oakes,1578766259352,3,Small portions and do not give you what the me...,0x51325b0542590ed1:0xaba794ec633521ef,64.821774,-147.713617,[Restaurant],4.2,...,2020,1,5,18,55,4.836282,-1.2,-0.25,0.4,False
4,108158723955787868825,Lee Petersen,1501725448031,4,"Nice gym, what I expect from a Planet Fitness....",0x51325acaa661fa4d:0xa0cf5a8e5d34cbde,64.83565,-147.74205,"[Gym, Fitness center, Personal trainer, Spa an...",4.1,...,2017,8,3,1,152,5.342334,-0.1,0.241667,0.741667,False


'\nAutoencoder Model Summary (PyTorch):'

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=24, out_features=20, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=20, out_features=10, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
  )
  (decoder): Sequential(
    (0): Linear(in_features=10, out_features=20, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=20, out_features=24, bias=True)
    (3): Sigmoid()
  )
)

'\nTraining Autoencoder for 50 epochs...'

'Using device: cpu'

'Epoch 1/50, Train Loss: 0.0554, Val Loss: 0.0372'

'Epoch 2/50, Train Loss: 0.0303, Val Loss: 0.0221'

'Epoch 3/50, Train Loss: 0.0201, Val Loss: 0.0170'

'Epoch 4/50, Train Loss: 0.0156, Val Loss: 0.0137'

'Epoch 5/50, Train Loss: 0.0127, Val Loss: 0.0108'

'Epoch 6/50, Train Loss: 0.0098, Val Loss: 0.0088'

'Epoch 7/50, Train Loss: 0.0084, Val Loss: 0.0077'

'Epoch 8/50, Train Loss: 0.0074, Val Loss: 0.0072'

'Epoch 9/50, Train Loss: 0.0070, Val Loss: 0.0068'

'Epoch 10/50, Train Loss: 0.0067, Val Loss: 0.0064'

'Epoch 11/50, Train Loss: 0.0064, Val Loss: 0.0062'

'Epoch 12/50, Train Loss: 0.0062, Val Loss: 0.0060'

'Epoch 13/50, Train Loss: 0.0060, Val Loss: 0.0058'

'Epoch 14/50, Train Loss: 0.0059, Val Loss: 0.0057'

'Epoch 15/50, Train Loss: 0.0058, Val Loss: 0.0056'

'Epoch 16/50, Train Loss: 0.0057, Val Loss: 0.0054'

'Epoch 17/50, Train Loss: 0.0056, Val Loss: 0.0053'

'Epoch 18/50, Train Loss: 0.0055, Val Loss: 0.0051'

'Epoch 19/50, Train Loss: 0.0053, Val Loss: 0.0051'

'Epoch 20/50, Train Loss: 0.0052, Val Loss: 0.0049'

'Epoch 21/50, Train Loss: 0.0050, Val Loss: 0.0045'

'Epoch 22/50, Train Loss: 0.0046, Val Loss: 0.0042'

'Epoch 23/50, Train Loss: 0.0044, Val Loss: 0.0042'

'Epoch 24/50, Train Loss: 0.0043, Val Loss: 0.0041'

'Epoch 25/50, Train Loss: 0.0042, Val Loss: 0.0040'

'Epoch 26/50, Train Loss: 0.0042, Val Loss: 0.0040'

'Epoch 27/50, Train Loss: 0.0042, Val Loss: 0.0039'

'Epoch 28/50, Train Loss: 0.0041, Val Loss: 0.0039'

'Epoch 29/50, Train Loss: 0.0041, Val Loss: 0.0039'

'Epoch 30/50, Train Loss: 0.0041, Val Loss: 0.0039'

'Epoch 31/50, Train Loss: 0.0041, Val Loss: 0.0039'

'Epoch 32/50, Train Loss: 0.0040, Val Loss: 0.0037'

'Epoch 33/50, Train Loss: 0.0038, Val Loss: 0.0035'

'Epoch 34/50, Train Loss: 0.0036, Val Loss: 0.0034'

'Epoch 35/50, Train Loss: 0.0035, Val Loss: 0.0033'

'Epoch 36/50, Train Loss: 0.0034, Val Loss: 0.0032'

'Epoch 37/50, Train Loss: 0.0034, Val Loss: 0.0032'

'Epoch 38/50, Train Loss: 0.0033, Val Loss: 0.0031'

'Epoch 39/50, Train Loss: 0.0032, Val Loss: 0.0031'

'Epoch 40/50, Train Loss: 0.0032, Val Loss: 0.0030'

'Epoch 41/50, Train Loss: 0.0031, Val Loss: 0.0030'

'Epoch 42/50, Train Loss: 0.0031, Val Loss: 0.0029'

'Epoch 43/50, Train Loss: 0.0031, Val Loss: 0.0029'

'Epoch 44/50, Train Loss: 0.0030, Val Loss: 0.0029'

'Epoch 45/50, Train Loss: 0.0030, Val Loss: 0.0028'

'Epoch 46/50, Train Loss: 0.0030, Val Loss: 0.0028'

'Epoch 47/50, Train Loss: 0.0030, Val Loss: 0.0028'

'Epoch 48/50, Train Loss: 0.0029, Val Loss: 0.0028'

'Epoch 49/50, Train Loss: 0.0029, Val Loss: 0.0028'

'Epoch 50/50, Train Loss: 0.0029, Val Loss: 0.0028'

'\nTraining complete.'

'\nReconstruction errors calculated and z-scored.'

'First 10 reviews with their z-scored autoencoder reconstruction error:'

Unnamed: 0,ae_reconstruction_error_zscore,rating,text_len,rating_deviation,sentiment_polarity,sentiment_subjectivity,excessive_exclaim,avg_rating,log_num_reviews,price_encoded,...,cat_Coffee shop,cat_Department store,cat_Fast food restaurant,cat_Grocery store,cat_Hotel,cat_Mexican restaurant,cat_Other,cat_Pizza restaurant,cat_Restaurant,cat_Shopping mall
0,-0.3968,5,37,0.4,0.8,0.75,False,4.6,4.290459,0,...,False,False,False,False,False,False,True,False,False,False
1,-0.351315,5,109,0.8,0.377778,0.644444,False,4.2,4.744932,0,...,False,False,False,False,False,False,True,False,False,False
2,6.648789,4,71,0.1,0.6,0.583333,False,3.9,6.475433,2,...,False,False,False,False,False,False,False,False,True,False
3,0.09569,3,55,-1.2,-0.25,0.4,False,4.2,4.836282,0,...,False,False,False,False,False,False,False,False,True,False
4,-0.399781,4,152,-0.1,0.241667,0.741667,False,4.1,5.342334,0,...,False,False,False,False,False,False,True,False,False,False
5,-0.164909,5,123,0.8,0.25,0.8125,False,4.2,5.796058,2,...,False,False,False,False,False,True,False,False,False,False
6,0.272322,1,64,-2.7,-1.0,0.9,False,3.7,6.577861,1,...,False,False,True,False,False,False,False,False,False,False
7,-0.298887,5,27,0.3,1.0,0.3,False,4.7,6.023448,0,...,False,False,False,False,False,False,True,False,False,False
8,-0.390165,1,137,-3.5,0.3,0.3,False,4.5,4.934474,0,...,False,False,False,False,False,False,True,False,False,False
9,-0.024678,5,432,0.3,0.1375,0.617361,False,4.7,6.306275,2,...,False,False,False,False,False,False,True,False,False,False


'\nNumber of potential anomalies (z-score > 2.0): 713'


Displaying the top 20 anomalies with all their features (sorted by anomaly score):
                     user_id name_review_user           time  rating                                                                                                                                                   text                                gmap_id   latitude   longitude                                                                                                                                               category  avg_rating  num_of_reviews price                      state         category_main  cat_American restaurant  cat_Coffee shop  cat_Department store  cat_Fast food restaurant  cat_Grocery store  cat_Hotel  cat_Mexican restaurant  cat_Other  cat_Pizza restaurant  cat_Restaurant  cat_Shopping mall  price_encoded  year  month  weekday  hour  text_len  log_num_reviews  rating_deviation  sentiment_polarity  sentiment_subjectivity  excessive_exclaim  ae_reconstruction_error_zscore
7610 