# Isolation Forest Model
## Import Necessary Packages

In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob
import re
import sys

### Importing our processed dataset and preparing it for the model

In [2]:
# Load the processed dataset from the working directory.
df = pd.read_csv('final-dataset.csv')

# Define the features to be used in the model
features = [
    'rating',
    'text_len',
    'rating_deviation',
    'sentiment_polarity',
    'sentiment_subjectivity',
    'excessive_exclaim',
    'avg_rating',
    'log_num_reviews',
    'price_encoded',
    'year',
    'month',
    'weekday',
    'hour',
    'cat_American restaurant',
    'cat_Coffee shop',
    'cat_Department store',
    'cat_Fast food restaurant',
    'cat_Grocery store',
    'cat_Hotel',
    'cat_Mexican restaurant',
    'cat_Other',
    'cat_Pizza restaurant',
    'cat_Restaurant',
    'cat_Shopping mall',
]

# Check that every requested feature exists in the DataFrame. Missing columns
# are reported and dropped from the feature list; the script only aborts when
# none of the requested features are present.
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    print(f"Error: Missing required features in the dataset: {missing_features}")
    print("Please ensure your 'final-dataset.csv' contains these columns.")
    features = [f for f in features if f in df.columns]  # Proceed with available
    if not features:
        print("No valid features remaining. Exiting.")
        sys.exit(1)
    print(f"Proceeding with available features: {features}")

# Work on a copy of the feature columns so the original DataFrame is not
# modified by the imputation below.
X = df[features].copy()

# Handle potential NaNs in features (fill with the column mean).
for col in features:
    if not X[col].isnull().any():
        continue
    col_mean = X[col].mean()
    if pd.isna(col_mean):
        # A column with no valid values has a NaN mean, so filling with the
        # mean would leave every NaN in place. Fall back to a constant so no
        # NaN reaches the scaler or the model.
        X[col] = X[col].fillna(0.0)
        print(f"Column '{col}' has no valid values; filled NaN values with 0.")
    else:
        X[col] = X[col].fillna(col_mean)
        print(f"Filled NaN values in '{col}' with its mean.")

### Data preprocessing
Scale the features. This is crucial for models that rely on distance, and good practice for Isolation Forest.

In [3]:
# Learn each feature's min/max from X, then rescale X with the fitted scaler.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

### Building the Isolation Forest model 


In [4]:
# Isolation Forest hyperparameters. 'contamination' can be tuned;
# 'random_state' is fixed so repeated runs give the same result.
forest_params = {
    'contamination': 0.05,
    'random_state': 42,
}
model = IsolationForest(**forest_params)

# Fit on the scaled features and label every row:
# -1 marks an outlier, 1 marks an inlier.
outlier_labels = model.fit_predict(X_scaled)

# Anomaly score from the fitted model; lower values are more anomalous.
anomaly_scores = model.decision_function(X_scaled)

# Attach both results to the original DataFrame (labels first, then scores).
df['is_outlier'] = outlier_labels
df['anomaly_score'] = anomaly_scores

### Save the output as a CSV

In [5]:
# Write the dataset, including the new outlier columns, without the index.
output_path = 'final_dataset_with_scores.csv'
df.to_csv(output_path, index=False)