# In [10]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob
import re
import sys

# Input (Parquet) and output (CSV) locations used by this script.
input_file = 'final_dataset.parquet'
output_file = 'final_dataset_with_scores.csv'

# Read the reviews into a DataFrame; if the file is absent, report it and
# stop with exit status 1.
try:
    df = pd.read_parquet(input_file)
except FileNotFoundError:
    not_found_message = (
        f"Error: '{input_file}' not found. "
        "Please ensure the file is in the same directory as the script."
    )
    print(not_found_message)
    sys.exit(1)

# --- Feature Engineering ---

# Normalise the review text once and reuse it for every text-derived feature.
# Missing reviews are replaced with '' *before* the string cast: casting first
# would turn None/NaN into the literal words 'None'/'nan', giving those rows a
# non-zero length and feeding those words to the keyword and sentiment steps.
review_text = df['text'].fillna('').astype(str)

# 1. Review length: A proxy for irrelevant content and rants.
df['review_length'] = review_text.str.len()

# 2. Number of reviews by user: A proxy for identifying non-visitors or new users.
# 'size' counts every row of a user, including reviews whose text is missing
# ('count' would skip those). groupby drops rows whose user_id is null, so they
# come back as NaN; such a review cannot be attributed to anyone and is treated
# as a user with a single review, which also keeps NaN out of the model input.
df['num_of_reviews_by_user'] = (
    df.groupby('user_id')['text']
    .transform('size')
    .fillna(1)
    .astype('int64')
)

# 3. Presence of ad keywords: A direct flag for advertisements.
# The regex matches (case-insensitively) any "http" (covering "http://" and
# "https://"), "www.", "tel:", or a phone number shaped like 123-456-7890
# (hyphen or whitespace as separators).
ad_regex = r'(http|www\.|tel:|\d{3}[-\s]\d{3}[-\s]\d{4})'
ad_pattern = re.compile(ad_regex, re.I)  # compiled once, reused for every row
df['has_ad_keywords'] = review_text.map(
    lambda review: int(ad_pattern.search(review) is not None)
)

# 4. Sentiment Polarity: Identifies extremely negative or positive reviews.
df['sentiment_polarity'] = review_text.map(
    lambda review: TextBlob(review).sentiment.polarity
)

# 5. Star Rating: The original rating provided by the user (assuming it's in the data).
# The 'rating' column is used directly; missing ratings are filled with 0.
df['star_rating'] = df['rating'].fillna(0)

# Define the final features for the model
features = [
    'review_length',
    'num_of_reviews_by_user',
    'has_ad_keywords',
    'sentiment_polarity',
    'star_rating',
]

# Work on a separate copy of the feature columns so that `df` itself is left
# untouched by the scaling step.
X = df.loc[:, features].copy()

# --- Data Preprocessing ---

# Rescale the features with MinMaxScaler. Scaling matters for distance-based
# models and is kept here as good practice for Isolation Forest.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

# --- Model Building ---

# Isolation Forest with a fixed seed for reproducibility.
# 'contamination' is a tunable hyperparameter.
model = IsolationForest(random_state=42, contamination=0.05)

# Train on the scaled features, then label every row:
# -1 marks an outlier and 1 marks an inlier.
model.fit(X_scaled)
df['is_outlier'] = model.predict(X_scaled)

# Per-row anomaly score. The lower the score, the more anomalous the point.
df['anomaly_score'] = model.decision_function(X_scaled)

# --- Save the Output ---

# Save the DataFrame with all the new features and scores to a new CSV file
# (the path in `output_file`), without writing the DataFrame index as a column.
df.to_csv(output_file, index=False)