In [None]:
# Import necessary libraries for data processing, analysis, and graph operations
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical computations
import networkx as nx  # Graph-based analysis and network operations
from textblob import TextBlob  # Sentiment analysis and text processing
import textstat  # Readability and text complexity metrics
from datetime import timedelta  # Time-based calculations

# Read the cleaned reviews CSV into the working DataFrame used by every later cell
file_path = 'data/reviews_dataset_cleaned.csv'
reviews_dataset = pd.read_csv(filepath_or_buffer=file_path)

# Reaching this line means read_csv did not raise
print("Dataset loaded successfully.")


In [None]:
# Text-Based Feature Engineering
#
# Adds to reviews_dataset: text_length, sentiment, readability,
# text_length_x_readability, log_text_length and length_sentiment_ratio.

# Working copy of the raw review text with missing entries replaced by ''.
# Without this, a NaN review (a float in pandas) breaks both the word count
# and the readability call. The 'text' column itself is left untouched.
raw_text = reviews_dataset['text'].fillna('').astype(str)

# Number of whitespace-separated tokens in each review (0 for empty/missing text)
reviews_dataset['text_length'] = raw_text.str.split().str.len()

# Ensure the 'cleaned_text' column contains only valid strings (fill NaN values with an empty string)
reviews_dataset['cleaned_text'] = reviews_dataset['cleaned_text'].fillna('').astype(str)

# Sentiment polarity of the cleaned text, as reported by TextBlob
reviews_dataset['sentiment'] = reviews_dataset['cleaned_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Readability of the raw text using the Flesch-Kincaid grade level
try:
    reviews_dataset['readability'] = raw_text.apply(textstat.flesch_kincaid_grade)
except Exception as e:
    print(f"Error in readability calculation: {e}")
    # Use NaN rather than None: a None-filled column has object dtype and the
    # multiplication below would raise TypeError instead of propagating NaN.
    reviews_dataset['readability'] = np.nan

# Interaction feature: text length multiplied by readability score
reviews_dataset['text_length_x_readability'] = reviews_dataset['text_length'] * reviews_dataset['readability']

# log(1 + text_length) compresses the long tail of review lengths
reviews_dataset['log_text_length'] = np.log1p(reviews_dataset['text_length'])

# Text length divided by (|sentiment| + 1); the +1 keeps the denominator >= 1,
# so neutral reviews (sentiment 0) do not cause a division by zero
reviews_dataset['length_sentiment_ratio'] = reviews_dataset['text_length'] / (np.abs(reviews_dataset['sentiment']) + 1)

print("Text-based feature engineering completed.")


In [None]:
# Temporal Feature Engineering
#
# Adds to reviews_dataset: days_since_last_review, is_weekend, month and
# spike_day_reviewers. Also defines daily_review_counts, spike_threshold and
# spike_days.

# Convert the 'date' column to datetime format, handling any errors by coercing invalid values to NaT
reviews_dataset['date'] = pd.to_datetime(reviews_dataset['date'], errors='coerce')

# Days since the same user's previous review.
# diff() compares each row with the one before it inside the group, so the rows
# must be in chronological order first; otherwise the gaps depend on file order
# and can be negative. The stable sort keeps file order for ties, and the
# assignment realigns the result to the original rows by index.
# A user's first review (and any review with an unparseable date) gets 0.
reviews_by_date = reviews_dataset.sort_values('date', kind='mergesort')
reviews_dataset['days_since_last_review'] = (
    reviews_by_date.groupby('user_id')['date'].diff().dt.days.fillna(0)
)

# Extract key temporal features
reviews_dataset['is_weekend'] = reviews_dataset['date'].dt.dayofweek >= 5  # True for Saturday (5) / Sunday (6); NaT yields False
reviews_dataset['month'] = reviews_dataset['date'].dt.month  # Calendar month (1-12) of the review date

# Calendar day of each review (time-of-day stripped) so that "per day"
# aggregations group whole days even if 'date' carries a time component.
review_day = reviews_dataset['date'].dt.normalize()

# Identify spike days based on daily review volume
daily_review_counts = reviews_dataset.groupby(review_day).size().reset_index(name='review_count')  # Reviews per calendar day
spike_threshold = daily_review_counts['review_count'].mean() + 2 * daily_review_counts['review_count'].std()  # Mean + 2 standard deviations
spike_days = daily_review_counts[daily_review_counts['review_count'] > spike_threshold]['date']  # Days whose volume exceeds the threshold

# Number of distinct reviewers active on each review's calendar day.
# NOTE(review): this is computed for every day, not only for spike_days.
reviews_dataset['spike_day_reviewers'] = reviews_dataset.groupby(review_day)['user_id'].transform('nunique')

print("Temporal feature engineering completed.")


In [None]:
# Rolling Averages and Count-Based Features
#
# Requires reviews_dataset['date'] to already be datetime64 (converted in the
# temporal feature cell).


def _reviews_in_trailing_window(dates, window='7D'):
    """Count one user's reviews that fall inside a trailing time window.

    Parameters
    ----------
    dates : pd.Series
        Review timestamps (datetime64) for a single user, in any order.
    window : str, default '7D'
        Pandas offset alias giving the length of the look-back window.

    Returns
    -------
    pd.Series
        Float series on the same index as ``dates``. Each value is the number
        of that user's reviews, the current one included, dated within
        ``window`` before the current review. Rows whose date is NaT get NaN
        because they cannot be placed on the timeline.
    """
    counts = pd.Series(np.nan, index=dates.index, dtype='float64')
    # Time-based rolling needs a monotonic DatetimeIndex without NaT
    known = dates.dropna().sort_values(kind='mergesort')
    if not known.empty:
        timeline = pd.Series(1.0, index=pd.DatetimeIndex(known))
        counts.loc[known.index] = timeline.rolling(window).count().to_numpy()
    return counts


# Number of reviews each user posted in the 7 days up to and including each review.
# This uses a true time-based window; a fixed window of 7 rows over the day
# gaps would count reviews, not days.
reviews_dataset['rolling_review_count'] = (
    reviews_dataset.groupby('user_id')['date']
    .transform(_reviews_in_trailing_window)
)

# Mean rating over each user's 7 most recent reviews (fewer if the user has fewer).
# Rows are ordered by date first so "most recent" is chronological, not file
# order; the result is realigned to the original rows by index on assignment.
reviews_dataset['rolling_rating_mean'] = (
    reviews_dataset.sort_values('date', kind='mergesort')
    .groupby('user_id')['rating']
    .transform(lambda ratings: ratings.rolling(window=7, min_periods=1).mean())
)

print("Rolling averages and count-based features completed.")


In [None]:
# Behavioral Feature Engineering
#
# Adds to reviews_dataset: avg_rating and rating_deviation.

# Per-user mean rating as a lookup table (one row per user_id)
user_avg_rating = reviews_dataset.groupby('user_id')['rating'].mean().reset_index(name='avg_rating')

# Broadcast each user's mean rating onto their review rows.
# A groupby-transform is used instead of merging user_avg_rating back in: it
# yields the same column but leaves the frame's index untouched and can be
# re-run safely. A repeated merge would create avg_rating_x / avg_rating_y and
# break the next line.
reviews_dataset['avg_rating'] = reviews_dataset.groupby('user_id')['rating'].transform('mean')

# Rating deviation: absolute difference between a rating and its author's mean rating
reviews_dataset['rating_deviation'] = np.abs(reviews_dataset['rating'] - reviews_dataset['avg_rating'])

print("Behavioral feature engineering completed.")


In [None]:
# Group Dynamics Feature Engineering
#
# Adds to reviews_dataset: degree_centrality (of the review's author in a
# user-category graph).

if 'category' in reviews_dataset.columns:
    # Distinct (user, category) pairs. Rows missing either value are dropped so
    # that NaN never becomes a graph node.
    user_category_pairs = reviews_dataset[['user_id', 'category']].dropna().drop_duplicates()

    # Nodes are tagged with their kind so a user ID and a category that happen
    # to share a value (e.g. both the integer 3) stay distinct instead of
    # collapsing into one node and corrupting the degrees.
    edges = [
        (('user', user), ('category', category))
        for user, category in user_category_pairs.itertuples(index=False, name=None)
    ]
    review_graph = nx.Graph()
    review_graph.add_edges_from(edges)

    # Degree centrality: each node's degree divided by (number of nodes - 1)
    degree_centrality = nx.degree_centrality(review_graph)

    # Look up each review author's centrality (0 if the user is not in the graph,
    # e.g. the user ID is missing or all of the user's categories are missing)
    reviews_dataset['degree_centrality'] = reviews_dataset['user_id'].map(
        lambda user: degree_centrality.get(('user', user), 0)
    )
else:
    print("Column 'category' not found. Skipping group dynamics features.")

    # Keep the column present but empty; NaN keeps it numeric, whereas None
    # would produce an object-dtype column
    reviews_dataset['degree_centrality'] = np.nan

print("Group dynamics features completed.")


In [None]:
# Report how wide the engineered frame is, then list every column name
final_columns = reviews_dataset.columns.tolist()
print(f"Final dataset contains {reviews_dataset.shape[1]} columns.")
print("Columns in the final dataset:", final_columns)

In [None]:
# Persist the feature-enriched frame as CSV (row index omitted)
file_path = 'data/reviews_with_features.csv'
reviews_dataset.to_csv(path_or_buf=file_path, index=False)

# Confirm where the output was written
print(f"Final dataset with engineered features saved successfully to: {file_path}")