In [1]:
import pandas as pd
import numpy as np

import re
from textblob import TextBlob
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from textstat import flesch_reading_ease, flesch_kincaid_grade

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the review dataset (presumably the output of an earlier cleaning step,
# judging by the file name) into the working frame used by every cell below.
df = pd.read_csv(filepath_or_buffer='data/data_cleaned.csv')

# Feature Engineering

In [3]:
# --- Basic Text Features ---
# Surface statistics of each review: length, punctuation and casing.
# Missing reviews are normalised to '' once so every feature treats them the
# same way; coercing with str() alone would measure NaN as the 3-character
# word 'nan', while the .str.count features would be NaN for the same row.
# NOTE: word_tokenize / sent_tokenize need NLTK's Punkt tokenizer data installed.
review_text = df['review/text'].fillna('').astype(str)

df['word_count'] = review_text.apply(lambda x: len(word_tokenize(x)))
df['char_count'] = review_text.str.len()
df['sentence_count'] = review_text.apply(lambda x: len(sent_tokenize(x)))
# Empty / whitespace-only reviews have zero sentences: mask the zero
# denominator so the ratio is 0.0 instead of NaN or inf.
df['avg_sentence_length'] = (
    df['word_count'] / df['sentence_count'].replace(0, np.nan)
).fillna(0.0)
df['exclamation_count'] = review_text.str.count('!')
df['question_count'] = review_text.str.count(r'\?')  # '?' is a regex metacharacter, hence the escape
df['capital_count'] = review_text.apply(lambda x: sum(1 for c in x if c.isupper()))
# Same zero-denominator guard for empty reviews.
df['capital_ratio'] = (
    df['capital_count'] / df['char_count'].replace(0, np.nan)
).fillna(0.0)

In [4]:
# --- Sentiment Analysis Using TextBlob (polarity: -1 to 1, subjectivity: 0 to 1) ---
# Stringify each review once; every feature below reuses the same text.
texts = [str(x) for x in df['review/text']]

# Analyse each review a single time and read both scores from the same
# sentiment result instead of building two TextBlobs per row.
sentiments = [TextBlob(text).sentiment for text in texts]
df['polarity'] = [s.polarity for s in sentiments]
df['subjectivity'] = [s.subjectivity for s in sentiments]

# --- Readability Metrics (Flesch Reading Ease, Flesch-Kincaid Grade) ---
df['flesch_reading_ease'] = [flesch_reading_ease(text) for text in texts]
df['flesch_kincaid_grade'] = [flesch_kincaid_grade(text) for text in texts]

# --- Specific vs vague language (simple count of generic placeholder words) ---
vague_words = ['thing', 'stuff', 'something', 'anything', 'everything']
# Whole-word regex match so that tokens with attached punctuation
# ("something," / "stuff." / "everything!") are counted; a plain
# whitespace split would compare "stuff." against "stuff" and miss it.
vague_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, vague_words)) + r')\b')
df['vague_word_count'] = [len(vague_pattern.findall(text.lower())) for text in texts]
# The denominator is the token count computed in the basic-features cell;
# mask zeros so reviews without tokens get 0.0 rather than NaN.
df['vague_ratio'] = (
    df['vague_word_count'] / df['word_count'].replace(0, np.nan)
).fillna(0.0)

In [5]:
# Display the column labels to confirm the engineered features now sit
# alongside the original review columns.
df.columns

Index(['review/helpfulness', 'review/time', 'review/summary', 'review/text',
       'is_helpful', 'datetime', 'year', 'month', 'day', 'day_of_week',
       'is_weekend', 'quarter', 'word_count', 'char_count', 'sentence_count',
       'avg_sentence_length', 'exclamation_count', 'question_count',
       'capital_count', 'capital_ratio', 'polarity', 'subjectivity',
       'flesch_reading_ease', 'flesch_kincaid_grade', 'vague_word_count',
       'vague_ratio'],
      dtype='object')

### TF-IDF Text Vectorization

In [6]:
# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at 200 terms.
tfidf = TfidfVectorizer(max_features=200, ngram_range=(1,2), stop_words='english')
# Missing reviews are vectorised as empty documents; a bare astype(str) would
# turn NaN into the literal token 'nan', which could enter the vocabulary.
X_tfidf = tfidf.fit_transform(df['review/text'].fillna('').astype(str))
# Densify the sparse matrix and reuse df's index so the rows line up with the
# hand-crafted feature columns when the two are concatenated.
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out(), index=df.index)

In [7]:
# Numeric features to export next to the TF-IDF terms: the engineered text
# statistics plus the 'month' and 'is_weekend' columns already present in df.
feature_cols = ['word_count', 'char_count', 'sentence_count', 'avg_sentence_length',
                'exclamation_count', 'question_count', 'capital_count', 'capital_ratio',
                'polarity', 'subjectivity', 'flesch_reading_ease', 'flesch_kincaid_grade',
                'vague_word_count', 'vague_ratio', 'month', 'is_weekend']
other_features = df[feature_cols]

# A TF-IDF term can share a label with a numeric feature (e.g. the word
# 'month'), which would leave two identically named columns after concat.
# Only the colliding TF-IDF columns are renamed, so the output is unchanged
# whenever there is no overlap.
overlap = tfidf_df.columns.intersection(other_features.columns)
tfidf_features = tfidf_df.rename(columns={term: f'tfidf_{term}' for term in overlap})

# Column-wise join; both frames carry df's index, so rows align one-to-one.
X_combined = pd.concat([tfidf_features, other_features], axis=1)

In [8]:
# Persist the combined feature matrix; the index is dropped, so consumers
# must rely on row order to re-align with the source data.
X_combined.to_csv(path_or_buf="engineered_features.csv", index=False)