# Data Preprocessing & Feature Engineering

This notebook handles data cleaning, text preprocessing, feature extraction, and handling class imbalances.

In [2]:
# ! pip install unidecode

In [3]:
! pip install imblearn



In [4]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer
import unidecode
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

## Download necessary NLTK resources

In [5]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nuno_Moreira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nuno_Moreira\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\Nuno_Moreira\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

## Load the processed dataset

In [6]:
df = pd.read_csv('b2w_reviews_processed.csv')

  df = pd.read_csv('b2w_reviews_processed.csv')


## Define text preprocessing function for Portuguese text

In [7]:
def preprocess_portuguese_text(text):
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove accents
    text = unidecode.unidecode(text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('portuguese'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = RSLPStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back into a string
    return ' '.join(tokens)

## Apply preprocessing to review title and text

In [8]:
df['processed_title'] = df['review_title'].apply(preprocess_portuguese_text)
df['processed_text'] = df['review_text'].apply(preprocess_portuguese_text)

## Combine title and text (with title given more weight)

In [9]:
df['processed_combined'] = df['processed_title'] + ' ' + df['processed_text']

## Create additional features

## 1. Review length features

In [10]:
df['title_length'] = df['review_title'].apply(lambda x: len(str(x)))
df['text_length'] = df['review_text'].apply(lambda x: len(str(x)))
df['title_word_count'] = df['review_title'].apply(lambda x: len(str(x).split()))
df['text_word_count'] = df['review_text'].apply(lambda x: len(str(x).split()))

## 2. Capitalization features (might indicate emphasis or strong emotion)

In [11]:
df['caps_ratio'] = df['review_text'].apply(
    lambda x: sum(1 for c in str(x) if c.isupper()) / len(str(x)) if len(str(x)) > 0 else 0
)

## 3. Punctuation features

In [12]:
def count_exclamations(text):
    if not isinstance(text, str):
        return 0
    return text.count('!')

def count_question_marks(text):
    if not isinstance(text, str):
        return 0
    return text.count('?')

df['exclamation_count'] = df['review_text'].apply(count_exclamations)
df['question_count'] = df['review_text'].apply(count_question_marks)

## 4. Word-based sentiment features (using common positive/negative words in Portuguese)

## This is a simple approach - ideally, you would use a proper sentiment lexicon for Portuguese

## Sample positive and negative words in Portuguese (expand this list)

In [13]:
positive_words = ['bom', 'otimo', 'excelente', 'perfeito', 'adorei', 'gostei', 'recomendo', 'maravilhoso']
negative_words = ['ruim', 'pessimo', 'horrivel', 'terrivel', 'detestei', 'nao', 'problema', 'defeito']

## Count occurrences of positive and negative words

In [14]:
def count_sentiment_words(text, word_list):
    if not isinstance(text, str):
        return 0
    
    text = text.lower()
    count = 0
    for word in word_list:
        count += len(re.findall(r'\b' + word + r'\b', text))
    return count

df['positive_word_count'] = df['review_text'].apply(lambda x: count_sentiment_words(x, positive_words))
df['negative_word_count'] = df['review_text'].apply(lambda x: count_sentiment_words(x, negative_words))
df['sentiment_ratio'] = df.apply(
    lambda row: row['positive_word_count'] / (row['negative_word_count'] + 1), axis=1
)

## 5. Categorical features (one-hot encoding)

## For product categories

In [15]:
category_dummies = pd.get_dummies(df['site_category_lv1'], prefix='cat1')
df = pd.concat([df, category_dummies], axis=1)

## For reviewer gender

In [16]:
gender_dummies = pd.get_dummies(df['reviewer_gender'], prefix='gender')
df = pd.concat([df, gender_dummies], axis=1)

## 6. Text representation with TF-IDF

## Using unigrams and bigrams

In [17]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,  # Limit to top 1000 features
    min_df=5,           # Ignore terms that appear in less than 5 documents
    max_df=0.7,         # Ignore terms that appear in more than 70% of documents
    ngram_range=(1, 2)  # Include unigrams and bigrams
)

## Fit and transform the processed combined text

In [18]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_combined'])

## Convert to DataFrame for later use

In [19]:
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

## Save the feature names for later use

In [20]:
feature_names = tfidf_vectorizer.get_feature_names_out()
pd.Series(feature_names).to_csv('tfidf_feature_names.csv', index=False)

## 7. Define the target variable (based on the task you're interested in)

## Options:

## a. Binary sentiment (positive vs negative)

In [21]:
df['sentiment_binary'] = df['overall_rating'].apply(lambda x: 1 if x >= 4 else 0)

## b. Three-class sentiment (positive, neutral, negative)

In [22]:
df['sentiment_3class'] = df['overall_rating'].apply(
    lambda x: 2 if x >= 4 else (1 if x == 3 else 0)
)

## c. Recommendation prediction (yes/no)

In [23]:
df['recommendation'] = df['recommend_to_a_friend'].apply(lambda x: 1 if x == 'Yes' else 0)

## d. Rating prediction (1-5)

In [24]:
df['rating'] = df['overall_rating']

## 8. Handle class imbalance (if needed, for classification tasks)

## Here's an example for binary sentiment classification

## Prepare the feature matrix for sentiment binary classification

In [25]:
X_numeric = df[[
    'title_length', 'text_length', 'title_word_count', 'text_word_count',
    'caps_ratio', 'exclamation_count', 'question_count',
    'positive_word_count', 'negative_word_count', 'sentiment_ratio'
]].values

## Scale numeric features

In [26]:
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

## Prepare for splitting data

In [27]:
X = np.hstack((X_numeric_scaled, tfidf_matrix.toarray()))
y = df['sentiment_binary'].values

## Split into train and test sets

In [32]:
# Fill NaN values in your dataframe
df['review_text'] = df['review_text'].fillna('')

# Apply TF-IDF vectorization with fewer features to save memory
# Note: We're using preprocess_portuguese_text directly, not as a lambda
#tfidf_vectorizer = TfidfVectorizer(max_features=500, preprocessor=preprocess_portuguese_text)
X_tfidf = tfidf_vectorizer.fit_transform(df['review_text'])

from sklearn.decomposition import TruncatedSVD
# Apply dimensionality reduction
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

print(f"Original TF-IDF shape: {X_tfidf.shape}")
print(f"Reduced shape: {X_reduced.shape}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, df['overall_rating'], test_size=0.2, random_state=42
)

# Now try SMOTE on the reduced dataset
# Use SMOTE with not_majority strategy to avoid oversampling to fully balanced classes
smote = SMOTE(sampling_strategy='not majority', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the distribution of classes after resampling
print("Class distribution after resampling:")
print(pd.Series(y_train_resampled).value_counts().sort_index())

print(f"Original training set shape: {X_train.shape}")
print(f"Resampled training set shape: {X_train_resampled.shape}")

Original TF-IDF shape: (132373, 1000)
Reduced shape: (132373, 100)




Class distribution after resampling:
overall_rating
1    38399
2    38399
3    38399
4    38399
5    38399
Name: count, dtype: int64
Original training set shape: (105898, 100)
Resampled training set shape: (191995, 100)


## Save the train and test data

In [33]:
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)
np.save('X_train_resampled.npy', X_train_resampled)
np.save('y_train_resampled.npy', y_train_resampled)

## Save feature column names and their indices for later use

In [34]:
feature_columns = list(df[[
    'title_length', 'text_length', 'title_word_count', 'text_word_count',
    'caps_ratio', 'exclamation_count', 'question_count',
    'positive_word_count', 'negative_word_count', 'sentiment_ratio'
]].columns) + list(feature_names)

feature_indices = {feature: i for i, feature in enumerate(feature_columns)}
pd.DataFrame.from_dict(feature_indices, orient='index').to_csv('feature_indices.csv')

## Save the processed dataframe

In [35]:
df.to_csv('b2w_reviews_features.csv', index=False)

print("Feature engineering and preprocessing completed.")

Feature engineering and preprocessing completed.
