In [None]:
# Import required libraries
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical computations
import re  # Regular expressions for text preprocessing
from nltk.corpus import stopwords  # Stopwords list for filtering out common words
from nltk.stem import WordNetLemmatizer  # Lemmatizer to reduce words to their base form
import nltk  # Natural Language Toolkit for text processing

In [None]:
# Read the raw reviews CSV (path is relative to the working directory)
file_path = 'data/fake_reviews_dataset.csv'
reviews_dataset = pd.read_csv(filepath_or_buffer=file_path)

# Announce the load, then print the leading rows to reveal the column layout
print("Dataset loaded successfully.")
dataset_preview = reviews_dataset.head()
print(dataset_preview)


In [None]:
# Normalise the review-text column name: 'text_' becomes 'text' when present.
# When 'text_' is absent the frame is left exactly as it was.
column_renames = {'text_': 'text'}
if 'text_' in reviews_dataset.columns:
    reviews_dataset.rename(columns=column_renames, inplace=True)
    print("Renamed 'text_' column to 'text'.")

In [None]:
# Cleans the specified text column: Removes NaN values, Converts values to strings, Removes empty/whitespace-only strings
def clean_text_column(df, column_name):
    """Return a cleaned copy of ``df`` with unusable values in ``column_name`` removed.

    Steps, in order:
      1. Drop rows where ``column_name`` is NaN/None.
      2. Cast the remaining values to ``str`` and strip surrounding whitespace.
      3. Drop rows whose value is the empty string after stripping.

    The input frame is never modified. The result is an independent copy, so
    column assignments made on it later do not act on a filtered slice of
    another frame (the pattern behind pandas' SettingWithCopyWarning).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to clean.
    column_name : hashable
        Label of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index labels.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    initial_rows = df.shape[0]

    # Drop rows with NaN in the specified column. The explicit copy makes the
    # column assignment below operate on a frame we own rather than on a
    # filtered slice of the caller's data.
    cleaned = df.dropna(subset=[column_name]).copy()
    rows_dropped_na = initial_rows - cleaned.shape[0]
    print(f"Rows dropped due to NaN in '{column_name}': {rows_dropped_na}")

    # Convert all values in the column to strings and strip whitespace
    cleaned[column_name] = cleaned[column_name].astype(str).str.strip()

    # Drop rows that are empty once stripped (this also covers values that
    # were whitespace-only). Copy again so the returned frame is not flagged
    # as a slice of `cleaned`.
    return cleaned.loc[cleaned[column_name] != ''].copy()

# Run the cleaner over the review text and rebind the name to the cleaned frame
reviews_dataset = clean_text_column(df=reviews_dataset, column_name='text')


In [None]:
# Track initial row count
initial_rows = reviews_dataset.shape[0]

# Remove exact duplicate rows (the first occurrence of each row is kept).
# The name is rebound instead of using inplace=True so that this cell does
# not mutate a frame that an earlier cell may have produced as a filtered
# slice, which is the situation in which pandas can emit SettingWithCopyWarning.
reviews_dataset = reviews_dataset.drop_duplicates()

# Display number of duplicates removed
print(f"Rows dropped due to duplicates: {initial_rows - reviews_dataset.shape[0]}")


In [None]:
# Convert labels to numeric (CG = 1, OR = 0)
label_mapping = {'CG': 1, 'OR': 0}

# Convert categories to numeric values using a mapping dictionary
category_mapping = {
    'Home_and_Kitchen_5': 0,
    'Tools_and_Home_Improvement_5': 1,
    'Movies_and_TV_5': 2,
    'Electronics_5': 3,
    'Sports_and_Outdoors_5': 4,
    'Clothing_Shoes_and_Jewelry_5': 5,
    'Toys_and_Games_5': 6,
    'Books_5': 7,
    'Kindle_Store_5': 8,
    'Pet_Supplies_5': 9
}

rows_before_mapping = reviews_dataset.shape[0]

# Series.map yields NaN for any value that is not a key of the dictionary,
# so rows with an unknown (or missing) label/category are removed by dropna.
reviews_dataset = reviews_dataset.assign(
    label=reviews_dataset['label'].map(label_mapping),
    category=reviews_dataset['category'].map(category_mapping),
).dropna(subset=['label', 'category'])

# A single NaN turns a mapped column into floats (1.0 / 0.0). With the NaN
# rows gone, cast back so the codes are stored and saved as integers.
reviews_dataset = reviews_dataset.astype({'label': 'int64', 'category': 'int64'})

# Make the data loss visible instead of dropping rows silently
rows_dropped_unmapped = rows_before_mapping - reviews_dataset.shape[0]
print(f"Rows dropped due to unmapped or missing label/category: {rows_dropped_unmapped}")
print("Labels and categories converted successfully.")

In [None]:
# Parse the 'date' column as datetimes; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(reviews_dataset['date'], errors='coerce')
reviews_dataset['date'] = parsed_dates
print("Date column converted successfully.")


In [None]:
# Text Preprocessing (Lemmatisation & Stopword Removal)

# The stopword list and the WordNet lemmatizer both read NLTK corpora from
# disk and raise LookupError when those corpora are not installed. Fetch a
# corpus only when the local lookup fails, so that an environment which
# already has the data performs no download.
for resource_path, package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def preprocess_text(text):
    """Normalise one review string into space-separated, lemmatized tokens.

    Processing order:
      1. Every run of characters other than ASCII letters becomes one space.
      2. The text is lower-cased and split on whitespace.
      3. Tokens found in the module-level ``stop_words`` set are discarded.
      4. Remaining tokens are passed through the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives.
    """
    # Replace non-letters with a space instead of deleting them. Deleting
    # them glued together words that were separated only by punctuation, a
    # newline, a tab or a hyphen (e.g. "great.Loved" -> "greatloved").
    # NOTE(review): apostrophes now split contractions ("don't" -> "don", "t");
    # presumably both halves are in the stopword list -- confirm.
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    tokens = text.lower().split()   # Lowercase, then split on whitespace
    # Filter stopwords first, then lemmatize the tokens that remain
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)   # Reconstruct the cleaned text

# Run every review through preprocess_text; the raw 'text' column is kept
# and the results are stored alongside it in a new 'cleaned_text' column.
cleaned_reviews = reviews_dataset['text'].apply(preprocess_text)
reviews_dataset['cleaned_text'] = cleaned_reviews
print("Text preprocessing completed.")

In [None]:
# Persist the cleaned frame as CSV; index=False leaves the row index out of the file
output_path = 'data/reviews_dataset_cleaned.csv'
reviews_dataset.to_csv(output_path, index=False)
print("Cleaned dataset saved to 'reviews_dataset_cleaned.csv'")

# List the columns of the frame that was just written
final_columns = reviews_dataset.columns.tolist()
print("Columns in the cleaned dataset:", final_columns)
