# Data Preprocessing Notebook

This notebook loads the raw CSV datasets, cleans them (dropping duplicate rows,
filling missing values, and normalizing the case and whitespace of text columns),
and saves a train/test split of each processed dataset.


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split


## Load Raw Data

In [None]:
# Locations of the raw inputs and of the processed outputs
raw_data_path = '../raw/'
processed_data_path = '../processed/'

# Create the output folder up front; this is a no-op when it already exists
os.makedirs(processed_data_path, exist_ok=True)

# Read every CSV found in the raw folder into a DataFrame keyed by its file name
datasets = {}
for file in os.listdir(raw_data_path):
    # Anything that is not a CSV file is ignored
    if not file.endswith('.csv'):
        continue
    csv_path = os.path.join(raw_data_path, file)
    loaded_frame = pd.read_csv(csv_path)
    datasets[file] = loaded_frame
    print(f'Loaded {file} with {datasets[file].shape[0]} rows and {datasets[file].shape[1]} columns')

## Data Cleaning and Preprocessing

In [None]:
def _normalize_text(value):
    """Lowercase and strip *value* if it is a string; return anything else unchanged."""
    if isinstance(value, str):
        return value.lower().strip()
    return value


def preprocess_data(df):
    """Return a cleaned version of *df*.

    The steps run in this order:

    1. Drop exact duplicate rows (the first occurrence is kept).
    2. Fill missing values with a forward fill, then a backward fill for the
       gaps a forward fill cannot reach (missing values in the leading rows).
    3. Lowercase string values in object-dtype columns and strip their
       surrounding whitespace.

    Duplicates are dropped *before* text is normalized, so rows that differ
    only by case or surrounding whitespace are both kept.

    Args:
        df: The DataFrame to clean. It is not modified; every step works on a
            new object.

    Returns:
        The cleaned DataFrame. Rows keep their original index labels.
    """
    # Drop duplicates
    df = df.drop_duplicates()

    # Handle missing values. ``fillna(method=...)`` is deprecated in recent
    # pandas releases; ``ffill()`` / ``bfill()`` are the supported spelling.
    df = df.ffill().bfill()

    # Normalize text columns. Object columns can hold mixed values (e.g.
    # numbers next to strings); the ``.str`` accessor would turn the
    # non-string entries into NaN, so only real strings are rewritten.
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].map(_normalize_text)

    return df

# Run each loaded dataset through preprocess_data and store the result
# back under the same key
for name, df in datasets.items():
    cleaned = preprocess_data(df)
    datasets[name] = cleaned
    print(f'Preprocessed {name}')

## Train-Test Split

In [None]:
# Partition every dataset into train and test sets and write both to the
# processed folder. train_test_split_ratio is the share kept for training;
# the remainder is passed to train_test_split as test_size.
train_test_split_ratio = 0.8
for name, df in datasets.items():
    train, test = train_test_split(
        df,
        test_size=1-train_test_split_ratio,
        random_state=42,  # fixed seed, as in every run of this cell
    )

    # Output files reuse the source file name with a train_/test_ prefix
    train_path = os.path.join(processed_data_path, f'train_{name}')
    test_path = os.path.join(processed_data_path, f'test_{name}')

    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)
    print(f'Saved train and test datasets for {name}')