In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import StandardScaler

In [2]:
# Working directory of the running kernel (not necessarily where the notebook file lives)
current_dir = Path.cwd()

# Input CSVs sit in the sibling folder ../data/clean relative to the working directory
clean_dir = current_dir.parent / 'data' / 'clean'
data_file = clean_dir / 'train_clean.csv'
data_file_no_outlier = clean_dir / 'train_no_outlier.csv'

In [3]:
# Load the two training CSVs: the cleaned set and (per its file name) the variant without outliers
df, df_no_outlier = (
    pd.read_csv(csv_path) for csv_path in (data_file, data_file_no_outlier)
)

In [4]:
# Remove the leading column by position (presumably a saved index column; confirm against the CSV).
# NOTE: df is mutated in place, so every re-run of this cell removes one more column.
first_column = df.columns[0]
df.drop(columns=first_column, inplace=True)

In [5]:
# Remove the "Unnamed: 0" column that pandas creates when a CSV was saved with its row index.
# `columns=` already selects the axis, so the redundant `axis=1` is gone.
# errors="ignore" plus rebinding (no inplace) makes the cell idempotent:
# re-running it no longer raises KeyError once the column has been removed.
df_no_outlier = df_no_outlier.drop(columns="Unnamed: 0", errors="ignore")

In [6]:
# Every column except the target is a feature to be scaled
target = 'BeatsPerMinute'
features = [name for name in df.columns if name != target]

# Original dataset: target vector and feature matrix
y = df[target]
X = df[features]

# Outlier-free dataset: same feature list, so both frames share one column layout
y_clean = df_no_outlier[target]
X_clean = df_no_outlier[features]

In [7]:
# Each dataset gets its own scaler, fitted only on that dataset's features.

# Original dataset: fit, then transform
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Outlier-free dataset: independent scaler, same two steps
scaler_clean = StandardScaler().fit(X_clean)
X_clean_scaled = scaler_clean.transform(X_clean)

In [8]:
# Wrap the scaled arrays back into DataFrames labelled with the feature names
X_scaled, X_clean_scaled = (
    pd.DataFrame(scaled_values, columns=features)
    for scaled_values in (X_scaled, X_clean_scaled)
)

In [9]:
# Re-attach the target column to each scaled frame.
# Assign the raw values (positional) rather than the Series itself: assigning a
# Series aligns on index labels, and the scaled frames carry a fresh RangeIndex,
# so any non-default index on y / y_clean would silently yield NaN or mismatched
# targets. Scaling keeps the row order, so positional assignment is exact, and a
# length mismatch now raises instead of being padded with NaN.
X_scaled['BeatsPerMinute'] = y.to_numpy()
X_clean_scaled['BeatsPerMinute'] = y_clean.to_numpy()

In [10]:
# Relative location of the processed-data folder (joined onto a base directory when saving)
processed_dir = Path('data', 'processed')

In [None]:
# Dataset saving
# to_csv does not create missing parent folders, so make sure the output
# folder exists first (a fresh checkout may not have data/processed yet).
output_dir = current_dir.parent / processed_dir
output_dir.mkdir(parents=True, exist_ok=True)

# The row index is written as an unnamed first column (pandas default index=True);
# this is kept so the file layout stays the same for existing readers.
X_scaled.to_csv(output_dir / 'data_processed.csv')
X_clean_scaled.to_csv(output_dir / 'data_clean_processed.csv')

In [None]:
# Scaler saving
# joblib.dump opens the target path directly and fails if the folder is missing,
# so create the output folder before writing.
scaler_dir = current_dir.parent / processed_dir
scaler_dir.mkdir(parents=True, exist_ok=True)

# Persist each fitted scaler next to the dataset it was fitted on
joblib.dump(scaler, scaler_dir / 'scaler_original.pkl')
joblib.dump(scaler_clean, scaler_dir / 'scaler_clean.pkl')