In [4]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


In [5]:
import pandas as pd
import boto3
from imblearn.over_sampling import SMOTE
import os
 
# --- Fetch the preprocessed training split from S3 -------------------------
s3 = boto3.client("s3")
bucket_name = "fr-detector"

# (S3 key, local file name) pairs, downloaded in this order
for s3_key, local_name in (
    ("train/X_train.csv", "X_train.csv"),
    ("train/y_train.csv", "y_train.csv"),
):
    s3.download_file(bucket_name, s3_key, local_name)

# --- Read features and labels ----------------------------------------------
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")

# --- Oversample with SMOTE -------------------------------------------------
# Per the imbalanced-learn docs, a float sampling_strategy is the target
# minority/majority ratio after resampling; random_state fixes the synthetic
# samples so the run is repeatable.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# --- Write the balanced split locally --------------------------------------
os.makedirs("balanced_train", exist_ok=True)
balanced_outputs = (
    (X_resampled, "balanced_train/X_train.csv"),
    (y_resampled, "balanced_train/y_train.csv"),
)
for frame, balanced_path in balanced_outputs:
    frame.to_csv(balanced_path, index=False)

# --- Push the balanced split back to S3 ------------------------------------
# The S3 object key is the same string as the local relative path.
for _, balanced_path in balanced_outputs:
    s3.upload_file(balanced_path, bucket_name, balanced_path)

print("Balanced dataset uploaded to S3 successfully!")

Balanced dataset uploaded to S3 successfully!


In [8]:
# A bare expression is only echoed when it is the last statement of a cell,
# so the shape has to be printed explicitly to show up next to the columns.
print(X_resampled.shape)
print(X_resampled.columns)

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


In [9]:
# Inspect the column names of the resampled label frame (shown as the cell output)
y_resampled.columns

Index(['Class'], dtype='object')

In [10]:
import pandas as pd
 
# Put the resampled features and their labels side by side in one frame
df_resampled = pd.concat([X_resampled, y_resampled], axis="columns")

# Report the dtype of every column, then the number of missing values per column
for report in (df_resampled.dtypes, df_resampled.isna().sum()):
    print(report)

# Preview the first rows (rendered as the cell's rich output)
df_resampled.head()

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,143352.0,1.955041,-0.380783,-0.315013,0.330155,-0.509374,-0.086197,-0.627978,0.035994,1.05456,...,0.238197,0.968305,0.053208,-0.278602,-0.044999,-0.21678,0.045168,-0.047145,9.99,0
1,117173.0,-0.400975,-0.626943,1.555339,-2.017772,-0.107769,0.16831,0.017959,-0.401619,0.040378,...,-0.153485,0.421703,0.113442,-1.004095,-1.176695,0.361924,-0.370469,-0.144792,45.9,0
2,149565.0,0.072509,0.820566,-0.561351,-0.709897,1.080399,-0.359429,0.787858,0.117276,-0.131275,...,-0.314638,-0.872959,0.083391,0.148178,-0.431459,0.11969,0.206395,0.070288,11.99,0
3,93670.0,-0.535045,1.014587,1.750679,2.76939,0.500089,1.00227,0.847902,-0.081323,0.371579,...,0.063525,0.443431,-0.072754,0.448192,-0.655203,-0.181038,-0.093013,-0.064931,117.44,0
4,82655.0,-4.026938,1.897371,-0.429786,-0.029571,-0.855751,-0.480406,-0.435632,1.31376,0.536044,...,-0.480691,-0.230369,0.250717,0.066399,0.470787,0.245335,0.286904,-0.322672,25.76,0


In [11]:
import pandas as pd
 
# Rebuild the combined frame from the SMOTE outputs rather than reusing an
# earlier df_resampled: re-running this cell then always starts from the raw
# 'Amount' values instead of standardizing an already standardized column.
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

# Strip stray leading/trailing whitespace from the column names
df_resampled.columns = df_resampled.columns.str.strip()

# Store the label column as an integer type
df_resampled["Class"] = df_resampled["Class"].astype(int)

# Standardize 'Amount' to a z-score. Series.std() uses the sample standard
# deviation (ddof=1). The statistics are taken from the oversampled frame and
# kept in named variables so the identical shift/scale can be applied to any
# other split that is fed to a model trained on this file.
amount_mean = df_resampled["Amount"].mean()
amount_std = df_resampled["Amount"].std()

# A zero (constant column) or NaN (fewer than two rows) spread would turn the
# division below into NaN/inf and silently write a corrupt column to disk.
if not amount_std > 0:
    raise ValueError(
        f"Cannot standardize 'Amount': standard deviation is {amount_std!r}"
    )

df_resampled["Amount"] = (df_resampled["Amount"] - amount_mean) / amount_std
print(f"Amount standardized with mean={amount_mean:.6f}, std={amount_std:.6f}")

# Save processed dataset
df_resampled.to_csv("balanced_train/processed_train.csv", index=False)
print("Feature engineering completed and dataset saved!")

Feature engineering completed and dataset saved!
