In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [None]:
# Load the labelled training set from the working directory.
train_df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Preview the first five rows to sanity-check the columns that were read.
train_df.head(n=5)

Unnamed: 0,id,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,265519,161919.0,1.946747,-0.752526,-1.35513,-0.66163,1.502822,4.024933,-1.479661,1.13988,...,0.076197,0.297537,0.307915,0.69098,-0.350316,-0.388907,0.077641,-0.032248,7.32,0
1,180306,124477.0,2.035149,-0.04888,-3.058693,0.247945,2.943487,3.298697,-0.002192,0.674782,...,0.038628,0.228197,0.035542,0.70709,0.512885,-0.471198,0.00252,-0.069002,2.99,0
2,42665,41191.0,-0.99192,0.603193,0.711976,-0.992425,-0.825838,1.956261,-2.212603,-5.037523,...,-2.798352,0.109526,-0.43653,-0.932803,0.826684,0.913773,0.038049,0.18534,175.1,0
3,198724,132624.0,2.285718,-1.500239,-0.747565,-1.668119,-1.394143,-0.350339,-1.427984,0.01001,...,-0.13967,0.077013,0.20831,-0.538236,-0.278032,-0.162068,0.018045,-0.063005,6.1,0
4,82326,59359.0,-0.448747,-1.01144,0.115903,-3.454854,0.715771,-0.14749,0.504347,-0.113817,...,-0.243245,-0.173298,-0.006692,-1.362383,-0.292234,-0.144622,-0.03258,-0.064194,86.1,0


In [None]:
# Per-column count of missing values (isna is the same check as isnull).
train_df.isna().sum()

Unnamed: 0,0
id,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0


## Engenharia de features

In [None]:
def engenharia_de_features(df):
  """Derive time-of-day features from ``Time`` and drop the raw column.

  The input frame is left untouched: the result is a new DataFrame, so the
  caller's raw data can be reused or the cell re-run without side effects.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain a numeric ``Time`` column. It is divided by 3600, so it is
    presumably expressed in seconds (TODO confirm against the data source).

  Returns
  -------
  pandas.DataFrame
    Copy of ``df`` without ``Time`` and with two extra columns:
    ``Hour``    - ``(Time / 3600) % 24``, a float in ``[0, 24)``;
    ``is_dawn`` - ``1`` when ``Hour <= 6``, otherwise ``0``.

  Raises
  ------
  KeyError
    If ``df`` has no ``Time`` column.
  """
  # Compute the hour once from the untouched input; both new features use it.
  hour = (df['Time'] / 3600) % 24
  # drop() and assign() each return a new frame, so ``df`` itself is never
  # modified (the previous version added the columns to the caller's frame).
  return df.drop(columns=['Time']).assign(
    Hour=hour,
    is_dawn=(hour <= 6).astype(int),
  )

In [None]:
# Replace the raw training frame with its feature-engineered version
# (adds Hour / is_dawn and removes Time), then preview the result.
train_df = train_df.pipe(engenharia_de_features)

train_df.head(n=5)

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Hour,is_dawn
0,265519,1.946747,-0.752526,-1.35513,-0.66163,1.502822,4.024933,-1.479661,1.13988,1.406819,...,0.307915,0.69098,-0.350316,-0.388907,0.077641,-0.032248,7.32,0,20.9775,0
1,180306,2.035149,-0.04888,-3.058693,0.247945,2.943487,3.298697,-0.002192,0.674782,0.045826,...,0.035542,0.70709,0.512885,-0.471198,0.00252,-0.069002,2.99,0,10.576944,0
2,42665,-0.99192,0.603193,0.711976,-0.992425,-0.825838,1.956261,-2.212603,-5.037523,0.000772,...,-0.43653,-0.932803,0.826684,0.913773,0.038049,0.18534,175.1,0,11.441944,0
3,198724,2.285718,-1.500239,-0.747565,-1.668119,-1.394143,-0.350339,-1.427984,0.01001,-1.118447,...,0.20831,-0.538236,-0.278032,-0.162068,0.018045,-0.063005,6.1,0,12.84,0
4,82326,-0.448747,-1.01144,0.115903,-3.454854,0.715771,-0.14749,0.504347,-0.113817,-0.044782,...,-0.006692,-1.362383,-0.292234,-0.144622,-0.03258,-0.064194,86.1,0,16.488611,0


## Divisão em treino e validação

In [None]:
# 3. Train / validation split

# 'Class' is the target and 'id' is a row identifier, so neither is a feature.
# 'Unnamed: 0' is also removed: in the preview it simply counts rows (0, 1, 2, ...),
# which looks like an index that was written into the CSV. Keeping it would feed
# the row position to the model as if it were a predictive signal.
# errors='ignore' keeps this cell working if the CSV is re-exported without it.
# NOTE: any frame scored later must be restricted to these same feature columns.
X = train_df.drop(columns=['Class', 'id']).drop(columns=['Unnamed: 0'], errors='ignore')
y = train_df['Class']
# Stratify on the target so both splits keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Total number of missing values in the feature matrix and in the target.
total_nulos_X = X.isna().sum().sum()
total_nulos_y = y.isna().sum()
print(f"Nulos no X: {total_nulos_X}")
print(f"Nulos no y: {total_nulos_y}")

Nulos no X: 0
Nulos no y: 0


## Escalar dataset

In [None]:
# Every training column is scaled except the names listed here:
# 'is_dawn' is a 0/1 flag and 'id' is an identifier.
cols_to_scale = [col for col in X_train.columns if col not in ('is_dawn', 'id')]
scaler = RobustScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so validation data never influences it.
scaler.fit(X_train[cols_to_scale])

for split in (X_train, X_val):
  split[cols_to_scale] = scaler.transform(split[cols_to_scale])

## Salvando objetos

In [None]:
# Persist the fitted scaler and the processed splits for the next modelling steps.
joblib.dump(value=scaler, filename='scaler_final.joblib')
joblib.dump(value=(X_train, y_train, X_val, y_val), filename='dados_processados_dev.joblib')

['dados_processados_dev.joblib']

In [None]:
# Run the Kaggle test set through the same pipeline as the training data:
# same feature engineering, then the scaler already fitted on the training
# split (transform only, never re-fit on test data).
test_kaggle_bruto = pd.read_csv(filepath_or_buffer='test.csv')
test_proc = test_kaggle_bruto.pipe(engenharia_de_features)
test_proc[cols_to_scale] = scaler.transform(test_proc[cols_to_scale])

# Columns outside cols_to_scale (e.g. an 'id' column, if present) are kept as-is.
joblib.dump(value=test_proc, filename='X_test_kaggle.joblib')

['X_test_kaggle.joblib']