In [1]:
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
from utils.data_preprocessing import ChangeDtypes, DataTreatment, FeatureEngineering

In [3]:
# Load the raw dataset from the project's data directory (path is relative to
# the notebook's working directory).
raw_csv_path = '../data/df_raw.csv'
df = pd.read_csv(raw_csv_path)

In [4]:
# Hold out 30% of the rows as a validation partition. The split is stratified
# on the combination of `treatment_flg` and `target`, with a fixed seed so the
# partition is reproducible.
strata = df[['treatment_flg', 'target']]
df_train, df_val = train_test_split(
    df,
    test_size=0.30,
    stratify=strata,
    random_state=42,
)

In [5]:
# Separate the training partition into the label (`treatment_flg`) and the
# feature frame; `target` is removed from the features as well.
label_columns = ['treatment_flg', 'target']
y = df_train['treatment_flg']
X = df_train.drop(columns=label_columns)

In [6]:
# Second split: carve a 30% test set out of the training partition,
# stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

In [7]:
# Build the validation features/label the same way as for the training
# partition: label is `treatment_flg`, and both label columns leave the features.
y_val = df_val['treatment_flg']
X_val = df_val.drop(columns=['treatment_flg', 'target'])

In [8]:
# Preprocessing pipeline assembled from the project's custom transformers
# (imported from utils.data_preprocessing). Steps run in the order listed.
preprocessing_steps = [
    ('change d types', ChangeDtypes()),
    ('tratamento', DataTreatment()),
    ('feature engineering', FeatureEngineering()),
]
pipeline = Pipeline(preprocessing_steps)

In [9]:
# Fit the preprocessing pipeline on the training split only, then apply the
# already-fitted pipeline to the test and validation splits.
X_train = pipeline.fit_transform(X_train)
X_test, X_val = [pipeline.transform(part) for part in (X_test, X_val)]

In [10]:
# Numeric columns that are scaled with RobustScaler further down.
columns_robust = [
    'n_transactions',
    'mean_product_quantity',
    'n_stores',
    'n_products',
    'regular_points_received',
    'express_points_received',
    'regular_points_spent',
    'express_points_spent',
    'total_amount_spent',
    'avg_ticket',
]

# Numeric columns that are scaled with StandardScaler further down.
columns_standard = [
    'age',
    'recency',
    'issue_redeem_delay',
]

# Categorical columns (the one-hot encoding step below handles 'gender').
categorical = ['gender']

In [11]:
# Robust scaling: learn the scaling statistics from the training split only,
# then apply the same fitted scaler to every split.
rs = RobustScaler()
rs.fit(X_train[columns_robust])
for split_frame in (X_train, X_test, X_val):
    split_frame[columns_robust] = rs.transform(split_frame[columns_robust])

In [12]:
# Standard scaling: learn mean/variance from the training split only, then
# apply the same fitted scaler to every split.
ss = StandardScaler()
ss.fit(X_train[columns_standard])
for split_frame in (X_train, X_test, X_val):
    split_frame[columns_standard] = ss.transform(split_frame[columns_standard])

In [13]:
# One-hot encode `gender`. The encoder is fitted on the training split only and
# then reused, unchanged, for the test and validation splits.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    ohe = OneHotEncoder(drop='first', sparse=False)
ohe.fit(X_train['gender'].to_numpy().reshape(-1, 1))

# `get_feature_names` was removed from newer scikit-learn in favour of
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out()
else:
    ohe_columns = ohe.get_feature_names()


def _one_hot_gender(frame):
    """Return the encoded `gender` columns of `frame`, indexed like `frame`.

    Reusing `frame.index` is essential: the splits keep the shuffled row
    labels produced by `train_test_split`, so a frame built with a default
    0..n-1 index would not line up with them. Joining such a frame on the
    index silently drops rows and attaches encoded values to the wrong rows.
    """
    encoded = ohe.transform(frame['gender'].to_numpy().reshape(-1, 1))
    return pd.DataFrame(encoded, columns=ohe_columns, index=frame.index)


# Replace the raw `gender` column with its encoded columns in every split.
# The join is index-aligned and keeps every row of the split, so the feature
# frames stay in step with their label series.
X_train_ohe = _one_hot_gender(X_train)
X_train = X_train.drop(columns=['gender']).join(X_train_ohe)

X_test_ohe = _one_hot_gender(X_test)
X_test = X_test.drop(columns=['gender']).join(X_test_ohe)

X_val_ohe = _one_hot_gender(X_val)
X_val = X_val.drop(columns=['gender']).join(X_val_ohe)

In [13]:
# Persist the dataset to the project's data directory.
# NOTE(review): `df` is the frame read from '../data/df_raw.csv' and none of the
# visible cells reassign it, so this appears to write the raw, unprocessed
# data under the "processed" file name. The transformed splits (X_train,
# X_test, X_val and their labels) are not saved anywhere in the cells shown.
# Confirm which frame(s) downstream notebooks expect in this file.
# NOTE(review): no `index=` argument is given, so the row index is written as
# an extra leading column (pandas default) - confirm that is intended.
df.to_csv('../data/df_processed.csv')