In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [43]:
# Load the raw dataset. The path is relative, so the kernel's working
# directory must contain `data/raw_data.csv`.
df = pd.read_csv('data/raw_data.csv')

In [44]:
# Preview the first rows to check the column layout right after loading.
df.head()

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,opened,clicked
0,85120,short_email,personalized,2,Sunday,US,5,0,0
1,966622,long_email,personalized,12,Sunday,UK,2,1,1
2,777221,long_email,personalized,11,Wednesday,US,2,0,0
3,493711,short_email,generic,6,Monday,UK,1,0,0
4,106887,long_email,generic,14,Monday,US,6,0,0


In [45]:
# Flag rows that record a click on an email that was never opened.
never_opened = df['opened'].eq(0)
was_clicked = df['clicked'].eq(1)
misleading_condition = never_opened & was_clicked

In [46]:
# Drop rows that record a click without an open.
# The mask is rebuilt from the current frame rather than taken from
# `misleading_condition`: once `reset_index` has run, that earlier mask no
# longer lines up with `df`, and pandas would reindex it and silently drop
# unrelated rows if this cell were executed a second time. Recomputing the
# mask here makes the cell safe to re-run (a second run removes nothing).
clicked_without_open = (df['opened'] == 0) & (df['clicked'] == 1)
df = df.loc[~clicked_without_open].reset_index(drop=True)

In [47]:
# Show the filtered frame (pandas truncates the rendering) to confirm the
# remaining row count after the inconsistent rows were removed.
df

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,opened,clicked
0,85120,short_email,personalized,2,Sunday,US,5,0,0
1,966622,long_email,personalized,12,Sunday,UK,2,1,1
2,777221,long_email,personalized,11,Wednesday,US,2,0,0
3,493711,short_email,generic,6,Monday,UK,1,0,0
4,106887,long_email,generic,14,Monday,US,6,0,0
...,...,...,...,...,...,...,...,...,...
99945,803504,short_email,personalized,4,Monday,US,5,0,0
99946,899722,long_email,personalized,7,Saturday,US,1,0,0
99947,449610,long_email,personalized,11,Saturday,UK,6,0,0
99948,72497,short_email,generic,10,Monday,UK,0,0,0


In [48]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from utils.custom_transformer import CustomTransformer, TransformerLambda
from sklearn.model_selection import train_test_split

In [49]:
# List the distinct hour values; the output shows they run from 1 to 24
# rather than 0 to 23.
df["hour"].unique()

array([ 2, 12, 11,  6, 14,  9,  8,  5, 23,  7, 10,  3,  4,  1, 13, 15, 17,
       16, 20, 18, 19, 22, 21, 24])

In [50]:
# Shift hours from the 1-24 range to 0-23 and expose the value under two
# column names, `hour_sin` and `hour_cos`, so the sine and cosine transforms
# configured further down can each be pointed at their own column.
#
# The `hour` guard makes this cell safe to re-run: once the column has been
# renamed, the shift is skipped instead of raising KeyError on `df["hour"]`
# (or subtracting 1 a second time). `rename` is used without `inplace` so the
# frame is rebound rather than mutated; the column order is unchanged
# (`hour_sin` stays where `hour` was, `hour_cos` is appended last).
if "hour" in df.columns:
    df = df.rename(columns={"hour": "hour_sin"})
    df["hour_sin"] = df["hour_sin"] - 1
df["hour_cos"] = df["hour_sin"]

In [51]:
# Building blocks for the preprocessing step: categorical encoders, a log
# transform, a scaler, and the two halves of the cyclical hour encoding.
WEEKDAY_ORDER = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
HOURS_PER_DAY = 24

ohe = OneHotEncoder(drop='first', sparse_output=False)
oe = OrdinalEncoder(categories=[WEEKDAY_ORDER])  # Monday -> 0 ... Sunday -> 6
log_trf = TransformerLambda(func=np.log1p)
sc = StandardScaler()
# Map the hour onto the unit circle so the end of the day is adjacent to its start.
hsine = TransformerLambda(func=lambda hours: np.sin(2 * np.pi * hours / HOURS_PER_DAY))
hcos = TransformerLambda(func=lambda hours: np.cos(2 * np.pi * hours / HOURS_PER_DAY))

In [52]:
# Wire each transform to the column(s) it operates on.
# NOTE(review): `log_trf` and `scale` both target `user_past_purchases`; how
# CustomTransformer combines two steps on the same column is defined in
# utils.custom_transformer and should be confirmed there.
categorical_columns = ["email_text", "email_version", "user_country"]
purchase_columns = ["user_past_purchases"]

feature_steps = [
    ("ohe", ohe, categorical_columns),
    ("oe", oe, ["weekday"]),
    ("log_trf", log_trf, purchase_columns),
    ("scale", sc, purchase_columns),
    ("h_sine", hsine, ["hour_sin"]),
    ("h_cos", hcos, ["hour_cos"]),
]

ctf = CustomTransformer(transformers=feature_steps)

In [53]:
# Separate features from the two target columns; `email_id` is an identifier
# and is excluded from the features.
target_columns = ["opened", "clicked"]
X = df.drop(columns=["email_id"] + target_columns)
y = df[target_columns]

In [54]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

split_parts = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)
X_train, X_test, y_train, y_test = split_parts

In [55]:
# Fit the transformer on the training split only, then apply the already
# fitted transformer to the test split.
# NOTE(review): both names are rebound to the transformed output, so
# re-running this cell without first re-running the split cell would feed
# already-transformed data back into the transformer.
X_train = ctf.fit_transform(X_train)
X_test = ctf.transform(X_test)

In [56]:
# Inspect the transformed training features (encoded categoricals, ordinal
# weekday, rescaled purchases, sine/cosine of the hour).
X_train

Unnamed: 0,hour_sin,weekday,user_past_purchases,hour_cos,email_text_short_email,email_version_personalized,user_country_FR,user_country_UK,user_country_US
36546,0.500000,5.0,1.539891,-8.660254e-01,1.0,1.0,0.0,0.0,1.0
72451,0.258819,4.0,0.815620,-9.659258e-01,1.0,0.0,0.0,0.0,1.0
77105,1.000000,3.0,0.063641,6.123234e-17,0.0,1.0,0.0,0.0,1.0
7576,0.866025,0.0,-0.322929,-5.000000e-01,0.0,0.0,0.0,0.0,1.0
50926,0.500000,5.0,0.363488,-8.660254e-01,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
6265,-0.707107,5.0,0.063641,-7.071068e-01,1.0,1.0,0.0,0.0,1.0
54886,1.000000,6.0,-1.799179,6.123234e-17,1.0,1.0,0.0,0.0,1.0
76820,0.707107,6.0,-1.799179,-7.071068e-01,1.0,0.0,0.0,0.0,1.0
860,0.000000,5.0,0.063641,1.000000e+00,1.0,1.0,0.0,0.0,1.0


In [57]:
import os

In [58]:
# Create the output folder relative to the working directory; `exist_ok`
# keeps this from failing when the folder is already there.
os.makedirs('data/preprocessed', exist_ok=True)

In [59]:
# Persist the four splits as CSV files without the row index.
# The files go to the relative `data/preprocessed` folder (the one created by
# the `os.makedirs` cell) instead of an absolute path inside one user's home
# directory, so the notebook runs on any machine as long as it is started
# from the project root -- the same assumption the relative
# `data/raw_data.csv` read already makes.
PREPROCESSED_DIR = 'data/preprocessed'

X_train.to_csv(f'{PREPROCESSED_DIR}/X_train.csv', index=False)
X_test.to_csv(f'{PREPROCESSED_DIR}/X_test.csv', index=False)
y_train.to_csv(f'{PREPROCESSED_DIR}/y_train.csv', index=False)
y_test.to_csv(f'{PREPROCESSED_DIR}/y_test.csv', index=False)

In [60]:
# The splits are saved on disk; drop the in-kernel references to them.
del X_train, X_test, y_train, y_test

In [61]:
# Check the final column set of `df`: `hour` has been replaced by `hour_sin`
# and `hour_cos` has been appended.
df.columns

Index(['email_id', 'email_text', 'email_version', 'hour_sin', 'weekday',
       'user_country', 'user_past_purchases', 'opened', 'clicked', 'hour_cos'],
      dtype='object')