### Import libraries

In [177]:
import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

### Read datasets

In [178]:
# Load the training split, the validation split and the table of variables
# flagged by the VIF screening, in that order.
train_df, valid_df, vif_removed = map(
    pd.read_csv,
    ('data/train_set.csv', 'data/valid_set.csv', 'data/vif_removed.csv'),
)

In [179]:
# Drop the target-encoded state feature together with every variable listed in
# the VIF table (variables with VIF > 15), identically on both splits.
vif_removed_vars = vif_removed['variables'].to_list()
columns_to_drop = ['state_target_encoded'] + vif_removed_vars
for frame in (train_df, valid_df):
    frame.drop(columns=columns_to_drop, inplace=True)

### Feature engineering

##### Interaction features

In [180]:
def convert_minutes_to_hours(df):
    """Add an hour-denominated copy of every minutes column to ``df``.

    For each column whose name contains ``'minutes'`` a new column named
    ``<col>_in_hours`` is added, holding the original values divided by 60.
    Columns whose name already ends in ``'_in_hours'`` are skipped, so calling
    the function again on the same frame (for example when the notebook cell
    is re-run) does not create ``..._in_hours_in_hours`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    suffix = '_in_hours'

    # Snapshot the source columns up front because new columns are added to
    # ``df`` inside the loop. Non-string labels cannot contain 'minutes' and
    # are ignored instead of raising a TypeError on the ``in`` check.
    minute_cols = [
        col for col in df.columns
        if isinstance(col, str) and 'minutes' in col and not col.endswith(suffix)
    ]

    for col in minute_cols:
        df[col + suffix] = df[col] / 60.0

    return df

In [181]:
# Apply the same feature engineering to both splits. Every step writes new
# columns directly onto the existing frames.
dfs = [train_df, valid_df]
for df in dfs:
    # Total amount of time spent (day + evening minutes)
    df['total_minutes'] = df['total_day_minutes'] + df['total_eve_minutes']
    # Examine if the customer is more active during the day or evening
    # NOTE(review): a row with total_eve_minutes == 0 yields inf (NaN for 0/0)
    # here, and likewise for intl_ratio when total_minutes == 0 - confirm that
    # downstream steps handle such values.
    df['day_eve_ratio'] = df['total_day_minutes'] / df['total_eve_minutes']
    # Ratio of international call time to the total call time
    df['intl_ratio'] = df['total_intl_minutes'] / df['total_minutes']
    # Identify if having both plans or a specific plan impacts the 'payment_delay'
    # Vectorized flag: 1 when both plan columns equal 1, otherwise 0. This
    # replaces a row-wise apply and avoids one Python call per row.
    has_both_plans = (df['international_plan'] == 1) & (df['voice_mail_plan'] == 1)
    df['both_plans'] = has_both_plans.astype('int64')
    # Identify whether having a plan affects the number of customer service calls
    df['service_calls_with_intl_plan'] = df['number_customer_service_calls'] * df['international_plan']
    df['service_calls_with_vmail_plan'] = df['number_customer_service_calls'] * df['voice_mail_plan']
    # Convert minutes to hours. The helper adds the columns to ``df`` in place
    # and returns the same object, so the loop variable is not rebound.
    convert_minutes_to_hours(df)